pheval 0.3.2__tar.gz → 0.3.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pheval might be problematic. Click here for more details.

Files changed (56) hide show
  1. {pheval-0.3.2 → pheval-0.3.4}/PKG-INFO +1 -1
  2. {pheval-0.3.2 → pheval-0.3.4}/pyproject.toml +1 -1
  3. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/analyse/generate_plots.py +6 -3
  4. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/cli.py +2 -0
  5. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/cli_pheval_utils.py +124 -14
  6. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/prepare/create_spiked_vcf.py +132 -95
  7. pheval-0.3.4/src/pheval/prepare/prepare_corpus.py +67 -0
  8. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/prepare/update_phenopacket.py +1 -2
  9. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/utils/phenopacket_utils.py +69 -7
  10. {pheval-0.3.2 → pheval-0.3.4}/LICENSE +0 -0
  11. {pheval-0.3.2 → pheval-0.3.4}/README.md +0 -0
  12. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/__init__.py +0 -0
  13. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/analyse/__init__.py +0 -0
  14. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/analyse/analysis.py +0 -0
  15. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/analyse/benchmark_generator.py +0 -0
  16. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/analyse/benchmarking_data.py +0 -0
  17. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/analyse/binary_classification_stats.py +0 -0
  18. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/analyse/disease_prioritisation_analysis.py +0 -0
  19. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/analyse/gene_prioritisation_analysis.py +0 -0
  20. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/analyse/generate_summary_outputs.py +0 -0
  21. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/analyse/parse_benchmark_summary.py +0 -0
  22. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/analyse/parse_pheval_result.py +0 -0
  23. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/analyse/prioritisation_rank_recorder.py +0 -0
  24. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/analyse/prioritisation_result_types.py +0 -0
  25. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/analyse/rank_stats.py +0 -0
  26. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/analyse/run_data_parser.py +0 -0
  27. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/analyse/variant_prioritisation_analysis.py +0 -0
  28. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/cli_pheval.py +0 -0
  29. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/config_parser.py +0 -0
  30. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/constants.py +0 -0
  31. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/implementations/__init__.py +0 -0
  32. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/infra/__init__.py +0 -0
  33. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/infra/exomiserdb.py +0 -0
  34. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/post_processing/__init__.py +0 -0
  35. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/post_processing/post_processing.py +0 -0
  36. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/prepare/__init__.py +0 -0
  37. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/prepare/create_noisy_phenopackets.py +0 -0
  38. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/prepare/custom_exceptions.py +0 -0
  39. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/resources/alternate_ouputs/CADA_results.txt +0 -0
  40. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/resources/alternate_ouputs/DeepPVP_results.txt +0 -0
  41. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/resources/alternate_ouputs/OVA_results.txt +0 -0
  42. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/resources/alternate_ouputs/Phen2Gene_results.json +0 -0
  43. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/resources/alternate_ouputs/Phenolyzer_results.txt +0 -0
  44. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/resources/alternate_ouputs/lirical_results.tsv +0 -0
  45. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/resources/alternate_ouputs/svanna_results.tsv +0 -0
  46. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/resources/hgnc_complete_set.txt +0 -0
  47. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/run_metadata.py +0 -0
  48. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/runners/__init__.py +0 -0
  49. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/runners/runner.py +0 -0
  50. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/utils/__init__.py +0 -0
  51. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/utils/docs_gen.py +0 -0
  52. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/utils/docs_gen.sh +0 -0
  53. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/utils/exomiser.py +0 -0
  54. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/utils/file_utils.py +0 -0
  55. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/utils/semsim_utils.py +0 -0
  56. {pheval-0.3.2 → pheval-0.3.4}/src/pheval/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pheval
3
- Version: 0.3.2
3
+ Version: 0.3.4
4
4
  Summary:
5
5
  Author: Yasemin Bridges
6
6
  Author-email: y.bridges@qmul.ac.uk
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "pheval"
3
- version = "0.3.2"
3
+ version = "0.3.4"
4
4
  description = ""
5
5
  authors = ["Yasemin Bridges <y.bridges@qmul.ac.uk>",
6
6
  "Julius Jacobsen <j.jacobsen@qmul.ac.uk>",
@@ -482,6 +482,7 @@ def generate_plots(
482
482
  benchmark_generator: BenchmarkRunOutputGenerator,
483
483
  plot_type: str,
484
484
  title: str = None,
485
+ generate_from_tsv: bool = False,
485
486
  ) -> None:
486
487
  """
487
488
  Generate summary statistics bar plots for prioritisation.
@@ -493,10 +494,12 @@ def generate_plots(
493
494
  benchmark_generator (BenchmarkRunOutputGenerator): Object containing benchmarking output generation details.
494
495
  plot_type (str): Type of plot to be generated ("bar_stacked", "bar_cumulative", "bar_non_cumulative").
495
496
  title (str, optional): Title for the generated plot. Defaults to None.
497
+ generate_from_tsv (bool): Specify whether to generate plots from the TSV file. Defaults to False.
496
498
  """
497
499
  plot_generator = PlotGenerator()
498
- plot_generator.generate_roc_curve(benchmarking_results, benchmark_generator)
499
- plot_generator.generate_precision_recall(benchmarking_results, benchmark_generator)
500
+ if not generate_from_tsv:
501
+ plot_generator.generate_roc_curve(benchmarking_results, benchmark_generator)
502
+ plot_generator.generate_precision_recall(benchmarking_results, benchmark_generator)
500
503
  if plot_type == "bar_stacked":
501
504
  plot_generator.generate_stacked_bar_plot(benchmarking_results, benchmark_generator, title)
502
505
  elif plot_type == "bar_cumulative":
@@ -541,4 +544,4 @@ def generate_plots_from_benchmark_summary_tsv(
541
544
  raise ValueError(
542
545
  "Specify one analysis type (gene_analysis, variant_analysis, or disease_analysis)"
543
546
  )
544
- generate_plots(benchmarking_results, benchmark_generator, plot_type, title)
547
+ generate_plots(benchmarking_results, benchmark_generator, plot_type, title, True)
@@ -10,6 +10,7 @@ from .cli_pheval_utils import (
10
10
  benchmark_comparison,
11
11
  create_spiked_vcfs_command,
12
12
  generate_stats_plot,
13
+ prepare_corpus_command,
13
14
  scramble_phenopackets_command,
14
15
  semsim_scramble_command,
15
16
  semsim_to_exomiserdb_command,
@@ -60,6 +61,7 @@ pheval_utils.add_command(benchmark)
60
61
  pheval_utils.add_command(benchmark_comparison)
61
62
  pheval_utils.add_command(semsim_to_exomiserdb_command)
62
63
  pheval_utils.add_command(generate_stats_plot)
64
+ pheval_utils.add_command(prepare_corpus_command)
63
65
 
64
66
  if __name__ == "__main__":
65
67
  main()
@@ -15,6 +15,7 @@ from pheval.analyse.run_data_parser import parse_run_data_text_file
15
15
  from pheval.prepare.create_noisy_phenopackets import scramble_phenopackets
16
16
  from pheval.prepare.create_spiked_vcf import spike_vcfs
17
17
  from pheval.prepare.custom_exceptions import InputError, MutuallyExclusiveOptionError
18
+ from pheval.prepare.prepare_corpus import prepare_corpus
18
19
  from pheval.prepare.update_phenopacket import update_phenopackets
19
20
  from pheval.utils.exomiser import semsim_to_exomiserdb
20
21
  from pheval.utils.semsim_utils import percentage_diff, semsim_heatmap_plot
@@ -253,22 +254,19 @@ def update_phenopackets_command(
253
254
  mutually_exclusive=["phenopacket_path"],
254
255
  )
255
256
  @click.option(
256
- "--template-vcf-path",
257
- "-t",
258
- cls=MutuallyExclusiveOptionError,
257
+ "--hg19-template-vcf",
258
+ "-hg19",
259
259
  metavar="PATH",
260
260
  required=False,
261
- help="Template VCF file",
262
- mutually_exclusive=["vcf_dir"],
261
+ help="Template hg19 VCF file",
263
262
  type=Path,
264
263
  )
265
264
  @click.option(
266
- "--vcf-dir",
267
- "-v",
268
- cls=MutuallyExclusiveOptionError,
265
+ "--hg38-template-vcf",
266
+ "-hg38",
269
267
  metavar="PATH",
270
- help="Directory containing template VCF files",
271
- mutually_exclusive=["template_vcf"],
268
+ required=False,
269
+ help="Template hg38 VCF file",
272
270
  type=Path,
273
271
  )
274
272
  @click.option(
@@ -284,13 +282,22 @@ def create_spiked_vcfs_command(
284
282
  phenopacket_path: Path,
285
283
  phenopacket_dir: Path,
286
284
  output_dir: Path,
287
- template_vcf_path: Path = None,
288
- vcf_dir: Path = None,
285
+ hg19_template_vcf: Path = None,
286
+ hg38_template_vcf: Path = None,
289
287
  ):
290
- """Spikes variants into a template VCF file for a directory of phenopackets."""
288
+ """
289
+ Create spiked VCF from either a Phenopacket or a Phenopacket directory.
290
+
291
+ Args:
292
+ phenopacket_path (Path): Path to a single Phenopacket file (optional).
293
+ phenopacket_dir (Path): Path to a directory containing Phenopacket files (optional).
294
+ output_dir (Path): The directory to store the generated spiked VCF file(s).
295
+ hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
296
+ hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
297
+ """
291
298
  if phenopacket_path is None and phenopacket_dir is None:
292
299
  raise InputError("Either a phenopacket or phenopacket directory must be specified")
293
- spike_vcfs(output_dir, phenopacket_path, phenopacket_dir, template_vcf_path, vcf_dir)
300
+ spike_vcfs(output_dir, phenopacket_path, phenopacket_dir, hg19_template_vcf, hg38_template_vcf)
294
301
 
295
302
 
296
303
  @click.command()
@@ -600,3 +607,106 @@ def generate_stats_plot(
600
607
  generate_plots_from_benchmark_summary_tsv(
601
608
  benchmarking_tsv, gene_analysis, variant_analysis, disease_analysis, plot_type, title
602
609
  )
610
+
611
+
612
+ @click.command("prepare-corpus")
613
+ @click.option(
614
+ "--phenopacket-dir",
615
+ "-p",
616
+ required=True,
617
+ metavar="PATH",
618
+ help="Path to phenopacket corpus directory..",
619
+ type=Path,
620
+ )
621
+ @click.option(
622
+ "--variant-analysis/--no-variant-analysis",
623
+ default=False,
624
+ required=False,
625
+ type=bool,
626
+ show_default=True,
627
+ help="Specify whether to check for complete variant records in the phenopackets.",
628
+ )
629
+ @click.option(
630
+ "--gene-analysis/--no-gene-analysis",
631
+ default=False,
632
+ required=False,
633
+ type=bool,
634
+ show_default=True,
635
+ help="Specify whether to check for complete gene records in the phenopackets.",
636
+ )
637
+ @click.option(
638
+ "--disease-analysis/--no-disease-analysis",
639
+ default=False,
640
+ required=False,
641
+ type=bool,
642
+ show_default=True,
643
+ help="Specify whether to check for complete disease records in the phenopackets.",
644
+ )
645
+ @click.option(
646
+ "--gene-identifier",
647
+ "-g",
648
+ required=False,
649
+ help="Gene identifier to update in phenopacket",
650
+ type=click.Choice(["ensembl_id", "entrez_id", "hgnc_id"]),
651
+ )
652
+ @click.option(
653
+ "--hg19-template-vcf",
654
+ "-hg19",
655
+ metavar="PATH",
656
+ required=False,
657
+ help="Template hg19 VCF file",
658
+ type=Path,
659
+ )
660
+ @click.option(
661
+ "--hg38-template-vcf",
662
+ "-hg38",
663
+ metavar="PATH",
664
+ required=False,
665
+ help="Template hg38 VCF file",
666
+ type=Path,
667
+ )
668
+ @click.option(
669
+ "--output-dir",
670
+ "-o",
671
+ metavar="PATH",
672
+ required=True,
673
+ help="Path to output prepared corpus.",
674
+ default="prepared_corpus",
675
+ type=Path,
676
+ )
677
+ def prepare_corpus_command(
678
+ phenopacket_dir: Path,
679
+ variant_analysis: bool,
680
+ gene_analysis: bool,
681
+ disease_analysis: bool,
682
+ gene_identifier: str,
683
+ hg19_template_vcf: Path,
684
+ hg38_template_vcf: Path,
685
+ output_dir: Path,
686
+ ):
687
+ """
688
+ Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating
689
+ gene identifiers.
690
+
691
+ Args:
692
+ phenopacket_dir (Path): The path to the directory containing Phenopackets.
693
+ variant_analysis (bool): If True, check for complete variant records in the Phenopackets.
694
+ gene_analysis (bool): If True, check for complete gene records in the Phenopackets.
695
+ disease_analysis (bool): If True, check for complete disease records in the Phenopackets.
696
+ gene_identifier (str): Identifier for updating gene identifiers, if applicable.
697
+ hg19_template_vcf (Path): Path to the hg19 template VCF file (optional), to spike variants into
698
+ VCFs for variant-based analysis; at least one of hg19_template_vcf or hg38_template_vcf is required.
699
+ hg38_template_vcf (Path): Path to the hg38 template VCF file (optional), to spike variants into
700
+ VCFs for variant-based analysis; at least one of hg19_template_vcf or hg38_template_vcf is required.
701
+ output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files.
702
+ """
703
+ prepare_corpus(
704
+ phenopacket_dir,
705
+ variant_analysis,
706
+ gene_analysis,
707
+ disease_analysis,
708
+ gene_identifier,
709
+ hg19_template_vcf,
710
+ hg38_template_vcf,
711
+ output_dir,
712
+ )
@@ -1,7 +1,6 @@
1
1
  import gzip
2
2
  import logging
3
3
  import re
4
- import secrets
5
4
  import urllib.parse
6
5
  from copy import copy
7
6
  from dataclasses import dataclass
@@ -10,6 +9,8 @@ from typing import List, Union
10
9
 
11
10
  from phenopackets import Family, File, Phenopacket
12
11
 
12
+ from pheval.prepare.custom_exceptions import InputError
13
+ from pheval.utils.file_utils import files_with_suffix, is_gzipped
13
14
  from pheval.utils.phenopacket_utils import (
14
15
  IncompatibleGenomeAssemblyError,
15
16
  PhenopacketRebuilder,
@@ -19,9 +20,6 @@ from pheval.utils.phenopacket_utils import (
19
20
  write_phenopacket,
20
21
  )
21
22
 
22
- from .custom_exceptions import InputError
23
- from ..utils.file_utils import all_files, files_with_suffix, is_gzipped
24
-
25
23
  info_log = logging.getLogger("info")
26
24
 
27
25
  genome_assemblies = {
@@ -91,39 +89,6 @@ class VcfHeader:
91
89
  chr_status: bool
92
90
 
93
91
 
94
- class VcfPicker:
95
- """Choose a VCF file randomly from a directory if provided, otherwise selects the single template."""
96
-
97
- def __init__(self, template_vcf: Path or None, vcf_dir: Path or None):
98
- """
99
- Initialise the VcfPicker.
100
-
101
- Args:
102
- template_vcf (Path or None): The path to a template VCF file, or None if not provided.
103
- vcf_dir (Path or None): The directory containing VCF files, or None if not provided.
104
- """
105
- self.template_vcf = template_vcf
106
- self.vcf_dir = vcf_dir
107
-
108
- def pick_file_from_dir(self) -> Path:
109
- """
110
- Selects a VCF file from a directory at random.
111
-
112
- Returns:
113
- Path: The randomly selected VCF file path from the directory.
114
- """
115
- return secrets.choice(all_files(self.vcf_dir))
116
-
117
- def pick_file(self) -> Path:
118
- """
119
- Select a VCF file randomly when given a directory; if not, the template VCF is assigned.
120
-
121
- Returns:
122
- Path: The selected VCF file path.
123
- """
124
- return self.pick_file_from_dir() if self.vcf_dir is not None else self.template_vcf
125
-
126
-
127
92
  def read_vcf(vcf_file: Path) -> List[str]:
128
93
  """
129
94
  Read the contents of a VCF file into memory, handling both uncompressed and gzipped files.
@@ -206,6 +171,72 @@ class VcfHeaderParser:
206
171
  return VcfHeader(sample_id, assembly, chr_status)
207
172
 
208
173
 
174
+ @dataclass
175
+ class VcfFile:
176
+ """
177
+ Represents a VCF file with its name, contents, and header information.
178
+
179
+ Attributes:
180
+ vcf_file_name (str): The name of the VCF file.
181
+ vcf_contents (List[str]): The contents of the VCF file.
182
+ vcf_header (VcfHeader): The parsed header information of the VCF file.
183
+ """
184
+
185
+ vcf_file_name: str = None
186
+ vcf_contents: List[str] = None
187
+ vcf_header: VcfHeader = None
188
+
189
+ @staticmethod
190
+ def populate_fields(template_vcf: Path):
191
+ """
192
+ Populate the fields of the VcfFile instance using the contents of a template VCF file.
193
+
194
+ Args:
195
+ template_vcf (Path): The path to the template VCF file.
196
+
197
+ Returns:
198
+ VcfFile: An instance of VcfFile with populated fields.
199
+
200
+ """
201
+ contents = read_vcf(template_vcf)
202
+ return VcfFile(template_vcf.name, contents, VcfHeaderParser(contents).parse_vcf_header())
203
+
204
+
205
+ def select_vcf_template(
206
+ phenopacket_path: Path,
207
+ proband_causative_variants: List[ProbandCausativeVariant],
208
+ hg19_vcf_info: VcfFile,
209
+ hg38_vcf_info: VcfFile,
210
+ ) -> VcfFile:
211
+ """
212
+ Select the appropriate VCF template based on the assembly information of the proband causative variants.
213
+
214
+ Args:
215
+ phenopacket_path (Path): The path to the Phenopacket file.
216
+ proband_causative_variants (List[ProbandCausativeVariant]): A list of causative variants from the proband.
217
+ hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
218
+ hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
219
+
220
+ Returns:
221
+ VcfFile: The selected VCF template file based on the assembly information of the proband causative variants.
222
+
223
+ """
224
+ if proband_causative_variants[0].assembly in ["hg19", "GRCh37"]:
225
+ if hg19_vcf_info:
226
+ return hg19_vcf_info
227
+ else:
228
+ raise InputError("Must specify hg19 template VCF!")
229
+ elif proband_causative_variants[0].assembly in ["hg38", "GRCh38"]:
230
+ if hg38_vcf_info:
231
+ return hg38_vcf_info
232
+ else:
233
+ raise InputError("Must specify hg38 template VCF!")
234
+ else:
235
+ raise IncompatibleGenomeAssemblyError(
236
+ proband_causative_variants[0].assembly, phenopacket_path
237
+ )
238
+
239
+
209
240
  def check_variant_assembly(
210
241
  proband_causative_variants: list[ProbandCausativeVariant],
211
242
  vcf_header: VcfHeader,
@@ -229,7 +260,13 @@ def check_variant_assembly(
229
260
  raise ValueError("Too many genome assemblies!")
230
261
  if phenopacket_assembly[0] not in compatible_genome_assembly:
231
262
  raise IncompatibleGenomeAssemblyError(phenopacket_assembly, phenopacket_path)
232
- if phenopacket_assembly[0] != vcf_header.assembly:
263
+ if (
264
+ phenopacket_assembly[0] in {"hg19", "GRCh37"}
265
+ and vcf_header.assembly not in {"hg19", "GRCh37"}
266
+ ) or (
267
+ phenopacket_assembly[0] in {"hg38", "GRCh38"}
268
+ and vcf_header.assembly not in {"hg38", "GRCh38"}
269
+ ):
233
270
  raise IncompatibleGenomeAssemblyError(
234
271
  assembly=phenopacket_assembly, phenopacket=phenopacket_path
235
272
  )
@@ -387,7 +424,8 @@ class VcfWriter:
387
424
  def spike_vcf_contents(
388
425
  phenopacket: Union[Phenopacket, Family],
389
426
  phenopacket_path: Path,
390
- chosen_template_vcf: Path,
427
+ hg19_vcf_info: VcfFile,
428
+ hg38_vcf_info: VcfFile,
391
429
  ) -> tuple[str, List[str]]:
392
430
  """
393
431
  Spike VCF records with variants obtained from a Phenopacket or Family.
@@ -395,22 +433,28 @@ def spike_vcf_contents(
395
433
  Args:
396
434
  phenopacket (Union[Phenopacket, Family]): Phenopacket or Family containing causative variants.
397
435
  phenopacket_path (Path): Path to the Phenopacket file.
398
- chosen_template_vcf (Path): Path to the chosen template VCF file.
436
+ hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
437
+ hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
399
438
 
400
439
  Returns:
401
440
  A tuple containing:
402
441
  assembly (str): The genome assembly information extracted from VCF header.
403
442
  modified_vcf_contents (List[str]): Modified VCF records with spiked variants.
404
443
  """
405
- # this is a separate function to a click command as it will fail if annotated with click annotations
406
- # and referenced from another click command
407
444
  phenopacket_causative_variants = PhenopacketUtil(phenopacket).causative_variants()
408
- vcf_contents = read_vcf(chosen_template_vcf)
409
- vcf_header = VcfHeaderParser(vcf_contents).parse_vcf_header()
410
- check_variant_assembly(phenopacket_causative_variants, vcf_header, phenopacket_path)
445
+ chosen_template_vcf = select_vcf_template(
446
+ phenopacket_path, phenopacket_causative_variants, hg19_vcf_info, hg38_vcf_info
447
+ )
448
+ check_variant_assembly(
449
+ phenopacket_causative_variants, chosen_template_vcf.vcf_header, phenopacket_path
450
+ )
411
451
  return (
412
- vcf_header.assembly,
413
- VcfSpiker(vcf_contents, phenopacket_causative_variants, vcf_header).construct_vcf(),
452
+ chosen_template_vcf.vcf_header.assembly,
453
+ VcfSpiker(
454
+ chosen_template_vcf.vcf_contents,
455
+ phenopacket_causative_variants,
456
+ chosen_template_vcf.vcf_header,
457
+ ).construct_vcf(),
414
458
  )
415
459
 
416
460
 
@@ -418,7 +462,8 @@ def generate_spiked_vcf_file(
418
462
  output_dir: Path,
419
463
  phenopacket: Union[Phenopacket, Family],
420
464
  phenopacket_path: Path,
421
- chosen_template_vcf: Path,
465
+ hg19_vcf_info: VcfFile,
466
+ hg38_vcf_info: VcfFile,
422
467
  ) -> File:
423
468
  """
424
469
  Write spiked VCF contents to a new file.
@@ -427,21 +472,17 @@ def generate_spiked_vcf_file(
427
472
  output_dir (Path): Path to the directory to store the generated file.
428
473
  phenopacket (Union[Phenopacket, Family]): Phenopacket or Family containing causative variants.
429
474
  phenopacket_path (Path): Path to the Phenopacket file.
430
- chosen_template_vcf (Path): Path to the chosen template VCF file.
431
-
475
+ hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
476
+ hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
432
477
  Returns:
433
478
  File: The generated File object representing the newly created spiked VCF file.
434
479
  """
435
480
  output_dir.mkdir(exist_ok=True)
436
481
  info_log.info(f" Created a directory {output_dir}")
437
482
  vcf_assembly, spiked_vcf = spike_vcf_contents(
438
- phenopacket, phenopacket_path, chosen_template_vcf
439
- )
440
- spiked_vcf_path = (
441
- output_dir.joinpath(phenopacket_path.name.replace(".json", ".vcf.gz"))
442
- if is_gzipped(chosen_template_vcf)
443
- else output_dir.joinpath(phenopacket_path.name.replace(".json", ".vcf"))
483
+ phenopacket, phenopacket_path, hg19_vcf_info, hg38_vcf_info
444
484
  )
485
+ spiked_vcf_path = output_dir.joinpath(phenopacket_path.name.replace(".json", ".vcf.gz"))
445
486
  VcfWriter(spiked_vcf, spiked_vcf_path).write_vcf_file()
446
487
  return File(
447
488
  uri=urllib.parse.unquote(spiked_vcf_path.as_uri()),
@@ -449,8 +490,19 @@ def generate_spiked_vcf_file(
449
490
  )
450
491
 
451
492
 
493
+ def spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, output_dir, phenopacket_path):
494
+ phenopacket = phenopacket_reader(phenopacket_path)
495
+ spiked_vcf_file_message = generate_spiked_vcf_file(
496
+ output_dir, phenopacket, phenopacket_path, hg19_vcf_info, hg38_vcf_info
497
+ )
498
+ updated_phenopacket = PhenopacketRebuilder(phenopacket).add_spiked_vcf_path(
499
+ spiked_vcf_file_message
500
+ )
501
+ write_phenopacket(updated_phenopacket, phenopacket_path)
502
+
503
+
452
504
  def create_spiked_vcf(
453
- output_dir: Path, phenopacket_path: Path, template_vcf_path: Path, vcf_dir: Path
505
+ output_dir: Path, phenopacket_path: Path, hg19_template_vcf: Path, hg38_template_vcf: Path
454
506
  ) -> None:
455
507
  """
456
508
  Create a spiked VCF for a Phenopacket.
@@ -458,27 +510,21 @@ def create_spiked_vcf(
458
510
  Args:
459
511
  output_dir (Path): The directory to store the generated spiked VCF file.
460
512
  phenopacket_path (Path): Path to the Phenopacket file.
461
- template_vcf_path (Path): Path to the template VCF file (optional).
462
- vcf_dir (Path): Path to the directory containing VCF files (optional).
513
+ hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
514
+ hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
463
515
 
464
516
  Raises:
465
- InputError: If both template_vcf_path and vcf_dir are None.
517
+ InputError: If both hg19_template_vcf and hg38_template_vcf are None.
466
518
  """
467
- if template_vcf_path is None and vcf_dir is None:
468
- raise InputError("Either a template_vcf or vcf_dir must be specified")
469
- vcf_file_path = VcfPicker(template_vcf_path, vcf_dir).pick_file()
470
- phenopacket = phenopacket_reader(phenopacket_path)
471
- spiked_vcf_file_message = generate_spiked_vcf_file(
472
- output_dir, phenopacket, phenopacket_path, vcf_file_path
473
- )
474
- updated_phenopacket = PhenopacketRebuilder(phenopacket).add_spiked_vcf_path(
475
- spiked_vcf_file_message
476
- )
477
- write_phenopacket(updated_phenopacket, phenopacket_path)
519
+ if hg19_template_vcf is None and hg38_template_vcf is None:
520
+ raise InputError("Either a hg19 template vcf or hg38 template vcf must be specified")
521
+ hg19_vcf_info = VcfFile.populate_fields(hg19_template_vcf) if hg19_template_vcf else None
522
+ hg38_vcf_info = VcfFile.populate_fields(hg38_template_vcf) if hg38_template_vcf else None
523
+ spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, output_dir, phenopacket_path)
478
524
 
479
525
 
480
526
  def create_spiked_vcfs(
481
- output_dir: Path, phenopacket_dir: Path, template_vcf_path: Path, vcf_dir: Path
527
+ output_dir: Path, phenopacket_dir: Path, hg19_template_vcf: Path, hg38_template_vcf: Path
482
528
  ) -> None:
483
529
  """
484
530
  Create a spiked VCF for a directory of Phenopackets.
@@ -486,35 +532,26 @@ def create_spiked_vcfs(
486
532
  Args:
487
533
  output_dir (Path): The directory to store the generated spiked VCF file.
488
534
  phenopacket_dir (Path): Path to the Phenopacket directory.
489
- template_vcf_path (Path): Path to the template VCF file (optional).
490
- vcf_dir (Path): Path to the directory containing VCF files (optional).
535
+ hg19_template_vcf (Path): Path to the template hg19 VCF file (optional).
536
+ hg38_template_vcf (Path): Path to the template hg38 VCF file (optional).
491
537
 
492
538
  Raises:
493
- InputError: If both template_vcf_path and vcf_dir are None.
539
+ InputError: If both hg19_template_vcf and hg38_template_vcf are None.
494
540
  """
495
- if template_vcf_path is None and vcf_dir is None:
496
- raise InputError("Either a template_vcf or vcf_dir must be specified")
541
+ if hg19_template_vcf is None and hg38_template_vcf is None:
542
+ raise InputError("Either a hg19 template vcf or hg38 template vcf must be specified")
543
+ hg19_vcf_info = VcfFile.populate_fields(hg19_template_vcf) if hg19_template_vcf else None
544
+ hg38_vcf_info = VcfFile.populate_fields(hg38_template_vcf) if hg38_template_vcf else None
497
545
  for phenopacket_path in files_with_suffix(phenopacket_dir, ".json"):
498
- vcf_file_path = VcfPicker(template_vcf_path, vcf_dir).pick_file()
499
- phenopacket = phenopacket_reader(phenopacket_path)
500
- spiked_vcf_file_message = generate_spiked_vcf_file(
501
- output_dir, phenopacket, phenopacket_path, vcf_file_path
502
- )
503
- updated_phenopacket = PhenopacketRebuilder(phenopacket).add_spiked_vcf_path(
504
- spiked_vcf_file_message
505
- )
506
- write_phenopacket(updated_phenopacket, phenopacket_path)
507
- # or made a lambda one-liner for maximum wtf...
508
- # [spike_vcf(path, output_dir, template_vcf, vcf_dir) for path in phenopacket_dir.iterdir() if path.suffix ==
509
- # ".json"]
546
+ spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, output_dir, phenopacket_path)
510
547
 
511
548
 
512
549
  def spike_vcfs(
513
550
  output_dir: Path,
514
551
  phenopacket_path: Path,
515
552
  phenopacket_dir: Path,
516
- template_vcf_path: Path,
517
- vcf_dir: Path,
553
+ hg19_template_vcf: Path,
554
+ hg38_template_vcf: Path,
518
555
  ) -> None:
519
556
  """
520
557
  Create spiked VCF from either a Phenopacket or a Phenopacket directory.
@@ -523,10 +560,10 @@ def spike_vcfs(
523
560
  output_dir (Path): The directory to store the generated spiked VCF file(s).
524
561
  phenopacket_path (Path): Path to a single Phenopacket file (optional).
525
562
  phenopacket_dir (Path): Path to a directory containing Phenopacket files (optional).
526
- template_vcf_path (Path): Path to the template VCF file (optional).
527
- vcf_dir (Path): Path to the directory containing VCF files (optional).
563
+ hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
564
+ hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
528
565
  """
529
566
  if phenopacket_path is not None:
530
- create_spiked_vcf(output_dir, phenopacket_path, template_vcf_path, vcf_dir)
567
+ create_spiked_vcf(output_dir, phenopacket_path, hg19_template_vcf, hg38_template_vcf)
531
568
  elif phenopacket_dir is not None:
532
- create_spiked_vcfs(output_dir, phenopacket_dir, template_vcf_path, vcf_dir)
569
+ create_spiked_vcfs(output_dir, phenopacket_dir, hg19_template_vcf, hg38_template_vcf)
@@ -0,0 +1,67 @@
1
+ import logging
2
+ from pathlib import Path
3
+
4
+ from pheval.prepare.create_spiked_vcf import create_spiked_vcf
5
+ from pheval.prepare.update_phenopacket import create_updated_phenopacket
6
+ from pheval.utils.file_utils import all_files
7
+ from pheval.utils.phenopacket_utils import PhenopacketUtil, phenopacket_reader
8
+
9
+ info_log = logging.getLogger("info")
10
+
11
+
12
+ def prepare_corpus(
13
+ phenopacket_dir: Path,
14
+ variant_analysis: bool,
15
+ gene_analysis: bool,
16
+ disease_analysis: bool,
17
+ gene_identifier: str,
18
+ hg19_template_vcf: Path,
19
+ hg38_template_vcf: Path,
20
+ output_dir: Path,
21
+ ) -> None:
22
+ """
23
+ Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating
24
+ gene identifiers.
25
+
26
+ Args:
27
+ phenopacket_dir (Path): The path to the directory containing Phenopackets.
28
+ variant_analysis (bool): If True, check for complete variant records in the Phenopackets.
29
+ gene_analysis (bool): If True, check for complete gene records in the Phenopackets.
30
+ disease_analysis (bool): If True, check for complete disease records in the Phenopackets.
31
+ gene_identifier (str): Identifier for updating gene identifiers, if applicable.
32
+ hg19_template_vcf (Path): Path to the hg19 template VCF file (optional), to spike variants into
33
+ VCFs for variant-based analysis; at least one of hg19_template_vcf or hg38_template_vcf is required.
34
+ hg38_template_vcf (Path): Path to the hg38 template VCF file (optional), to spike variants into
35
+ VCFs for variant-based analysis; at least one of hg19_template_vcf or hg38_template_vcf is required.
36
+ output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files.
37
+ """
38
+ output_dir.joinpath("phenopackets").mkdir(exist_ok=True, parents=True)
39
+ for phenopacket_path in all_files(phenopacket_dir):
40
+ phenopacket_util = PhenopacketUtil(phenopacket_reader(phenopacket_path))
41
+ if variant_analysis:
42
+ if phenopacket_util.check_incomplete_variant_record():
43
+ info_log.warning(
44
+ f"Removed {phenopacket_path.name} from the corpus due to missing variant fields."
45
+ )
46
+ continue
47
+ if gene_analysis:
48
+ if phenopacket_util.check_incomplete_gene_record():
49
+ info_log.warning(
50
+ f"Removed {phenopacket_path.name} from the corpus due to missing gene fields."
51
+ )
52
+ continue
53
+ if disease_analysis:
54
+ if phenopacket_util.check_incomplete_disease_record():
55
+ info_log.warning(
56
+ f"Removed {phenopacket_path.name} from the corpus due to missing disease fields."
57
+ )
58
+ continue
59
+ if gene_identifier:
60
+ create_updated_phenopacket(
61
+ gene_identifier, phenopacket_path, output_dir.joinpath("phenopackets")
62
+ )
63
+ if hg19_template_vcf or hg38_template_vcf:
64
+ output_dir.joinpath("vcf").mkdir(exist_ok=True)
65
+ create_spiked_vcf(
66
+ output_dir.joinpath("vcf"), phenopacket_path, hg19_template_vcf, hg38_template_vcf
67
+ )
@@ -38,8 +38,7 @@ def update_outdated_gene_context(
38
38
  interpretations = PhenopacketUtil(phenopacket).interpretations()
39
39
  updated_interpretations = GeneIdentifierUpdater(
40
40
  hgnc_data=hgnc_data, gene_identifier=gene_identifier
41
- ).update_genomic_interpretations_gene_identifier(interpretations)
42
-
41
+ ).update_genomic_interpretations_gene_identifier(interpretations, phenopacket_path)
43
42
  return PhenopacketRebuilder(phenopacket).update_interpretations(updated_interpretations)
44
43
 
45
44
 
@@ -1,6 +1,5 @@
1
1
  import json
2
-
3
- # import logging
2
+ import logging
4
3
  import os
5
4
  from collections import defaultdict
6
5
  from copy import copy
@@ -22,6 +21,8 @@ from phenopackets import (
22
21
 
23
22
  from pheval.prepare.custom_exceptions import IncorrectFileFormatError
24
23
 
24
+ info_log = logging.getLogger("info")
25
+
25
26
 
26
27
  class IncompatibleGenomeAssemblyError(Exception):
27
28
  """Exception raised for incompatible genome assembly."""
@@ -467,7 +468,9 @@ class PhenopacketUtil:
467
468
  for i in pheno_interpretation:
468
469
  for g in i.diagnosis.genomic_interpretations:
469
470
  variant = GenomicVariant(
470
- chrom=g.variant_interpretation.variation_descriptor.vcf_record.chrom,
471
+ chrom=g.variant_interpretation.variation_descriptor.vcf_record.chrom.replace(
472
+ "chr", ""
473
+ ),
471
474
  pos=g.variant_interpretation.variation_descriptor.vcf_record.pos,
472
475
  ref=g.variant_interpretation.variation_descriptor.vcf_record.ref,
473
476
  alt=g.variant_interpretation.variation_descriptor.vcf_record.alt,
@@ -475,6 +478,59 @@ class PhenopacketUtil:
475
478
  variants.append(variant)
476
479
  return variants
477
480
 
481
def check_incomplete_variant_record(self) -> bool:
    """
    Report whether the phenopacket holds a variant record with missing fields.

    Every variant returned by ``self.diagnosed_variants()`` is inspected. A record
    counts as incomplete when its chromosome, reference allele or alternate allele
    is an empty string, or when its position is ``0`` or an empty string.

    Returns:
        bool: True as soon as one incomplete variant record is found; False when
        every record is complete, including when there are no records at all.
    """
    return any(
        record.chrom == ""
        or record.pos == 0
        or record.pos == ""
        or record.ref == ""
        or record.alt == ""
        for record in self.diagnosed_variants()
    )
503
+
504
def check_incomplete_gene_record(self) -> bool:
    """
    Report whether the phenopacket holds a gene record with missing fields.

    Every gene returned by ``self.diagnosed_genes()`` is inspected. A record counts
    as incomplete when its gene symbol or its gene identifier is an empty string.

    Returns:
        bool: True as soon as one incomplete gene record is found; False when every
        record is complete, including when there are no records at all.
    """
    return any(
        entry.gene_symbol == "" or entry.gene_identifier == ""
        for entry in self.diagnosed_genes()
    )
519
+
520
def check_incomplete_disease_record(self) -> bool:
    """
    Report whether the phenopacket lacks any diagnosed disease record.

    Only the number of entries returned by ``self.diagnoses()`` is examined; the
    individual disease names and identifiers are not inspected here.

    Returns:
        bool: True if there are no diagnoses, False otherwise.
    """
    return len(self.diagnoses()) == 0
533
+
478
534
 
479
535
  class PhenopacketRebuilder:
480
536
  """Class for rebuilding a Phenopacket"""
@@ -653,7 +709,7 @@ class GeneIdentifierUpdater:
653
709
  ]
654
710
 
655
711
  def update_genomic_interpretations_gene_identifier(
656
- self, interpretations: List[Interpretation]
712
+ self, interpretations: List[Interpretation], phenopacket_path: Path
657
713
  ) -> List[Interpretation]:
658
714
  """
659
715
  Update the genomic interpretations of a Phenopacket.
@@ -667,10 +723,16 @@ class GeneIdentifierUpdater:
667
723
  updated_interpretations = copy(list(interpretations))
668
724
  for updated_interpretation in updated_interpretations:
669
725
  for g in updated_interpretation.diagnosis.genomic_interpretations:
726
+ updated_gene_identifier = self.find_identifier(
727
+ g.variant_interpretation.variation_descriptor.gene_context.symbol
728
+ )
729
+ info_log.info(
730
+ f"Updating gene identifier in {phenopacket_path} from "
731
+ f"{g.variant_interpretation.variation_descriptor.gene_context.value_id}"
732
+ f"to {updated_gene_identifier}"
733
+ )
670
734
  g.variant_interpretation.variation_descriptor.gene_context.value_id = (
671
- self.find_identifier(
672
- g.variant_interpretation.variation_descriptor.gene_context.symbol
673
- )
735
+ updated_gene_identifier
674
736
  )
675
737
  del g.variant_interpretation.variation_descriptor.gene_context.alternate_ids[:]
676
738
  g.variant_interpretation.variation_descriptor.gene_context.alternate_ids.extend(
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes