pheval 0.3.7__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pheval might be problematic. Click here for more details.

@@ -1,6 +1,8 @@
1
+ import ast
2
+ import re
1
3
  from collections import defaultdict
2
4
  from pathlib import Path
3
- from typing import List
5
+ from typing import List, Union
4
6
 
5
7
  from pheval.analyse.benchmarking_data import BenchmarkRunResults
6
8
  from pheval.analyse.binary_classification_stats import BinaryClassificationStats
@@ -140,6 +142,24 @@ class AssessGenePrioritisation:
140
142
  )
141
143
  )
142
144
 
145
@staticmethod
def _check_string_representation(entity: str) -> Union[List[str], str]:
    """
    Return the list encoded by a list-like string, otherwise return the input unchanged.

    Args:
        entity (str): The input entity to check. A value that is already a list
            is accepted and returned as-is.

    Returns:
        Union[List[str], str]: The parsed list if the input is a string representation
        of a flat list that can be evaluated as a Python literal, otherwise the original input.
    """
    # Already-parsed lists need no conversion; passing one to ast.literal_eval
    # would raise ValueError because it only accepts strings or AST nodes.
    if isinstance(entity, list):
        return entity
    text = str(entity)
    # Flat, bracketed, comma-separated tokens with no nested brackets or inner whitespace.
    list_pattern = re.compile(r"^\[\s*(?:[^\[\],\s]+(?:\s*,\s*[^\[\],\s]+)*)?\s*\]$")
    if not list_pattern.match(text):
        return entity
    try:
        # Evaluate the same text that was matched, not the raw object.
        return ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # The pattern also admits unquoted tokens (e.g. "[BRCA1, TP53]") that are not
        # valid Python literals; treat those as plain strings instead of crashing.
        return entity
162
+
143
163
  def assess_gene_prioritisation(
144
164
  self,
145
165
  rank_stats: RankStats,
@@ -161,9 +181,21 @@ class AssessGenePrioritisation:
161
181
  rank_stats.total += 1
162
182
  gene_match = GenePrioritisationResult(self.phenopacket_path, gene.gene_symbol)
163
183
  for standardised_gene_result in self.standardised_gene_results:
184
+ gene_identifier = self._check_string_representation(
185
+ standardised_gene_result.gene_identifier
186
+ )
187
+ gene_symbol = self._check_string_representation(
188
+ standardised_gene_result.gene_symbol
189
+ )
164
190
  if (
165
- gene.gene_identifier == standardised_gene_result.gene_identifier
166
- or gene.gene_symbol == standardised_gene_result.gene_symbol
191
+ isinstance(gene_identifier, list)
192
+ and gene.gene_identifier in gene_identifier
193
+ or isinstance(gene_identifier, str)
194
+ and gene.gene_identifier == str
195
+ or isinstance(gene_symbol, list)
196
+ and gene.gene_symbol in gene_symbol
197
+ or isinstance(gene_symbol, str)
198
+ and gene.gene_symbol == gene_symbol
167
199
  ):
168
200
  gene_match = self._record_matched_gene(
169
201
  gene, rank_stats, standardised_gene_result
@@ -162,8 +162,8 @@ class AssessVariantPrioritisation:
162
162
  variant_match = VariantPrioritisationResult(self.phenopacket_path, variant)
163
163
  for result in self.standardised_variant_results:
164
164
  result_variant = GenomicVariant(
165
- chrom=result.chromosome,
166
- pos=result.start,
165
+ chrom=str(result.chromosome),
166
+ pos=int(result.start),
167
167
  ref=result.ref,
168
168
  alt=result.alt,
169
169
  )
@@ -260,6 +260,8 @@ def update_phenopackets_command(
260
260
  required=False,
261
261
  help="Template hg19 VCF file",
262
262
  type=Path,
263
+ cls=MutuallyExclusiveOptionError,
264
+ mutually_exclusive=["hg19_vcf_dir"],
263
265
  )
264
266
  @click.option(
265
267
  "--hg38-template-vcf",
@@ -268,6 +270,28 @@ def update_phenopackets_command(
268
270
  required=False,
269
271
  help="Template hg38 VCF file",
270
272
  type=Path,
273
+ cls=MutuallyExclusiveOptionError,
274
+ mutually_exclusive=["hg38_vcf_dir"],
275
+ )
276
+ @click.option(
277
+ "--hg19-vcf-dir",
278
+ "-hg19-dir",
279
+ metavar="PATH",
280
+ required=False,
281
+ help="Path to directory containing hg19 VCF templates.",
282
+ type=Path,
283
+ cls=MutuallyExclusiveOptionError,
284
+ mutually_exclusive=["hg19_template_vcf"],
285
+ )
286
+ @click.option(
287
+ "--hg38-vcf-dir",
288
+ "-hg38-dir",
289
+ metavar="PATH",
290
+ required=False,
291
+ help="Path to directory containing hg38 VCF templates.",
292
+ type=Path,
293
+ cls=MutuallyExclusiveOptionError,
294
+ mutually_exclusive=["hg38_template_vcf"],
271
295
  )
272
296
  @click.option(
273
297
  "--output-dir",
@@ -284,6 +308,8 @@ def create_spiked_vcfs_command(
284
308
  output_dir: Path,
285
309
  hg19_template_vcf: Path = None,
286
310
  hg38_template_vcf: Path = None,
311
+ hg19_vcf_dir: Path = None,
312
+ hg38_vcf_dir: Path = None,
287
313
  ):
288
314
  """
289
315
  Create spiked VCF from either a Phenopacket or a Phenopacket directory.
@@ -294,10 +320,20 @@ def create_spiked_vcfs_command(
294
320
  output_dir (Path): The directory to store the generated spiked VCF file(s).
295
321
  hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
296
322
  hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
323
+ hg19_vcf_dir (Path): Path to the directory containing the hg19 VCF files (optional).
324
+ hg38_vcf_dir (Path): Path to the directory containing the hg38 VCF files (optional).
297
325
  """
298
326
  if phenopacket_path is None and phenopacket_dir is None:
299
327
  raise InputError("Either a phenopacket or phenopacket directory must be specified")
300
- spike_vcfs(output_dir, phenopacket_path, phenopacket_dir, hg19_template_vcf, hg38_template_vcf)
328
+ spike_vcfs(
329
+ output_dir,
330
+ phenopacket_path,
331
+ phenopacket_dir,
332
+ hg19_template_vcf,
333
+ hg38_template_vcf,
334
+ hg19_vcf_dir,
335
+ hg38_vcf_dir,
336
+ )
301
337
 
302
338
 
303
339
  @click.command()
@@ -656,6 +692,8 @@ def generate_stats_plot(
656
692
  required=False,
657
693
  help="Template hg19 VCF file",
658
694
  type=Path,
695
+ cls=MutuallyExclusiveOptionError,
696
+ mutually_exclusive=["hg19_vcf_dir"],
659
697
  )
660
698
  @click.option(
661
699
  "--hg38-template-vcf",
@@ -664,6 +702,28 @@ def generate_stats_plot(
664
702
  required=False,
665
703
  help="Template hg38 VCF file",
666
704
  type=Path,
705
+ cls=MutuallyExclusiveOptionError,
706
+ mutually_exclusive=["hg38_vcf_dir"],
707
+ )
708
+ @click.option(
709
+ "--hg19-vcf-dir",
710
+ "-hg19-dir",
711
+ metavar="PATH",
712
+ required=False,
713
+ help="Path to directory containing hg19 VCF templates.",
714
+ type=Path,
715
+ cls=MutuallyExclusiveOptionError,
716
+ mutually_exclusive=["hg19_template_vcf"],
717
+ )
718
+ @click.option(
719
+ "--hg38-vcf-dir",
720
+ "-hg38-dir",
721
+ metavar="PATH",
722
+ required=False,
723
+ help="Path to directory containing hg38 VCF templates.",
724
+ type=Path,
725
+ cls=MutuallyExclusiveOptionError,
726
+ mutually_exclusive=["hg38_template_vcf"],
667
727
  )
668
728
  @click.option(
669
729
  "--output-dir",
@@ -682,23 +742,28 @@ def prepare_corpus_command(
682
742
  gene_identifier: str,
683
743
  hg19_template_vcf: Path,
684
744
  hg38_template_vcf: Path,
745
+ hg19_vcf_dir: Path,
746
+ hg38_vcf_dir: Path,
685
747
  output_dir: Path,
686
748
  ):
687
749
  """
688
750
  Prepare a corpus of Phenopackets for analysis, optionally checking for complete variant records and updating
689
751
  gene identifiers.
690
752
 
691
- Args:
692
- phenopacket_dir (Path): The path to the directory containing Phenopackets.
693
- variant_analysis (bool): If True, check for complete variant records in the Phenopackets.
694
- gene_analysis (bool): If True, check for complete gene records in the Phenopackets.
695
- disease_analysis (bool): If True, check for complete disease records in the Phenopackets.
696
- gene_identifier (str): Identifier for updating gene identifiers, if applicable.
697
- hg19_template_vcf (Path): Path to the hg19 template VCF file (optional), to spike variants into
698
- VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required.
699
- hg38_template_vcf (Path): Path to the hg38 template VCF file (optional), to spike variants into
700
- VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required.
701
- output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files.
753
+ Args:
754
+ phenopacket_dir (Path): The path to the directory containing Phenopackets.
755
+ variant_analysis (bool): If True, check for complete variant records in the Phenopackets.
756
+ gene_analysis (bool): If True, check for complete gene records in the Phenopackets.
757
+ disease_analysis (bool): If True, check for complete disease records in the Phenopackets.
758
+ gene_identifier (str): Identifier for updating gene identifiers, if applicable.
759
+ hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
760
+ hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
761
+ hg19_vcf_dir (Path): Path to the directory containing the hg19 VCF files (optional).
762
+ hg38_vcf_dir (Path): Path to the directory containing the hg38 VCF files (optional).
763
+ output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files.
764
+ Notes:
765
+ To spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf, hg38_template_vcf,
766
+ hg19_vcf_dir or hg38_vcf_dir is required.
702
767
  """
703
768
  prepare_corpus(
704
769
  phenopacket_dir,
@@ -708,5 +773,7 @@ def prepare_corpus_command(
708
773
  gene_identifier,
709
774
  hg19_template_vcf,
710
775
  hg38_template_vcf,
776
+ hg19_vcf_dir,
777
+ hg38_vcf_dir,
711
778
  output_dir,
712
779
  )
@@ -3,6 +3,7 @@ import operator
3
3
  from dataclasses import dataclass
4
4
  from enum import Enum
5
5
  from pathlib import Path
6
+ from typing import List, Union
6
7
 
7
8
  import pandas as pd
8
9
 
@@ -30,8 +31,8 @@ class PhEvalResult:
30
31
  class PhEvalGeneResult(PhEvalResult):
31
32
  """Minimal data required from tool-specific output for gene prioritisation result
32
33
  Args:
33
- gene_symbol (str): The gene symbol for the result entry
34
- gene_identifier (str): The ENSEMBL gene identifier for the result entry
34
+ gene_symbol (Union[List[str], str]): The gene symbol(s) for the result entry
35
+ gene_identifier (Union[List[str], str]): The ENSEMBL gene identifier(s) for the result entry
35
36
  score (float): The score for the gene result entry
36
37
  Notes:
37
38
  While we recommend providing the gene identifier in the ENSEMBL namespace,
@@ -39,8 +40,8 @@ class PhEvalGeneResult(PhEvalResult):
39
40
  in the analysis.
40
41
  """
41
42
 
42
- gene_symbol: str
43
- gene_identifier: str
43
+ gene_symbol: Union[List[str], str]
44
+ gene_identifier: Union[List[str], str]
44
45
  score: float
45
46
 
46
47
 
@@ -1,5 +1,6 @@
1
1
  import gzip
2
2
  import logging
3
+ import random
3
4
  import re
4
5
  import urllib.parse
5
6
  from copy import copy
@@ -10,7 +11,7 @@ from typing import List, Union
10
11
  from phenopackets import Family, File, Phenopacket
11
12
 
12
13
  from pheval.prepare.custom_exceptions import InputError
13
- from pheval.utils.file_utils import files_with_suffix, is_gzipped
14
+ from pheval.utils.file_utils import all_files, files_with_suffix, is_gzipped
14
15
  from pheval.utils.phenopacket_utils import (
15
16
  IncompatibleGenomeAssemblyError,
16
17
  PhenopacketRebuilder,
@@ -207,6 +208,8 @@ def select_vcf_template(
207
208
  proband_causative_variants: List[ProbandCausativeVariant],
208
209
  hg19_vcf_info: VcfFile,
209
210
  hg38_vcf_info: VcfFile,
211
+ hg19_vcf_dir: Path,
212
+ hg38_vcf_dir: Path,
210
213
  ) -> VcfFile:
211
214
  """
212
215
  Select the appropriate VCF template based on the assembly information of the proband causative variants.
@@ -216,6 +219,8 @@ def select_vcf_template(
216
219
  proband_causative_variants (List[ProbandCausativeVariant]): A list of causative variants from the proband.
217
220
  hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
218
221
  hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
222
+ hg19_vcf_dir (Path): The directory containing the hg19 VCF files.
223
+ hg38_vcf_dir (Path): The directory containing the hg38 VCF files.
219
224
 
220
225
  Returns:
221
226
  VcfFile: The selected VCF template file based on the assembly information of the proband causative variants.
@@ -224,11 +229,15 @@ def select_vcf_template(
224
229
  if proband_causative_variants[0].assembly in ["hg19", "GRCh37"]:
225
230
  if hg19_vcf_info:
226
231
  return hg19_vcf_info
232
+ elif hg19_vcf_dir:
233
+ return VcfFile.populate_fields(random.choice(all_files(hg19_vcf_dir)))
227
234
  else:
228
235
  raise InputError("Must specify hg19 template VCF!")
229
236
  elif proband_causative_variants[0].assembly in ["hg38", "GRCh38"]:
230
237
  if hg38_vcf_info:
231
238
  return hg38_vcf_info
239
+ elif hg38_vcf_dir:
240
+ return VcfFile.populate_fields(random.choice(all_files(hg38_vcf_dir)))
232
241
  else:
233
242
  raise InputError("Must specify hg38 template VCF!")
234
243
  else:
@@ -445,6 +454,8 @@ def spike_vcf_contents(
445
454
  phenopacket_path: Path,
446
455
  hg19_vcf_info: VcfFile,
447
456
  hg38_vcf_info: VcfFile,
457
+ hg19_vcf_dir: Path,
458
+ hg38_vcf_dir: Path,
448
459
  ) -> tuple[str, List[str]]:
449
460
  """
450
461
  Spike VCF records with variants obtained from a Phenopacket or Family.
@@ -454,6 +465,8 @@ def spike_vcf_contents(
454
465
  phenopacket_path (Path): Path to the Phenopacket file.
455
466
  hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
456
467
  hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
468
+ hg19_vcf_dir (Path): The directory containing the hg19 VCF files.
469
+ hg38_vcf_dir (Path): The directory containing the hg38 VCF files.
457
470
 
458
471
  Returns:
459
472
  A tuple containing:
@@ -462,7 +475,12 @@ def spike_vcf_contents(
462
475
  """
463
476
  phenopacket_causative_variants = PhenopacketUtil(phenopacket).causative_variants()
464
477
  chosen_template_vcf = select_vcf_template(
465
- phenopacket_path, phenopacket_causative_variants, hg19_vcf_info, hg38_vcf_info
478
+ phenopacket_path,
479
+ phenopacket_causative_variants,
480
+ hg19_vcf_info,
481
+ hg38_vcf_info,
482
+ hg19_vcf_dir,
483
+ hg38_vcf_dir,
466
484
  )
467
485
  check_variant_assembly(
468
486
  phenopacket_causative_variants, chosen_template_vcf.vcf_header, phenopacket_path
@@ -483,6 +501,8 @@ def generate_spiked_vcf_file(
483
501
  phenopacket_path: Path,
484
502
  hg19_vcf_info: VcfFile,
485
503
  hg38_vcf_info: VcfFile,
504
+ hg19_vcf_dir: Path,
505
+ hg38_vcf_dir: Path,
486
506
  ) -> File:
487
507
  """
488
508
  Write spiked VCF contents to a new file.
@@ -493,13 +513,15 @@ def generate_spiked_vcf_file(
493
513
  phenopacket_path (Path): Path to the Phenopacket file.
494
514
  hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
495
515
  hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
516
+ hg19_vcf_dir (Path): The directory containing the hg19 VCF files.
517
+ hg38_vcf_dir (Path): The directory containing the hg38 VCF files.
496
518
  Returns:
497
519
  File: The generated File object representing the newly created spiked VCF file.
498
520
  """
499
521
  output_dir.mkdir(exist_ok=True)
500
522
  info_log.info(f" Created a directory {output_dir}")
501
523
  vcf_assembly, spiked_vcf = spike_vcf_contents(
502
- phenopacket, phenopacket_path, hg19_vcf_info, hg38_vcf_info
524
+ phenopacket, phenopacket_path, hg19_vcf_info, hg38_vcf_info, hg19_vcf_dir, hg38_vcf_dir
503
525
  )
504
526
  spiked_vcf_path = output_dir.joinpath(phenopacket_path.name.replace(".json", ".vcf.gz"))
505
527
  VcfWriter(spiked_vcf, spiked_vcf_path).write_vcf_file()
@@ -509,10 +531,38 @@ def generate_spiked_vcf_file(
509
531
  )
510
532
 
511
533
 
512
- def spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, output_dir, phenopacket_path):
534
+ def spike_and_update_phenopacket(
535
+ hg19_vcf_info: VcfFile,
536
+ hg38_vcf_info: VcfFile,
537
+ hg19_vcf_dir: Path,
538
+ hg38_vcf_dir: Path,
539
+ output_dir: Path,
540
+ phenopacket_path: Path,
541
+ ) -> None:
542
+ """
543
+ Spike the VCF files with genetic variants relevant to the provided Phenopacket, update the Phenopacket
544
+ accordingly, and write the updated Phenopacket to the specified output directory.
545
+
546
+ Args:
547
+ hg19_vcf_info (VcfFile): VCF file info for hg19 template vcf.
548
+ hg38_vcf_info (VcfFile): VCF file info for hg38 template vcf.
549
+ hg19_vcf_dir (Path): The directory containing the hg19 VCF files.
550
+ hg38_vcf_dir (Path): The directory containing the hg38 VCF files.
551
+ output_dir (Path): Directory where the updated Phenopacket will be saved.
552
+ phenopacket_path (Path): Path to the original Phenopacket file.
553
+
554
+ Returns:
555
+ None
556
+ """
513
557
  phenopacket = phenopacket_reader(phenopacket_path)
514
558
  spiked_vcf_file_message = generate_spiked_vcf_file(
515
- output_dir, phenopacket, phenopacket_path, hg19_vcf_info, hg38_vcf_info
559
+ output_dir,
560
+ phenopacket,
561
+ phenopacket_path,
562
+ hg19_vcf_info,
563
+ hg38_vcf_info,
564
+ hg19_vcf_dir,
565
+ hg38_vcf_dir,
516
566
  )
517
567
  updated_phenopacket = PhenopacketRebuilder(phenopacket).add_spiked_vcf_path(
518
568
  spiked_vcf_file_message
@@ -521,7 +571,12 @@ def spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, output_dir, pheno
521
571
 
522
572
 
523
573
  def create_spiked_vcf(
524
- output_dir: Path, phenopacket_path: Path, hg19_template_vcf: Path, hg38_template_vcf: Path
574
+ output_dir: Path,
575
+ phenopacket_path: Path,
576
+ hg19_template_vcf: Path,
577
+ hg38_template_vcf: Path,
578
+ hg19_vcf_dir: Path,
579
+ hg38_vcf_dir: Path,
525
580
  ) -> None:
526
581
  """
527
582
  Create a spiked VCF for a Phenopacket.
@@ -531,6 +586,8 @@ def create_spiked_vcf(
531
586
  phenopacket_path (Path): Path to the Phenopacket file.
532
587
  hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
533
588
  hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
589
+ hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional).
590
+ hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional).
534
591
 
535
592
  Raises:
536
593
  InputError: If both hg19_template_vcf and hg38_template_vcf are None.
@@ -539,11 +596,18 @@ def create_spiked_vcf(
539
596
  raise InputError("Either a hg19 template vcf or hg38 template vcf must be specified")
540
597
  hg19_vcf_info = VcfFile.populate_fields(hg19_template_vcf) if hg19_template_vcf else None
541
598
  hg38_vcf_info = VcfFile.populate_fields(hg38_template_vcf) if hg38_template_vcf else None
542
- spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, output_dir, phenopacket_path)
599
+ spike_and_update_phenopacket(
600
+ hg19_vcf_info, hg38_vcf_info, hg19_vcf_dir, hg38_vcf_dir, output_dir, phenopacket_path
601
+ )
543
602
 
544
603
 
545
604
  def create_spiked_vcfs(
546
- output_dir: Path, phenopacket_dir: Path, hg19_template_vcf: Path, hg38_template_vcf: Path
605
+ output_dir: Path,
606
+ phenopacket_dir: Path,
607
+ hg19_template_vcf: Path,
608
+ hg38_template_vcf: Path,
609
+ hg19_vcf_dir: Path,
610
+ hg38_vcf_dir: Path,
547
611
  ) -> None:
548
612
  """
549
613
  Create a spiked VCF for a directory of Phenopackets.
@@ -553,16 +617,25 @@ def create_spiked_vcfs(
553
617
  phenopacket_dir (Path): Path to the Phenopacket directory.
554
618
  hg19_template_vcf (Path): Path to the template hg19 VCF file (optional).
555
619
  hg38_template_vcf (Path): Path to the template hg38 VCF file (optional).
620
+ hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional).
621
+ hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional).
556
622
 
557
623
  Raises:
558
624
  InputError: If hg19_template_vcf, hg38_template_vcf, hg19_vcf_dir and hg38_vcf_dir are all None.
559
625
  """
560
- if hg19_template_vcf is None and hg38_template_vcf is None:
561
- raise InputError("Either a hg19 template vcf or hg38 template vcf must be specified")
626
+ if (
627
+ hg19_template_vcf is None
628
+ and hg38_template_vcf is None
629
+ and hg19_vcf_dir is None
630
+ and hg38_vcf_dir is None
631
+ ):
632
+ raise InputError("Need to specify a VCF!")
562
633
  hg19_vcf_info = VcfFile.populate_fields(hg19_template_vcf) if hg19_template_vcf else None
563
634
  hg38_vcf_info = VcfFile.populate_fields(hg38_template_vcf) if hg38_template_vcf else None
564
635
  for phenopacket_path in files_with_suffix(phenopacket_dir, ".json"):
565
- spike_and_update_phenopacket(hg19_vcf_info, hg38_vcf_info, output_dir, phenopacket_path)
636
+ spike_and_update_phenopacket(
637
+ hg19_vcf_info, hg38_vcf_info, hg19_vcf_dir, hg38_vcf_dir, output_dir, phenopacket_path
638
+ )
566
639
 
567
640
 
568
641
  def spike_vcfs(
@@ -571,6 +644,8 @@ def spike_vcfs(
571
644
  phenopacket_dir: Path,
572
645
  hg19_template_vcf: Path,
573
646
  hg38_template_vcf: Path,
647
+ hg19_vcf_dir: Path,
648
+ hg38_vcf_dir: Path,
574
649
  ) -> None:
575
650
  """
576
651
  Create spiked VCF from either a Phenopacket or a Phenopacket directory.
@@ -581,8 +656,24 @@ def spike_vcfs(
581
656
  phenopacket_dir (Path): Path to a directory containing Phenopacket files (optional).
582
657
  hg19_template_vcf (Path): Path to the hg19 template VCF file (optional).
583
658
  hg38_template_vcf (Path): Path to the hg38 template VCF file (optional).
659
+ hg19_vcf_dir (Path): The directory containing the hg19 VCF files (optional).
660
+ hg38_vcf_dir (Path): The directory containing the hg38 VCF files (optional).
584
661
  """
585
662
  if phenopacket_path is not None:
586
- create_spiked_vcf(output_dir, phenopacket_path, hg19_template_vcf, hg38_template_vcf)
663
+ create_spiked_vcf(
664
+ output_dir,
665
+ phenopacket_path,
666
+ hg19_template_vcf,
667
+ hg38_template_vcf,
668
+ hg19_vcf_dir,
669
+ hg38_vcf_dir,
670
+ )
587
671
  elif phenopacket_dir is not None:
588
- create_spiked_vcfs(output_dir, phenopacket_dir, hg19_template_vcf, hg38_template_vcf)
672
+ create_spiked_vcfs(
673
+ output_dir,
674
+ phenopacket_dir,
675
+ hg19_template_vcf,
676
+ hg38_template_vcf,
677
+ hg19_vcf_dir,
678
+ hg38_vcf_dir,
679
+ )
@@ -18,6 +18,8 @@ def prepare_corpus(
18
18
  gene_identifier: str,
19
19
  hg19_template_vcf: Path,
20
20
  hg38_template_vcf: Path,
21
+ hg19_vcf_dir: Path,
22
+ hg38_vcf_dir: Path,
21
23
  output_dir: Path,
22
24
  ) -> None:
23
25
  """
@@ -34,7 +36,12 @@ def prepare_corpus(
34
36
  VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required.
35
37
  hg38_template_vcf (Path): Path to the hg38 template VCF file (optional), to spike variants into
36
38
  VCFs for variant-based analysis at least one of hg19_template_vcf or hg38_template_vcf is required.
39
+ hg19_vcf_dir (Path): Path to the directory containing hg19 template VCF files (optional).
40
+ hg38_vcf_dir (Path): Path to the directory containing hg38 template VCF files (optional).
37
41
  output_dir (Path): The directory to save the prepared Phenopackets and, optionally, VCF files.
42
+ Notes:
43
+ To spike variants into VCFs for variant-based analysis at least one of hg19_template_vcf, hg38_template_vcf,
44
+ hg19_vcf_dir or hg38_vcf_dir is required.
38
45
  """
39
46
  output_dir.joinpath("phenopackets").mkdir(exist_ok=True, parents=True)
40
47
  for phenopacket_path in all_files(phenopacket_dir):
@@ -65,7 +72,12 @@ def prepare_corpus(
65
72
  if hg19_template_vcf or hg38_template_vcf:
66
73
  output_dir.joinpath("vcf").mkdir(exist_ok=True)
67
74
  create_spiked_vcf(
68
- output_dir.joinpath("vcf"), phenopacket_path, hg19_template_vcf, hg38_template_vcf
75
+ output_dir.joinpath("vcf"),
76
+ phenopacket_path,
77
+ hg19_template_vcf,
78
+ hg38_template_vcf,
79
+ hg19_vcf_dir,
80
+ hg38_vcf_dir,
69
81
  )
70
82
  if gene_identifier:
71
83
  create_updated_phenopacket(
@@ -468,10 +468,12 @@ class PhenopacketUtil:
468
468
  for i in pheno_interpretation:
469
469
  for g in i.diagnosis.genomic_interpretations:
470
470
  variant = GenomicVariant(
471
- chrom=g.variant_interpretation.variation_descriptor.vcf_record.chrom.replace(
472
- "chr", ""
471
+ chrom=str(
472
+ g.variant_interpretation.variation_descriptor.vcf_record.chrom.replace(
473
+ "chr", ""
474
+ )
473
475
  ),
474
- pos=g.variant_interpretation.variation_descriptor.vcf_record.pos,
476
+ pos=int(g.variant_interpretation.variation_descriptor.vcf_record.pos),
475
477
  ref=g.variant_interpretation.variation_descriptor.vcf_record.ref,
476
478
  alt=g.variant_interpretation.variation_descriptor.vcf_record.alt,
477
479
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pheval
3
- Version: 0.3.7
3
+ Version: 0.3.9
4
4
  Summary:
5
5
  Author: Yasemin Bridges
6
6
  Author-email: y.bridges@qmul.ac.uk
@@ -5,7 +5,7 @@ pheval/analyse/benchmark_generator.py,sha256=AeuwbaPb4j_dyBGPRgEBxQk2NahDb5u4xHy
5
5
  pheval/analyse/benchmarking_data.py,sha256=aNZkWdmWemlnC1Tg35MtR60S9YC71QWS2rMuzkUc3w0,768
6
6
  pheval/analyse/binary_classification_stats.py,sha256=E35YjvGM-zFnuEt8M3pgN03vBab4MH6ih726QKvuogg,12519
7
7
  pheval/analyse/disease_prioritisation_analysis.py,sha256=mGfGYF5Eu7LxyBkAy6xMG1nDURaPiJY4rRQyKDcQe-4,12451
8
- pheval/analyse/gene_prioritisation_analysis.py,sha256=KSEQV6EvqtWESmO4Zc3Q9CwrjoMzxRiFUDKuAVvQtuM,12190
8
+ pheval/analyse/gene_prioritisation_analysis.py,sha256=4GhXTG3hgKicf8UQ1O3YCo0CVgIPIqdVB4IOt2-g5II,13454
9
9
  pheval/analyse/generate_plots.py,sha256=MFORnFTgoelYAahFlu3Dc3Rul4cwCg8Bloxe62vONSc,21350
10
10
  pheval/analyse/generate_summary_outputs.py,sha256=s9pXMSW6xm4ZBe1aCd0UJSaFiKBvpUfPwJ2BI4qfTas,6591
11
11
  pheval/analyse/parse_benchmark_summary.py,sha256=Y8uPTlHTEiaeVBOqxMcdOqjY3ZBtOS3DoRycL78Dzxg,2384
@@ -14,22 +14,22 @@ pheval/analyse/prioritisation_rank_recorder.py,sha256=EVe8DoEvvp0_WMAcjfVxmDGGRF
14
14
  pheval/analyse/prioritisation_result_types.py,sha256=qJoB6O-lFYmzAMcTQeDJZQNLJ6hleoKDYATTkhvFF98,1228
15
15
  pheval/analyse/rank_stats.py,sha256=knj1tsKrly17QgtOUVpqA14UjbO99N3ydkWN4xU6c2k,15785
16
16
  pheval/analyse/run_data_parser.py,sha256=HzBKsJL2skjmrRZdrF3VYzswtKNgbX6U5qhY_kqq9mA,1552
17
- pheval/analyse/variant_prioritisation_analysis.py,sha256=eF3SIvU6MNv1KR8ZmwXvTF4IoNu2qfwaBHA0uKZ8uMc,12186
17
+ pheval/analyse/variant_prioritisation_analysis.py,sha256=XSlAV2G7psXewPIoiUD_4jgFivcG1aOcy1jSPlSil5M,12196
18
18
  pheval/cli.py,sha256=X4tDi7e3VB3v2RawkqIbfv4SFPCBuQwMXMnYCPTGtIo,1570
19
19
  pheval/cli_pheval.py,sha256=fWbKUcPTZZSa1EJEtH_lNn1XE6qRApRHihqUZS5owrA,2424
20
- pheval/cli_pheval_utils.py,sha256=kySsSa7NyewwVwYBMu93y8l5_qSJaVkdXklGchcXExU,20504
20
+ pheval/cli_pheval_utils.py,sha256=4jLSJm4AEXu0SBtXbg4eNYLbCNQqQgjroDpRxQX34-M,22333
21
21
  pheval/config_parser.py,sha256=lh-Dy_FflXJUnRC3HYaEdSvPAsNZWQZlEr1hHQigrTM,1227
22
22
  pheval/constants.py,sha256=TWBgWOc05FGXFu63fs-hEHS2IJkLLAPHtMppiWBfBOg,349
23
23
  pheval/implementations/__init__.py,sha256=BMUTotjTdgy5j5xubWCIQgRXrSQ1ZIcjooer7r299Zo,1228
24
24
  pheval/infra/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
25
  pheval/infra/exomiserdb.py,sha256=pM9-TfjrgurtH4OtM1Enk5oVhIxGQN3rKRlrxHuObTM,5080
26
26
  pheval/post_processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
- pheval/post_processing/post_processing.py,sha256=Xzcrb7I0DiLBT3tp0oM8_L8Ld64fTgRHBstQuNSrFHk,13329
27
+ pheval/post_processing/post_processing.py,sha256=tqeVRWF6PMHpOe681ONeGaqxdviLgVJgze3o6qSpXEg,13438
28
28
  pheval/prepare/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
29
  pheval/prepare/create_noisy_phenopackets.py,sha256=UbBRWDD95BFHPv03VYx04v35AGwJ9ynLltYKqQJHbZ0,11236
30
- pheval/prepare/create_spiked_vcf.py,sha256=A_nIAhoU48nAeocpIu5UE41db4oBGj2cSoT-U-3qQ1Q,21111
30
+ pheval/prepare/create_spiked_vcf.py,sha256=90A-Mi8QKhvN036vtFEVWAHgzHO37itiLYrqYlG4LiA,23953
31
31
  pheval/prepare/custom_exceptions.py,sha256=_G3_95dPtHIs1SviYBV1j7cYc-hxlhuw8hhnYdzByYY,1719
32
- pheval/prepare/prepare_corpus.py,sha256=FweWoYMkS-kzi5RqgrSzkdp_8iWLyoGWMC_GF0szcUg,3692
32
+ pheval/prepare/prepare_corpus.py,sha256=eRvozzezIgAqHAumtqul0WfXfBO1iOBaSlN8fPSn0Nw,4223
33
33
  pheval/prepare/update_phenopacket.py,sha256=21fzUPbwKN6Ey5TSh9PFzjT2x86U19RAE6WmkjG8u28,4770
34
34
  pheval/resources/alternate_ouputs/CADA_results.txt,sha256=Rinn2TtfwFNsx0aEWegKJOkjKnBm-Mf54gdaT3bWP0k,547
35
35
  pheval/resources/alternate_ouputs/DeepPVP_results.txt,sha256=MF9MZJYa4r4PEvFzALpi-lNGLxjENOnq_YgrgFMn-oQ,1508
@@ -47,11 +47,11 @@ pheval/utils/docs_gen.py,sha256=6FGtHicBC0rZKi0tdL3Epsg8d4osE44I9f1Ga0j4JLA,3193
47
47
  pheval/utils/docs_gen.sh,sha256=LyKLKjaZuf4UJ962CWfM-XqkxtvM8O2N9wHZS5mcb9A,477
48
48
  pheval/utils/exomiser.py,sha256=m2u0PH2z9lFPaB3LVkZCmPmH5e55q1NoTzNl46zRRP8,683
49
49
  pheval/utils/file_utils.py,sha256=m21cz-qjDYqnI8ClUv3J9fKizex98a-9bSEerQ75i_c,3576
50
- pheval/utils/phenopacket_utils.py,sha256=4inrnhZ4UjYgO0Y85ls_Nxq6voAIIXQV57_fMeIX-24,26792
50
+ pheval/utils/phenopacket_utils.py,sha256=W9T_X48EJ-xn5GghzbZlt-lI-DxWoSm7_SHr8DCJg2Q,26856
51
51
  pheval/utils/semsim_utils.py,sha256=s7ZCR2VfPYnOh7ApX6rv66eGoVSm9QJaVYOWBEhlXpo,6151
52
52
  pheval/utils/utils.py,sha256=9V6vCT8l1g4O2-ZATYqsVyd7AYZdWGd-Ksy7_oIC3eE,2343
53
- pheval-0.3.7.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
54
- pheval-0.3.7.dist-info/METADATA,sha256=BwicFNwmR9Hm8o2YxBJUJvrIeGwAevFFk-DT2pm07S4,1810
55
- pheval-0.3.7.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
56
- pheval-0.3.7.dist-info/entry_points.txt,sha256=o9gSwDkvT4-lqKy4mlsftd1nzP9WUOXQCfnbqycURd0,81
57
- pheval-0.3.7.dist-info/RECORD,,
53
+ pheval-0.3.9.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
54
+ pheval-0.3.9.dist-info/METADATA,sha256=IdYBy71zIR4Jtcu9B_6ovotDUJQ6w6EMWVQF0zx2Alc,1810
55
+ pheval-0.3.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
56
+ pheval-0.3.9.dist-info/entry_points.txt,sha256=o9gSwDkvT4-lqKy4mlsftd1nzP9WUOXQCfnbqycURd0,81
57
+ pheval-0.3.9.dist-info/RECORD,,
File without changes