pheval 0.3.5__tar.gz → 0.3.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pheval might be problematic. Click here for more details.

Files changed (56)
  1. {pheval-0.3.5 → pheval-0.3.7}/PKG-INFO +1 -1
  2. {pheval-0.3.5 → pheval-0.3.7}/pyproject.toml +1 -1
  3. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/analyse/disease_prioritisation_analysis.py +7 -14
  4. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/analyse/gene_prioritisation_analysis.py +7 -13
  5. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/analyse/parse_pheval_result.py +8 -1
  6. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/analyse/variant_prioritisation_analysis.py +8 -15
  7. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/post_processing/post_processing.py +21 -101
  8. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/prepare/create_spiked_vcf.py +32 -13
  9. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/prepare/prepare_corpus.py +5 -0
  10. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/utils/file_utils.py +0 -29
  11. {pheval-0.3.5 → pheval-0.3.7}/LICENSE +0 -0
  12. {pheval-0.3.5 → pheval-0.3.7}/README.md +0 -0
  13. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/__init__.py +0 -0
  14. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/analyse/__init__.py +0 -0
  15. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/analyse/analysis.py +0 -0
  16. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/analyse/benchmark_generator.py +0 -0
  17. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/analyse/benchmarking_data.py +0 -0
  18. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/analyse/binary_classification_stats.py +0 -0
  19. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/analyse/generate_plots.py +0 -0
  20. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/analyse/generate_summary_outputs.py +0 -0
  21. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/analyse/parse_benchmark_summary.py +0 -0
  22. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/analyse/prioritisation_rank_recorder.py +0 -0
  23. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/analyse/prioritisation_result_types.py +0 -0
  24. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/analyse/rank_stats.py +0 -0
  25. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/analyse/run_data_parser.py +0 -0
  26. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/cli.py +0 -0
  27. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/cli_pheval.py +0 -0
  28. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/cli_pheval_utils.py +0 -0
  29. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/config_parser.py +0 -0
  30. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/constants.py +0 -0
  31. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/implementations/__init__.py +0 -0
  32. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/infra/__init__.py +0 -0
  33. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/infra/exomiserdb.py +0 -0
  34. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/post_processing/__init__.py +0 -0
  35. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/prepare/__init__.py +0 -0
  36. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/prepare/create_noisy_phenopackets.py +0 -0
  37. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/prepare/custom_exceptions.py +0 -0
  38. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/prepare/update_phenopacket.py +0 -0
  39. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/resources/alternate_ouputs/CADA_results.txt +0 -0
  40. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/resources/alternate_ouputs/DeepPVP_results.txt +0 -0
  41. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/resources/alternate_ouputs/OVA_results.txt +0 -0
  42. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/resources/alternate_ouputs/Phen2Gene_results.json +0 -0
  43. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/resources/alternate_ouputs/Phenolyzer_results.txt +0 -0
  44. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/resources/alternate_ouputs/lirical_results.tsv +0 -0
  45. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/resources/alternate_ouputs/svanna_results.tsv +0 -0
  46. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/resources/hgnc_complete_set.txt +0 -0
  47. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/run_metadata.py +0 -0
  48. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/runners/__init__.py +0 -0
  49. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/runners/runner.py +0 -0
  50. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/utils/__init__.py +0 -0
  51. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/utils/docs_gen.py +0 -0
  52. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/utils/docs_gen.sh +0 -0
  53. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/utils/exomiser.py +0 -0
  54. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/utils/phenopacket_utils.py +0 -0
  55. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/utils/semsim_utils.py +0 -0
  56. {pheval-0.3.5 → pheval-0.3.7}/src/pheval/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pheval
3
- Version: 0.3.5
3
+ Version: 0.3.7
4
4
  Summary:
5
5
  Author: Yasemin Bridges
6
6
  Author-email: y.bridges@qmul.ac.uk
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "pheval"
3
- version = "0.3.5"
3
+ version = "0.3.7"
4
4
  description = ""
5
5
  authors = ["Yasemin Bridges <y.bridges@qmul.ac.uk>",
6
6
  "Julius Jacobsen <j.jacobsen@qmul.ac.uk>",
@@ -10,11 +10,7 @@ from pheval.analyse.prioritisation_result_types import DiseasePrioritisationResu
10
10
  from pheval.analyse.rank_stats import RankStats
11
11
  from pheval.analyse.run_data_parser import TrackInputOutputDirectories
12
12
  from pheval.post_processing.post_processing import RankedPhEvalDiseaseResult
13
- from pheval.utils.file_utils import (
14
- all_files,
15
- files_with_suffix,
16
- obtain_phenopacket_path_from_pheval_result,
17
- )
13
+ from pheval.utils.file_utils import all_files
18
14
  from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandDisease, phenopacket_reader
19
15
 
20
16
 
@@ -217,7 +213,7 @@ def _obtain_causative_diseases(phenopacket_path: Path) -> List[ProbandDisease]:
217
213
 
218
214
 
219
215
  def assess_phenopacket_disease_prioritisation(
220
- standardised_disease_result: Path,
216
+ phenopacket_path: Path,
221
217
  score_order: str,
222
218
  results_dir_and_input: TrackInputOutputDirectories,
223
219
  threshold: float,
@@ -230,7 +226,7 @@ def assess_phenopacket_disease_prioritisation(
230
226
  against the recorded causative diseases for a proband in the Phenopacket.
231
227
 
232
228
  Args:
233
- standardised_disease_result (Path): Path to the PhEval standardised disease result file.
229
+ phenopacket_path (Path): Path to the Phenopacket.
234
230
  score_order (str): The order in which scores are arranged, either ascending or descending.
235
231
  results_dir_and_input (TrackInputOutputDirectories): Input and output directories.
236
232
  threshold (float): Threshold for assessment.
@@ -238,8 +234,8 @@ def assess_phenopacket_disease_prioritisation(
238
234
  disease_rank_comparison (defaultdict): Default dictionary for disease rank comparisons.
239
235
  disease_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
240
236
  """
241
- phenopacket_path = obtain_phenopacket_path_from_pheval_result(
242
- standardised_disease_result, all_files(results_dir_and_input.phenopacket_dir)
237
+ standardised_disease_result = results_dir_and_input.results_dir.joinpath(
238
+ f"pheval_disease_results/{phenopacket_path.stem}-pheval_disease_result.tsv"
243
239
  )
244
240
  pheval_disease_result = read_standardised_result(standardised_disease_result)
245
241
  proband_diseases = _obtain_causative_diseases(phenopacket_path)
@@ -276,12 +272,9 @@ def benchmark_disease_prioritisation(
276
272
  """
277
273
  disease_rank_stats = RankStats()
278
274
  disease_binary_classification_stats = BinaryClassificationStats()
279
- for standardised_result in files_with_suffix(
280
- results_directory_and_input.results_dir.joinpath("pheval_disease_results/"),
281
- ".tsv",
282
- ):
275
+ for phenopacket_path in all_files(results_directory_and_input.phenopacket_dir):
283
276
  assess_phenopacket_disease_prioritisation(
284
- standardised_result,
277
+ phenopacket_path,
285
278
  score_order,
286
279
  results_directory_and_input,
287
280
  threshold,
@@ -10,11 +10,7 @@ from pheval.analyse.prioritisation_result_types import GenePrioritisationResult
10
10
  from pheval.analyse.rank_stats import RankStats
11
11
  from pheval.analyse.run_data_parser import TrackInputOutputDirectories
12
12
  from pheval.post_processing.post_processing import RankedPhEvalGeneResult
13
- from pheval.utils.file_utils import (
14
- all_files,
15
- files_with_suffix,
16
- obtain_phenopacket_path_from_pheval_result,
17
- )
13
+ from pheval.utils.file_utils import all_files
18
14
  from pheval.utils.phenopacket_utils import PhenopacketUtil, ProbandCausativeGene, phenopacket_reader
19
15
 
20
16
 
@@ -209,7 +205,7 @@ def _obtain_causative_genes(phenopacket_path: Path) -> List[ProbandCausativeGene
209
205
 
210
206
 
211
207
  def assess_phenopacket_gene_prioritisation(
212
- standardised_gene_result: Path,
208
+ phenopacket_path: Path,
213
209
  score_order: str,
214
210
  results_dir_and_input: TrackInputOutputDirectories,
215
211
  threshold: float,
@@ -222,7 +218,7 @@ def assess_phenopacket_gene_prioritisation(
222
218
  against the recorded causative genes for a proband in the Phenopacket.
223
219
 
224
220
  Args:
225
- standardised_gene_result (Path): Path to the PhEval standardised gene result file.
221
+ phenopacket_path (Path): Path to the Phenopacket.
226
222
  score_order (str): The order in which scores are arranged, either ascending or descending.
227
223
  results_dir_and_input (TrackInputOutputDirectories): Input and output directories.
228
224
  threshold (float): Threshold for assessment.
@@ -230,8 +226,8 @@ def assess_phenopacket_gene_prioritisation(
230
226
  gene_rank_comparison (defaultdict): Default dictionary for gene rank comparisons.
231
227
  gene_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
232
228
  """
233
- phenopacket_path = obtain_phenopacket_path_from_pheval_result(
234
- standardised_gene_result, all_files(results_dir_and_input.phenopacket_dir)
229
+ standardised_gene_result = results_dir_and_input.results_dir.joinpath(
230
+ f"pheval_gene_results/{phenopacket_path.stem}-pheval_gene_result.tsv"
235
231
  )
236
232
  pheval_gene_result = read_standardised_result(standardised_gene_result)
237
233
  proband_causative_genes = _obtain_causative_genes(phenopacket_path)
@@ -266,11 +262,9 @@ def benchmark_gene_prioritisation(
266
262
  """
267
263
  gene_rank_stats = RankStats()
268
264
  gene_binary_classification_stats = BinaryClassificationStats()
269
- for standardised_result in files_with_suffix(
270
- results_directory_and_input.results_dir.joinpath("pheval_gene_results/"), ".tsv"
271
- ):
265
+ for phenopacket_path in all_files(results_directory_and_input.phenopacket_dir):
272
266
  assess_phenopacket_gene_prioritisation(
273
- standardised_result,
267
+ phenopacket_path,
274
268
  score_order,
275
269
  results_directory_and_input,
276
270
  threshold,
@@ -1,3 +1,4 @@
1
+ import logging
1
2
  from pathlib import Path
2
3
  from typing import List
3
4
 
@@ -5,6 +6,8 @@ import pandas as pd
5
6
 
6
7
  from pheval.post_processing.post_processing import PhEvalResult
7
8
 
9
+ info_log = logging.getLogger("info")
10
+
8
11
 
9
12
  def read_standardised_result(standardised_result_path: Path) -> List[dict]:
10
13
  """
@@ -16,7 +19,11 @@ def read_standardised_result(standardised_result_path: Path) -> List[dict]:
16
19
  Returns:
17
20
  List[dict]: A list of dictionaries representing the content of the standardised result file.
18
21
  """
19
- return pd.read_csv(standardised_result_path, delimiter="\t").to_dict("records")
22
+ if standardised_result_path.is_file():
23
+ return pd.read_csv(standardised_result_path, delimiter="\t").to_dict("records")
24
+ else:
25
+ info_log.info(f"Could not find {standardised_result_path}")
26
+ return pd.DataFrame().to_dict("records")
20
27
 
21
28
 
22
29
  def parse_pheval_result(
@@ -10,11 +10,7 @@ from pheval.analyse.prioritisation_result_types import VariantPrioritisationResu
10
10
  from pheval.analyse.rank_stats import RankStats
11
11
  from pheval.analyse.run_data_parser import TrackInputOutputDirectories
12
12
  from pheval.post_processing.post_processing import RankedPhEvalVariantResult
13
- from pheval.utils.file_utils import (
14
- all_files,
15
- files_with_suffix,
16
- obtain_phenopacket_path_from_pheval_result,
17
- )
13
+ from pheval.utils.file_utils import all_files
18
14
  from pheval.utils.phenopacket_utils import GenomicVariant, PhenopacketUtil, phenopacket_reader
19
15
 
20
16
 
@@ -211,7 +207,7 @@ def _obtain_causative_variants(phenopacket_path: Path) -> List[GenomicVariant]:
211
207
 
212
208
 
213
209
  def assess_phenopacket_variant_prioritisation(
214
- standardised_variant_result: Path,
210
+ phenopacket_path: Path,
215
211
  score_order: str,
216
212
  results_dir_and_input: TrackInputOutputDirectories,
217
213
  threshold: float,
@@ -224,7 +220,7 @@ def assess_phenopacket_variant_prioritisation(
224
220
  against the recorded causative variants for a proband in the Phenopacket.
225
221
 
226
222
  Args:
227
- standardised_variant_result (Path): Path to the PhEval standardised variant result file.
223
+ phenopacket_path (Path): Path to the Phenopacket.
228
224
  score_order (str): The order in which scores are arranged, either ascending or descending.
229
225
  results_dir_and_input (TrackInputOutputDirectories): Input and output directories.
230
226
  threshold (float): Threshold for assessment.
@@ -232,10 +228,10 @@ def assess_phenopacket_variant_prioritisation(
232
228
  variant_rank_comparison (defaultdict): Default dictionary for variant rank comparisons.
233
229
  variant_binary_classification_stats (BinaryClassificationStats): BinaryClassificationStats class instance.
234
230
  """
235
- phenopacket_path = obtain_phenopacket_path_from_pheval_result(
236
- standardised_variant_result, all_files(results_dir_and_input.phenopacket_dir)
237
- )
238
231
  proband_causative_variants = _obtain_causative_variants(phenopacket_path)
232
+ standardised_variant_result = results_dir_and_input.results_dir.joinpath(
233
+ f"pheval_variant_results/{phenopacket_path.stem}-pheval_variant_result.tsv"
234
+ )
239
235
  pheval_variant_result = read_standardised_result(standardised_variant_result)
240
236
  AssessVariantPrioritisation(
241
237
  phenopacket_path,
@@ -270,12 +266,9 @@ def benchmark_variant_prioritisation(
270
266
  """
271
267
  variant_rank_stats = RankStats()
272
268
  variant_binary_classification_stats = BinaryClassificationStats()
273
- for standardised_result in files_with_suffix(
274
- results_directory_and_input.results_dir.joinpath("pheval_variant_results/"),
275
- ".tsv",
276
- ):
269
+ for phenopacket_path in all_files(results_directory_and_input.phenopacket_dir):
277
270
  assess_phenopacket_variant_prioritisation(
278
- standardised_result,
271
+ phenopacket_path,
279
272
  score_order,
280
273
  results_directory_and_input,
281
274
  threshold,
@@ -227,68 +227,7 @@ class ResultSorter:
227
227
  )
228
228
 
229
229
 
230
- class ScoreRanker:
231
- """
232
- Class for ranking scores based on a given sort order
233
-
234
- Attributes:
235
- rank (int): Represents the current rank, initialised with 0
236
- current_score (float): Represents the current score, initialised with positive infinity (float("inf"))
237
- count (int): Used for counting, initialised with 0
238
- """
239
-
240
- rank: int = 0
241
- current_score: float = float("inf")
242
- count: int = 0
243
-
244
- def __init__(self, sort_order: SortOrder):
245
- """
246
- Initialise ScoreRanker
247
-
248
- Args:
249
- sort_order (SortOrder): Sorting order to be applied
250
- """
251
- self.sort_order = sort_order
252
-
253
- def _check_rank_order(self, round_score: float) -> None:
254
- """
255
- Check if the results are correctly ordered
256
-
257
- Args:
258
- round_score (float): Score to be checked against the current score
259
-
260
- Raises:
261
- ValueError: If results are not correctly sorted.
262
- """
263
- if self.sort_order == SortOrder.ASCENDING and round_score < self.current_score != float(
264
- "inf"
265
- ):
266
- raise ValueError("Results are not correctly sorted!")
267
- elif self.sort_order == SortOrder.DESCENDING and round_score > self.current_score != float(
268
- "inf"
269
- ):
270
- raise ValueError("Results are not correctly sorted!")
271
-
272
- def rank_scores(self, round_score: float) -> int:
273
- """
274
- Add ranks to a result; equal scores are given the same rank, e.g., 1, 1, 3
275
-
276
- Args:
277
- round_score (float): Score to be ranked
278
-
279
- Returns:
280
- int: Rank assigned to the score
281
- """
282
- self._check_rank_order(round_score)
283
- self.count += 1
284
- if self.current_score == round_score:
285
- return self.rank
286
- self.current_score = round_score
287
- self.rank = self.count
288
- return self.rank
289
-
290
-
291
- def _rank_pheval_result(pheval_result: [PhEvalResult], sort_order: SortOrder) -> [PhEvalResult]:
230
+ def _rank_pheval_result(pheval_result: [PhEvalResult], sort_order: SortOrder) -> pd.DataFrame:
292
231
  """
293
232
  Rank PhEval results post-processed from tool-specific output, managing tied scores (ex aequo)
294
233
 
@@ -297,35 +236,17 @@ def _rank_pheval_result(pheval_result: [PhEvalResult], sort_order: SortOrder) ->
297
236
  sort_order (SortOrder): Sorting order based on which ranking is performed
298
237
 
299
238
  Returns:
300
- List[PhEvalResult]: Ranked PhEval results with tied scores managed
239
+ pd.DataFrame : Ranked PhEval results with tied scores managed
301
240
 
302
241
  Raises:
303
242
  ValueError: If an incompatible PhEval result type is encountered
304
243
  """
305
- score_ranker = ScoreRanker(sort_order)
306
- ranked_result = []
307
- for result in pheval_result:
308
- if type(result) == PhEvalGeneResult:
309
- ranked_result.append(
310
- RankedPhEvalGeneResult.from_gene_result(
311
- result, score_ranker.rank_scores(result.score)
312
- )
313
- )
314
- elif type(result) == PhEvalVariantResult:
315
- ranked_result.append(
316
- RankedPhEvalVariantResult.from_variant_result(
317
- result, score_ranker.rank_scores(result.score)
318
- )
319
- )
320
- elif type(result) == PhEvalDiseaseResult:
321
- ranked_result.append(
322
- RankedPhEvalDiseaseResult.from_disease_result(
323
- result, score_ranker.rank_scores(result.score)
324
- )
325
- )
326
- else:
327
- raise ValueError("Incompatible PhEval result type.")
328
- return ranked_result
244
+ pheval_result_df = pd.DataFrame([data.__dict__ for data in pheval_result])
245
+ if sort_order == SortOrder.ASCENDING:
246
+ pheval_result_df["rank"] = pheval_result_df["score"].rank(method="max", ascending=True)
247
+ elif sort_order == SortOrder.DESCENDING:
248
+ pheval_result_df["rank"] = pheval_result_df["score"].rank(method="max", ascending=False)
249
+ return pheval_result_df
329
250
 
330
251
 
331
252
  def _return_sort_order(sort_order_str: str) -> SortOrder:
@@ -347,7 +268,7 @@ def _return_sort_order(sort_order_str: str) -> SortOrder:
347
268
  raise ValueError("Incompatible ordering method specified.")
348
269
 
349
270
 
350
- def _create_pheval_result(pheval_result: [PhEvalResult], sort_order_str: str) -> [PhEvalResult]:
271
+ def _create_pheval_result(pheval_result: [PhEvalResult], sort_order_str: str) -> pd.DataFrame:
351
272
  """
352
273
  Create PhEval results with corresponding ranks based on the specified sorting order.
353
274
 
@@ -356,7 +277,7 @@ def _create_pheval_result(pheval_result: [PhEvalResult], sort_order_str: str) ->
356
277
  sort_order_str (str): String representation of the desired sorting order.
357
278
 
358
279
  Returns:
359
- List[PhEvalResult]: PhEval results with ranks assigned.
280
+ pd.DataFrame: PhEval results with ranks assigned.
360
281
  """
361
282
  sort_order = _return_sort_order(sort_order_str)
362
283
  sorted_pheval_result = ResultSorter(pheval_result, sort_order).sort_pheval_results()
@@ -364,7 +285,7 @@ def _create_pheval_result(pheval_result: [PhEvalResult], sort_order_str: str) ->
364
285
 
365
286
 
366
287
  def _write_pheval_gene_result(
367
- ranked_pheval_result: [PhEvalResult], output_dir: Path, tool_result_path: Path
288
+ ranked_pheval_result: pd.DataFrame, output_dir: Path, tool_result_path: Path
368
289
  ) -> None:
369
290
  """
370
291
  Write ranked PhEval gene results to a TSV file
@@ -374,8 +295,9 @@ def _write_pheval_gene_result(
374
295
  output_dir (Path): Path to the output directory
375
296
  tool_result_path (Path): Path to the tool-specific result file
376
297
  """
377
- ranked_result = pd.DataFrame([data.__dict__ for data in ranked_pheval_result])
378
- pheval_gene_output = ranked_result.loc[:, ["rank", "score", "gene_symbol", "gene_identifier"]]
298
+ pheval_gene_output = ranked_pheval_result.loc[
299
+ :, ["rank", "score", "gene_symbol", "gene_identifier"]
300
+ ]
379
301
  pheval_gene_output.to_csv(
380
302
  output_dir.joinpath(
381
303
  "pheval_gene_results/" + tool_result_path.stem + "-pheval_gene_result.tsv"
@@ -386,7 +308,7 @@ def _write_pheval_gene_result(
386
308
 
387
309
 
388
310
  def _write_pheval_variant_result(
389
- ranked_pheval_result: [PhEvalResult], output_dir: Path, tool_result_path: Path
311
+ ranked_pheval_result: pd.DataFrame, output_dir: Path, tool_result_path: Path
390
312
  ) -> None:
391
313
  """
392
314
  Write ranked PhEval variant results to a TSV file
@@ -396,8 +318,7 @@ def _write_pheval_variant_result(
396
318
  output_dir (Path): Path to the output directory
397
319
  tool_result_path (Path): Path to the tool-specific result file
398
320
  """
399
- ranked_result = pd.DataFrame([data.__dict__ for data in ranked_pheval_result])
400
- pheval_variant_output = ranked_result.loc[
321
+ pheval_variant_output = ranked_pheval_result.loc[
401
322
  :, ["rank", "score", "chromosome", "start", "end", "ref", "alt"]
402
323
  ]
403
324
  pheval_variant_output.to_csv(
@@ -410,7 +331,7 @@ def _write_pheval_variant_result(
410
331
 
411
332
 
412
333
  def _write_pheval_disease_result(
413
- ranked_pheval_result: [RankedPhEvalDiseaseResult], output_dir: Path, tool_result_path: Path
334
+ ranked_pheval_result: pd.DataFrame, output_dir: Path, tool_result_path: Path
414
335
  ) -> None:
415
336
  """
416
337
  Write ranked PhEval disease results to a TSV file
@@ -420,8 +341,7 @@ def _write_pheval_disease_result(
420
341
  output_dir (Path): Path to the output directory
421
342
  tool_result_path (Path): Path to the tool-specific result file
422
343
  """
423
- ranked_result = pd.DataFrame([data.__dict__ for data in ranked_pheval_result])
424
- pheval_disease_output = ranked_result.loc[
344
+ pheval_disease_output = ranked_pheval_result.loc[
425
345
  :, ["rank", "score", "disease_name", "disease_identifier"]
426
346
  ]
427
347
  pheval_disease_output.to_csv(
@@ -455,11 +375,11 @@ def generate_pheval_result(
455
375
  info_log.warning(f"No results found for {tool_result_path.name}")
456
376
  return
457
377
  ranked_pheval_result = _create_pheval_result(pheval_result, sort_order_str)
458
- if all(isinstance(result, RankedPhEvalGeneResult) for result in ranked_pheval_result):
378
+ if all(isinstance(result, PhEvalGeneResult) for result in pheval_result):
459
379
  _write_pheval_gene_result(ranked_pheval_result, output_dir, tool_result_path)
460
- elif all(isinstance(result, RankedPhEvalVariantResult) for result in ranked_pheval_result):
380
+ elif all(isinstance(result, PhEvalVariantResult) for result in pheval_result):
461
381
  _write_pheval_variant_result(ranked_pheval_result, output_dir, tool_result_path)
462
- elif all(isinstance(result, RankedPhEvalDiseaseResult) for result in ranked_pheval_result):
382
+ elif all(isinstance(result, PhEvalDiseaseResult) for result in pheval_result):
463
383
  _write_pheval_disease_result(ranked_pheval_result, output_dir, tool_result_path)
464
384
  else:
465
385
  raise ValueError("Results are not all of the same type.")
@@ -328,22 +328,35 @@ class VcfSpiker:
328
328
  genotype_codes[proband_variant_data.genotype.lower()] + "\n",
329
329
  ]
330
330
 
331
- def construct_vcf_records(self) -> List[str]:
331
+ def construct_vcf_records(self, template_vcf_name: str) -> List[str]:
332
332
  """
333
333
  Construct updated VCF records by inserting spiked variants into the correct positions within the VCF.
334
334
 
335
+ Args:
336
+ template_vcf_name (str): Name of the template VCF file.
337
+
335
338
  Returns:
336
339
  List[str]: Updated VCF records containing the spiked variants.
337
340
  """
338
341
  updated_vcf_records = copy(self.vcf_contents)
339
342
  for variant in self.proband_causative_variants:
340
- variant = self.construct_variant_entry(variant)
341
- variant_entry_position = [
343
+ variant_entry = self.construct_variant_entry(variant)
344
+ matching_indices = [
342
345
  i
343
346
  for i, val in enumerate(updated_vcf_records)
344
- if val.split("\t")[0] == variant[0] and int(val.split("\t")[1]) < int(variant[1])
345
- ][-1] + 1
346
- updated_vcf_records.insert(variant_entry_position, "\t".join(variant))
347
+ if val.split("\t")[0] == variant_entry[0]
348
+ and int(val.split("\t")[1]) < int(variant_entry[1])
349
+ ]
350
+ if matching_indices:
351
+ variant_entry_position = matching_indices[-1] + 1
352
+ else:
353
+ info_log.warning(
354
+ f"Could not find entry position for {variant.variant.chrom}-{variant.variant.pos}-"
355
+ f"{variant.variant.ref}-{variant.variant.alt} in {template_vcf_name}, "
356
+ "inserting at end of VCF contents."
357
+ )
358
+ variant_entry_position = len(updated_vcf_records)
359
+ updated_vcf_records.insert(variant_entry_position, "\t".join(variant_entry))
347
360
  return updated_vcf_records
348
361
 
349
362
  def construct_header(self, updated_vcf_records: List[str]) -> List[str]:
@@ -358,21 +371,27 @@ class VcfSpiker:
358
371
  """
359
372
  updated_vcf_file = []
360
373
  for line in updated_vcf_records:
361
- text = line.replace(
362
- self.vcf_header.sample_id,
363
- self.proband_causative_variants[0].proband_id,
364
- )
374
+ if line.startswith("#"):
375
+ text = line.replace(
376
+ self.vcf_header.sample_id,
377
+ self.proband_causative_variants[0].proband_id,
378
+ )
379
+ else:
380
+ text = line
365
381
  updated_vcf_file.append(text)
366
382
  return updated_vcf_file
367
383
 
368
- def construct_vcf(self) -> List[str]:
384
+ def construct_vcf(self, template_vcf_name: str) -> List[str]:
369
385
  """
370
386
  Construct the entire spiked VCF file by incorporating the spiked variants into the VCF.
371
387
 
388
+ Args:
389
+ template_vcf_name (str): Name of the template VCF file.
390
+
372
391
  Returns:
373
392
  List[str]: The complete spiked VCF file content as a list of strings.
374
393
  """
375
- return self.construct_header(self.construct_vcf_records())
394
+ return self.construct_header(self.construct_vcf_records(template_vcf_name))
376
395
 
377
396
 
378
397
  class VcfWriter:
@@ -454,7 +473,7 @@ def spike_vcf_contents(
454
473
  chosen_template_vcf.vcf_contents,
455
474
  phenopacket_causative_variants,
456
475
  chosen_template_vcf.vcf_header,
457
- ).construct_vcf(),
476
+ ).construct_vcf(chosen_template_vcf.vcf_file_name),
458
477
  )
459
478
 
460
479
 
@@ -39,6 +39,11 @@ def prepare_corpus(
39
39
  output_dir.joinpath("phenopackets").mkdir(exist_ok=True, parents=True)
40
40
  for phenopacket_path in all_files(phenopacket_dir):
41
41
  phenopacket_util = PhenopacketUtil(phenopacket_reader(phenopacket_path))
42
+ if not phenopacket_util.observed_phenotypic_features():
43
+ info_log.warning(
44
+ f"Removed {phenopacket_path.name} from the corpus due to no observed phenotypic features."
45
+ )
46
+ continue
42
47
  if variant_analysis:
43
48
  if phenopacket_util.check_incomplete_variant_record():
44
49
  info_log.warning(
@@ -70,35 +70,6 @@ def normalise_file_name(file_path: Path) -> str:
70
70
  return re.sub("[\u0300-\u036f]", "", normalised_file_name)
71
71
 
72
72
 
73
- def obtain_phenopacket_path_from_pheval_result(
74
- pheval_result_path: Path, phenopacket_paths: list[Path]
75
- ) -> Path:
76
- """
77
- Obtains the phenopacket file name when given a pheval result file name
78
- and a list of full paths of phenopackets to be queried.
79
-
80
- Args:
81
- pheval_result_path (Path): The PhEval result.
82
- phenopacket_paths (list[Path]): List of full paths of phenopackets to be queried.
83
-
84
- Returns:
85
- Path: The matching phenopacket file path from the provided list.
86
- """
87
- pheval_result_path_stem_stripped = pheval_result_path.stem.split("-pheval_")[0]
88
- matching_phenopacket_paths = [
89
- phenopacket_path
90
- for phenopacket_path in phenopacket_paths
91
- if phenopacket_path.stem == pheval_result_path_stem_stripped
92
- ]
93
- if matching_phenopacket_paths:
94
- return matching_phenopacket_paths[0]
95
- else:
96
- raise FileNotFoundError(
97
- f"Unable to find matching phenopacket file named "
98
- f"{pheval_result_path_stem_stripped}.json for {pheval_result_path.name}"
99
- )
100
-
101
-
102
73
  def ensure_file_exists(*files: str):
103
74
  """Ensures the existence of files passed as parameter
104
75
  Raises:
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes