scout-browser 4.84__py3-none-any.whl → 4.86__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. scout/__version__.py +1 -1
  2. scout/adapter/mongo/base.py +17 -14
  3. scout/adapter/mongo/case.py +20 -1
  4. scout/adapter/mongo/cytoband.py +13 -0
  5. scout/adapter/mongo/filter.py +36 -1
  6. scout/adapter/mongo/hgnc.py +1 -1
  7. scout/adapter/mongo/omics_variant.py +145 -0
  8. scout/adapter/mongo/query.py +13 -3
  9. scout/adapter/mongo/variant.py +10 -4
  10. scout/build/case.py +5 -0
  11. scout/build/variant/variant.py +1 -0
  12. scout/commands/update/genes.py +9 -13
  13. scout/constants/__init__.py +3 -1
  14. scout/constants/case_tags.py +1 -0
  15. scout/constants/clinvar.py +1 -1
  16. scout/constants/file_types.py +31 -0
  17. scout/constants/filters.py +4 -0
  18. scout/constants/indexes.py +30 -13
  19. scout/constants/variant_tags.py +3 -0
  20. scout/demo/643594.clinical.mei.vcf.gz +0 -0
  21. scout/demo/643594.clinical.mei.vcf.gz.tbi +0 -0
  22. scout/demo/643594.config.yaml +4 -0
  23. scout/demo/drop/fraser_top_hits_clinical.tsv +5 -0
  24. scout/demo/drop/outrider_top_hits_clinical.tsv +10 -0
  25. scout/load/hgnc_gene.py +39 -6
  26. scout/load/setup.py +4 -4
  27. scout/models/case/case_loading_models.py +25 -2
  28. scout/models/omics_variant.py +227 -0
  29. scout/parse/hgnc.py +1 -0
  30. scout/parse/omics_variant/__init__.py +11 -0
  31. scout/parse/omics_variant/drop.py +19 -0
  32. scout/parse/variant/callers.py +6 -3
  33. scout/parse/variant/frequency.py +10 -2
  34. scout/parse/variant/transcript.py +1 -1
  35. scout/parse/variant/variant.py +10 -4
  36. scout/server/app.py +4 -1
  37. scout/server/blueprints/alignviewers/controllers.py +35 -24
  38. scout/server/blueprints/alignviewers/templates/alignviewers/igv_sashimi_viewer.html +19 -15
  39. scout/server/blueprints/alignviewers/templates/alignviewers/igv_viewer.html +45 -5
  40. scout/server/blueprints/alignviewers/templates/alignviewers/utils.html +1 -1
  41. scout/server/blueprints/alignviewers/views.py +10 -2
  42. scout/server/blueprints/cases/controllers.py +18 -1
  43. scout/server/blueprints/cases/templates/cases/case.html +28 -10
  44. scout/server/blueprints/cases/templates/cases/case_report.html +2 -17
  45. scout/server/blueprints/cases/templates/cases/collapsible_actionbar.html +1 -1
  46. scout/server/blueprints/cases/templates/cases/gene_panel.html +27 -41
  47. scout/server/blueprints/cases/templates/cases/phenotype.html +8 -5
  48. scout/server/blueprints/cases/templates/cases/utils.html +27 -4
  49. scout/server/blueprints/clinvar/controllers.py +9 -3
  50. scout/server/blueprints/dashboard/controllers.py +44 -13
  51. scout/server/blueprints/dashboard/static/charts.js +46 -36
  52. scout/server/blueprints/dashboard/templates/dashboard/dashboard_general.html +2 -2
  53. scout/server/blueprints/institutes/forms.py +2 -0
  54. scout/server/blueprints/institutes/templates/overview/cases.html +6 -4
  55. scout/server/blueprints/institutes/templates/overview/gene_variants.html +40 -27
  56. scout/server/blueprints/institutes/templates/overview/institute_sidebar.html +1 -1
  57. scout/server/blueprints/institutes/views.py +5 -12
  58. scout/server/blueprints/omics_variants/__init__.py +1 -0
  59. scout/server/blueprints/omics_variants/controllers.py +122 -0
  60. scout/server/blueprints/omics_variants/templates/omics_variants/outliers.html +262 -0
  61. scout/server/blueprints/omics_variants/views.py +106 -0
  62. scout/server/blueprints/panels/controllers.py +1 -7
  63. scout/server/blueprints/panels/templates/panels/panels.html +12 -4
  64. scout/server/blueprints/panels/views.py +9 -11
  65. scout/server/blueprints/variant/templates/variant/buttons.html +7 -2
  66. scout/server/blueprints/variant/templates/variant/str-variant-reviewer.html +1 -1
  67. scout/server/blueprints/variant/templates/variant/utils.html +1 -1
  68. scout/server/blueprints/variant/utils.py +54 -103
  69. scout/server/blueprints/variant/views.py +1 -0
  70. scout/server/blueprints/variants/controllers.py +1 -4
  71. scout/server/blueprints/variants/forms.py +42 -0
  72. scout/server/blueprints/variants/templates/variants/utils.html +8 -4
  73. scout/server/blueprints/variants/views.py +28 -7
  74. scout/server/config.py +4 -0
  75. scout/server/extensions/clinvar_extension.py +7 -7
  76. scout/server/links.py +2 -2
  77. scout/server/templates/bootstrap_global.html +1 -4
  78. scout/server/templates/utils.html +4 -4
  79. scout/server/utils.py +4 -1
  80. {scout_browser-4.84.dist-info → scout_browser-4.86.dist-info}/METADATA +11 -11
  81. {scout_browser-4.84.dist-info → scout_browser-4.86.dist-info}/RECORD +85 -75
  82. {scout_browser-4.84.dist-info → scout_browser-4.86.dist-info}/WHEEL +1 -1
  83. {scout_browser-4.84.dist-info → scout_browser-4.86.dist-info}/LICENSE +0 -0
  84. {scout_browser-4.84.dist-info → scout_browser-4.86.dist-info}/entry_points.txt +0 -0
  85. {scout_browser-4.84.dist-info → scout_browser-4.86.dist-info}/top_level.txt +0 -0
@@ -12,23 +12,48 @@ INDEXES = {
12
12
  IndexModel(
13
13
  [("build", ASCENDING), ("chromosome", ASCENDING)],
14
14
  name="build_chromosome",
15
- background=True,
16
15
  ),
17
16
  IndexModel(
18
17
  [("build", ASCENDING), ("hgnc_id", ASCENDING)],
19
18
  name="build_hgncid",
20
- background=True,
21
19
  ),
22
20
  IndexModel(
23
21
  [("build", ASCENDING), ("aliases", ASCENDING)],
24
22
  name="build_aliases",
25
- background=True,
26
23
  ),
27
24
  IndexModel(
28
25
  [("build", ASCENDING), ("hgnc_symbol", ASCENDING)],
29
26
  name="build_hgnc_symbol",
30
27
  ),
31
28
  ],
29
+ "omics_variant": [
30
+ IndexModel(
31
+ # Clear text variant id index
32
+ [
33
+ ("omics_variant_id", ASCENDING),
34
+ ],
35
+ name="omics_variant_id",
36
+ ),
37
+ IndexModel(
38
+ # Index for searching across cases for a change in given genes
39
+ [
40
+ ("hgnc_ids", ASCENDING),
41
+ ("sub_category", ASCENDING),
42
+ ("variant_type", ASCENDING),
43
+ ],
44
+ name="hgnc_ids_sub_category_variant_type",
45
+ ),
46
+ IndexModel(
47
+ # Filterish index
48
+ [
49
+ ("case_id", ASCENDING),
50
+ ("variant_type", ASCENDING),
51
+ ("sub_category", ASCENDING),
52
+ ("hgnc_ids", ASCENDING),
53
+ ],
54
+ name="case_id_variant_type_sub_category_hgnc_ids",
55
+ ),
56
+ ],
32
57
  "variant": [
33
58
  IndexModel(
34
59
  [
@@ -39,7 +64,6 @@ INDEXES = {
39
64
  ("hgnc_ids", ASCENDING),
40
65
  ],
41
66
  name="caseid_category_varianttype_variantrank_hgncids",
42
- background=True,
43
67
  ),
44
68
  IndexModel(
45
69
  [
@@ -49,8 +73,7 @@ INDEXES = {
49
73
  ("variant_type", ASCENDING),
50
74
  ],
51
75
  name="hgncsymbol_rankscore_category_varianttype",
52
- background=True,
53
- partialFilterExpression={"rank_score": {"$gt": 5}, "category": "snv"},
76
+ partialFilterExpression={"rank_score": {"$gte": 5}},
54
77
  ),
55
78
  IndexModel(
56
79
  [
@@ -59,7 +82,6 @@ INDEXES = {
59
82
  ("category", ASCENDING),
60
83
  ],
61
84
  name="variantid_caseid_category",
62
- background=True,
63
85
  ),
64
86
  IndexModel(
65
87
  [
@@ -69,7 +91,6 @@ INDEXES = {
69
91
  ("rank_score", ASCENDING),
70
92
  ],
71
93
  name="category_caseid_varianttype_rankscore",
72
- background=True,
73
94
  ),
74
95
  IndexModel(
75
96
  [
@@ -81,18 +102,16 @@ INDEXES = {
81
102
  ("end", ASCENDING),
82
103
  ],
83
104
  name="caseid_category_chromosome_start_end",
84
- background=True,
85
105
  ),
86
106
  IndexModel(
87
107
  [("variant_id", ASCENDING), ("institute", ASCENDING)],
88
108
  name="variant_id_institute",
89
- background=True,
90
109
  ),
91
110
  ],
92
111
  "hpo_term": [
93
112
  IndexModel([("description", ASCENDING)], name="description"),
94
113
  IndexModel([("description", TEXT)], default_language="english", name="description_text"),
95
- IndexModel([("hpo_number", ASCENDING)], name="number", background=True),
114
+ IndexModel([("hpo_number", ASCENDING)], name="number"),
96
115
  ],
97
116
  "event": [
98
117
  IndexModel(
@@ -115,14 +134,12 @@ INDEXES = {
115
134
  IndexModel(
116
135
  [("build", ASCENDING), ("hgnc_id", ASCENDING), ("length", DESCENDING)],
117
136
  name="hgncid_length",
118
- background=True,
119
137
  )
120
138
  ],
121
139
  "exon": [
122
140
  IndexModel(
123
141
  [("build", ASCENDING), ("hgnc_id", ASCENDING)],
124
142
  name="build_hgncid",
125
- background=True,
126
143
  )
127
144
  ],
128
145
  "case": [
@@ -37,6 +37,8 @@ FEATURE_TYPES = (
37
37
 
38
38
  SV_TYPES = ("ins", "del", "dup", "cnv", "inv", "bnd")
39
39
 
40
+ OUTLIER_TYPES = ("splicing", "expression")
41
+
40
42
  GENETIC_MODELS = (
41
43
  ("AR_hom", "Autosomal Recessive Homozygote"),
42
44
  ("AR_hom_dn", "Autosomal Recessive Homozygote De Novo"),
@@ -519,4 +521,5 @@ VARIANTS_TARGET_FROM_CATEGORY = {
519
521
  "snv": "variants.variants",
520
522
  "str": "variants.str_variants",
521
523
  "fusion": "variants.fusion_variants",
524
+ "outlier": "omics_variants.outliers",
522
525
  }
Binary file
Binary file
@@ -125,6 +125,10 @@ vcf_snv_research: scout/demo/643594.research.vcf.gz
125
125
  vcf_sv_research: scout/demo/643594.research.SV.vcf.gz
126
126
  vcf_mei_research: scout/demo/643594.research.mei.vcf.gz
127
127
 
128
+ omics_files:
129
+ fraser: scout/demo/drop/fraser_top_hits_clinical.tsv
130
+ outrider: scout/demo/drop/outrider_top_hits_clinical.tsv
131
+
128
132
  smn_tsv: scout/demo/643594.solo.smn.tsv
129
133
 
130
134
  madeline: scout/demo/madeline.xml
@@ -0,0 +1,5 @@
1
+ hgnc_id geneID hgncSymbol gene_type gene_name_orig sampleID seqnames start end width strand type pValue psiValue deltaPsi counts totalCounts meanCounts meanTotalCounts nonsplitCounts nonsplitProportion nonsplitProportion_99quantile annotatedJunction pValueGene padjustGene PAIRED_END DNA_ID DROP_GROUP SPLICE_COUNTS_DIR HPO_TERMS GENE_COUNTS_FILE GENE_ANNOTATION GENOME isExternal potentialImpact causesFrameshift UTR_overlap blacklist
2
+ 2439 ENSG00000119535.18 CSF3R protein_coding CSF3R ADM1059A2 chr1 36479517 Imp 1961 - jaccard 1.6652e-06 0.49 -0.39 127 258 4237.77 4570.55 3 0.01 0.02 both 9.9912e-06 0.013423 True outrider,fraser False annotatedIntron_reducedUsage unlikely 5'-UTR False
3
+ 4831 ENSG00000213934.9 HBG1 protein_coding HBG1 ADM1059A2 chr11 5248488 5254291 5804 - jaccard 1.918e-12 0.36 0.35 35 96 22.84 6902.91 0 0.0 0.0 end 5.7541e-12 1.2885e-08 True outrider,fraser False exonSkipping inconclusive 3'-UTR False
4
+ 4832 ENSG00000196565.15 HBG2 protein_coding HBG2 ADM1059A2 chr11 5248488 5254291 5804 - jaccard 1.918e-12 0.36 0.35 35 96 22.84 6902.91 0 0.0 0.0 end 3.836e-12 1.2885e-08 True outrider,fraser False exonSkipping inconclusive 3'-UTR False
5
+ 17284 ENSG00000213934.9 POT1 protein_coding POT1 ADM1059A2 chr7 124532319 124532434 115 - jaccard 1.918e-12 0.36 0.35 35 96 22.84 6902.91 0 0.0 0.0 end 5.7541e-12 1.2885e-08 True outrider,fraser False exonSkipping inconclusive 3'-UTR False
@@ -0,0 +1,10 @@
1
+ hgnc_id seqnames start end strand geneID hgncSymbol gene_type gene_name_orig sampleID pValue padjust zScore l2fc rawcounts normcounts meanCorrected theta aberrant AberrantBySample AberrantByGene padj_rank FDR_set foldChange
2
+ 25415 chr4 88257620 88284769 - ENSG00000163644.15 PPM1K protein_coding PPM1K ADM1059A2 0.0016124374690447165 1.0 -5.92 -0.9 27 317.46 601.46 139.77 False 4.0 0.0 6110.0 transcriptome-wide 0.54
3
+ 10019 chr6 3063824 3115187 + ENSG00000137275.16 RIPK1 protein_coding RIPK1 ADM1059A2 0.0009997468998232232 1.0 6.33 0.53 104 1891.09 1308.4 547.35 False 4.0 0.0 6110.0 transcriptome-wide 1.44
4
+ 4827 chr11 5225464 5229395 - ENSG00000244734.4 HBB protein_coding HBB ADM1059A2 3.417496739472308e-25 4.169327890311004e-20 -12.61 -10.14 61 2625.42 3016434.64 9.21 True 4.0 1.0 1.0 transcriptome-wide 0.0
5
+ 4831 chr11 5248269 5249857 - ENSG00000213934.9 HBG1 protein_coding HBG1 ADM1059A3 0.0016205310428587193 1.0 2.23 2.86 73 151.01 19.19 0.91 False 4.0 0.0 6110.0 transcriptome-wide 7.26
6
+ 16860 chr12 108522214 108561400 - ENSG00000075856.12 SART3 protein_coding SART3 ADM1059A2 0.0014636643867152977 1.0 7.02 0.51 97 1412.71 989.2 1000.0 False 4.0 0.0 6110.0 transcriptome-wide 1.42
7
+ 4824 chr16 172876 173710 + ENSG00000188536.13 HBA2 protein_coding HBA2 ADM1059A2 2.15484140654196e-24 1.2729742965811128e-19 -12.53 -10.01 262 6781.29 7019330.9 8.97 True 4.0 1.0 2.5 transcriptome-wide 0.0
8
+ 4823 chr16 176680 177522 + ENSG00000206172.8 HBA1 protein_coding HBA1 ADM1059A2 3.1302782768232926e-24 1.2729742965811128e-19 -12.5 -9.97 246 6497.61 6540277.08 8.95 True 4.0 1.0 2.5 transcriptome-wide 0.0
9
+ 9543 chr17 4796144 4798502 + ENSG00000142507.10 PSMB6 protein_coding PSMB6 ADM1059A2 0.0010771639306525651 1.0 -10.54 -1.14 13 342.89 783.82 1000.0 False 4.0 0.0 6110.0 transcriptome-wide 0.45
10
+ 17284 chr7 124532319 124532434 - ENSG00000213934.9 POT1 protein_coding POT1 ADM1059A2 0.0016205310428587193 1.0 2.23 2.86 73 151.01 19.19 0.91 False 4.0 0.0 6110.0 transcriptome-wide 7.26
scout/load/hgnc_gene.py CHANGED
@@ -1,5 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
  import logging
3
+ from typing import Dict
3
4
 
4
5
  from click import progressbar
5
6
 
@@ -16,6 +17,22 @@ from scout.utils.scout_requests import (
16
17
  LOG = logging.getLogger(__name__)
17
18
 
18
19
 
20
def set_missing_gene_coordinates(gene_data: dict, cytoband_coords: Dict[str, dict]) -> None:
    """Fill in approximate coordinates from the cytoband for genes lacking an Ensembl ID.

    Genes that carry a non-empty ``ensembl_gene_id`` are left untouched.
    For all other genes ``ensembl_gene_id`` is normalised to ``None`` and, when the
    gene's ``location`` (cytoband) is a key of ``cytoband_coords``, the gene's
    ``chromosome``, ``start`` and ``end`` are copied from that entry.

    Args:
        gene_data: parsed gene information; modified in place.
        cytoband_coords: maps a cytoband name to a dict with the keys
            ``chromosome``, ``start`` and ``stop``.
    """
    if gene_data.get("ensembl_gene_id") not in ("", None):
        # Coordinates are present, since they're collected from the Ensembl file
        return

    gene_data["ensembl_gene_id"] = None

    # Use .get() so that a gene record without any cytoband is skipped
    # instead of aborting the whole gene load with a KeyError.
    cytoband_coord = cytoband_coords.get(gene_data.get("location"))
    if cytoband_coord:
        gene_data["chromosome"] = cytoband_coord["chromosome"]
        gene_data["start"] = cytoband_coord["start"]
        gene_data["end"] = cytoband_coord["stop"]
+
35
+
19
36
  def load_hgnc_genes(
20
37
  adapter,
21
38
  genes=None,
@@ -36,7 +53,7 @@ def load_hgnc_genes(
36
53
  Args:
37
54
  adapter(scout.adapter.MongoAdapter)
38
55
  genes(dict): If genes are already parsed
39
- ensembl_lines(iterable(str)): Lines formated with ensembl gene information
56
+ ensembl_lines(iterable(str)): Lines formatted with ensembl gene information
40
57
  hgnc_lines(iterable(str)): Lines with gene information from genenames.org
41
58
  exac_lines(iterable(str)): Lines with information pLi-scores from ExAC
42
59
  mim2gene(iterable(str)): Lines with map from omim id to gene symbol
@@ -78,20 +95,36 @@ def load_hgnc_genes(
78
95
  genemap_lines=genemap_lines,
79
96
  )
80
97
 
81
- non_existing = 0
98
+ without_coords = 0
82
99
  nr_genes = len(genes)
83
100
  LOG.info(f"Building info for {nr_genes} genes")
101
+
102
+ cytoband_coords: Dict[str, dict] = adapter.cytoband_to_coordinates(build=build)
103
+
84
104
  with progressbar(genes.values(), label="Building genes", length=nr_genes) as bar:
85
105
  for gene_data in bar:
106
+ set_missing_gene_coordinates(gene_data=gene_data, cytoband_coords=cytoband_coords)
107
+
86
108
  if not gene_data.get("chromosome"):
87
- non_existing += 1
109
+ without_coords += 1
88
110
  continue
111
+ gene_obj = build_hgnc_gene(
112
+ gene_data,
113
+ build=build,
114
+ )
89
115
 
90
- gene_obj = build_hgnc_gene(gene_data, build=build)
91
- gene_objects.append(gene_obj)
116
+ if gene_obj:
117
+ gene_objects.append(gene_obj)
118
+ else:
119
+ without_coords += 1
92
120
 
93
- LOG.info("Nr of genes without coordinates in build %s: %s", build, non_existing)
121
+ LOG.info(
122
+ "Nr of genes without coordinates in build %s and therefore skipped: %s",
123
+ build,
124
+ without_coords,
125
+ )
94
126
  LOG.info(f"Loading {len(gene_objects)} genes into the database")
127
+
95
128
  adapter.load_hgnc_bulk(gene_objects)
96
129
 
97
130
  LOG.info("Loading done. %s genes loaded", len(gene_objects))
scout/load/setup.py CHANGED
@@ -51,12 +51,12 @@ def setup_scout(
51
51
 
52
52
  WARNING: If the instance is populated all collections will be deleted
53
53
 
54
- Build insert a institute and an admin user.
55
- There are multiple sources of information that is used by scout and that needs to exist for
56
- scout to work proper.
54
+ Build and insert an institute and an admin user.
57
55
 
56
+ Multiple sources of information that are used by scout need to exist for
57
+ scout to work properly.
58
58
  Genes:
59
- Scout uses HGNC as the source for gene identifiers en ensembl as source for coordinates.
59
+ Scout uses HGNC as the source for gene identifiers and ENSEMBL as source for coordinates.
60
60
  Additional information of disease connections for genes is fetched from OMIM.
61
61
  Link between hpo terms and genes is fetched from HPO
62
62
  For more details check the documentation.
@@ -15,7 +15,7 @@ except ImportError:
15
15
 
16
16
  from pydantic import BaseModel, Field, field_validator, model_validator
17
17
 
18
- from scout.constants import ANALYSIS_TYPES, FILE_TYPE_MAP
18
+ from scout.constants import ANALYSIS_TYPES, FILE_TYPE_MAP, OMICS_FILE_TYPE_MAP
19
19
  from scout.exceptions import PedigreeError
20
20
  from scout.utils.date import get_date
21
21
 
@@ -41,15 +41,17 @@ CASE_FILE_PATH_CHECKS = [
41
41
  "cnv_report",
42
42
  "coverage_qc_report",
43
43
  "delivery_report",
44
+ "exe_ver",
45
+ "fraser_tsv",
44
46
  "gene_fusion_report",
45
47
  "gene_fusion_report_research",
46
48
  "madeline_info",
47
49
  "multiqc",
48
50
  "multiqc_rna",
51
+ "outrider_tsv",
49
52
  "peddy_ped",
50
53
  "peddy_ped_check",
51
54
  "peddy_sex_check",
52
- "exe_ver",
53
55
  "smn_tsv",
54
56
  "reference_info",
55
57
  "RNAfusion_inspector",
@@ -59,6 +61,7 @@ CASE_FILE_PATH_CHECKS = [
59
61
  ]
60
62
 
61
63
  VCF_FILE_PATH_CHECKS = FILE_TYPE_MAP.keys()
64
+ OMICS_FILE_PATH_CHECKS = OMICS_FILE_TYPE_MAP.keys()
62
65
 
63
66
  GENOME_BUILDS = ["37", "38"]
64
67
  TRACKS = ["rare", "cancer"]
@@ -153,6 +156,25 @@ class Mitodel(BaseModel):
153
156
  ratioppk: Optional[float] = None
154
157
 
155
158
 
159
class OmicsFiles(BaseModel):
    """Paths to the omics files of a case, e.g. RNA outlier tables for aberrant
    splicing (fraser) and aberrant expression (outrider)."""

    fraser: Optional[str] = None
    fraser_research: Optional[str] = None
    outrider: Optional[str] = None
    outrider_research: Optional[str] = None

    @model_validator(mode="before")
    @classmethod
    def validate_file_path(cls, values: Dict) -> Dict:
        """Pass every provided omics file path through ``_resource_abs_path``.

        Only the keys listed in ``OMICS_FILE_PATH_CHECKS`` are considered and empty
        or missing entries are left as they are.
        NOTE(review): ``_resource_abs_path`` presumably verifies that the file exists
        on disk and returns its absolute path — confirm against its definition.
        """
        if not isinstance(values, dict):
            # A "before" validator receives the raw input; leave anything that is
            # not a plain mapping (e.g. an existing model instance) untouched.
            return values

        for item in OMICS_FILE_PATH_CHECKS:
            item_path: Optional[str] = values.get(item)
            if item_path:
                values[item] = _resource_abs_path(item_path)
        return values
176
+
177
+
156
178
  class REViewer(BaseModel):
157
179
  alignment: Optional[str] = None
158
180
  alignment_index: Optional[str] = None
@@ -392,6 +414,7 @@ class CaseLoader(BaseModel):
392
414
  madeline_info: Optional[str] = Field(None, alias="madeline")
393
415
  multiqc: Optional[str] = None
394
416
  multiqc_rna: Optional[str] = None
417
+ omics_files: Optional[OmicsFiles] = None
395
418
  owner: Optional[str] = None
396
419
  peddy_ped: Optional[str] = None # Soon to be deprecated
397
420
  peddy_ped_check: Optional[str] = Field(None, alias="peddy_check") # Soon to be deprecated
@@ -0,0 +1,227 @@
1
+ """ OMICS variant
2
+
3
+ For potentially causative variants that are not yet in ClinVar
4
+ and have not yet been marked causative in any existing case.
5
+
6
+ """
7
+
8
+ import logging
9
+ from datetime import datetime
10
+ from typing import List, Optional
11
+
12
+ from pydantic import BaseModel, Field, field_validator, model_validator
13
+
14
+ LOG = logging.getLogger(__name__)
15
+
16
+
17
class OmicsVariantLoader(BaseModel):
    """Omics variants loader
    OmicsVariants are e.g. RNA expression outliers as identified by the DROP pipeline.

    Variable names are as found in the original files, plus a set common to all mixed in after file parsing,
    but before model validation by this class.

    The serialisation names will be used when dumping the model for e.g. db storage.

    NOTE(review): several ``mode="before"`` model validators below read keys that
    other validators rewrite (``hgncSymbol``, ``end``). Pydantic appears to run
    "before" model validators in reverse definition order, and the name/id
    validators rely on seeing the raw string values — keep the definition order
    unchanged unless this is confirmed.
    """

    case_id: str
    institute: str
    build: str = "38"
    variant_type: str = "clinical"
    category: str  # eg "outlier"
    sub_category: str  # eg "splicing"
    # default_factory: evaluate the timestamp per instance, not once at import time
    date: datetime = Field(default_factory=datetime.now)
    display_name: str
    omics_variant_id: str

    # DROP Fraser and Outrider outlier TSVs

    # sample id is mandatory: each row pertains to one outlier event in one individual as compared to others
    # In the db object, this will be replaced with a "samples" array of individual dict.
    sampleID: str

    # outlier variants must identify the gene they pertain to, primarily with an hgnc_id
    hgnc_ids: Optional[List[int]] = Field(alias="hgnc_id", serialization_alias="hgnc_ids")
    geneID: Optional[str]

    hgnc_symbols: Optional[List[str]] = Field(
        alias="hgncSymbol", serialization_alias="hgnc_symbols"
    )
    gene_name_orig: Optional[str]

    gene_type: Optional[str]

    # coordinates if applicable
    chromosome: Optional[str] = Field(alias="seqnames", serialization_alias="chromosome")
    position: Optional[int] = Field(alias="start", serialization_alias="position")
    end: Optional[int]
    width: Optional[int] = None
    strand: Optional[str] = None

    p_value: Optional[float] = Field(alias="pValue", serialization_alias="p_value", default=None)

    # Fraser specific
    type: Optional[str] = None
    psi_value: Optional[float] = Field(
        alias="psiValue", serialization_alias="psi_value", default=None
    )
    delta_psi: Optional[float] = Field(
        alias="deltaPsi", serialization_alias="delta_psi", default=None
    )
    counts: Optional[int] = None
    total_counts: Optional[int] = Field(
        alias="totalCounts", serialization_alias="total_counts", default=None
    )
    mean_counts: Optional[float] = Field(
        alias="meanCounts", serialization_alias="mean_counts", default=None
    )
    mean_total_counts: Optional[float] = Field(
        alias="meanTotalCounts", serialization_alias="mean_total_counts", default=None
    )
    nonsplit_counts: Optional[int] = Field(
        alias="nonsplitCounts", serialization_alias="nonsplit_counts", default=None
    )
    nonsplit_proportion: Optional[float] = Field(
        alias="nonsplitProportion", serialization_alias="nonsplit_proportion", default=None
    )
    nonsplit_proportion_99quantile: Optional[float] = Field(
        alias="nonsplitProportion_99quantile",
        serialization_alias="nonsplit_proportion_99quantile",
        default=None,
    )
    annotated_junction: Optional[str] = Field(
        alias="annotatedJunction", serialization_alias="annotated_junction", default=None
    )
    p_value_gene: Optional[float] = Field(
        alias="pValueGene", serialization_alias="p_value_gene", default=None
    )
    p_adjust_gene: Optional[float] = Field(
        alias="padjustGene", serialization_alias="p_adjust_gene", default=None
    )
    paired_end: Optional[str] = Field(
        alias="PAIRED_END", serialization_alias="paired_end", default=None
    )
    is_external: Optional[bool] = Field(
        alias="isExternal", serialization_alias="is_external", default=None
    )
    potential_impact: Optional[str] = Field(
        alias="potentialImpact", serialization_alias="potential_impact", default=None
    )
    causes_frameshift: Optional[str] = Field(
        alias="causesFrameshift", serialization_alias="causes_frameshift", default=None
    )
    utr_overlap: Optional[str] = Field(
        alias="UTR_overlap", serialization_alias="utr_overlap", default=None
    )

    # Outrider specific
    padjust: Optional[float] = None
    zscore: Optional[float] = Field(alias="zScore", serialization_alias="zscore", default=None)
    l2fc: Optional[float] = None
    rawcounts: Optional[int] = None
    normcounts: Optional[float] = None
    meanCorrected: Optional[float] = None
    theta: Optional[float] = None
    aberrant: Optional[bool] = None
    aberrant_by_sample: Optional[float] = Field(
        alias="aberrantBySample", serialization_alias="aberrant_by_sample", default=None
    )
    aberrant_by_gene: Optional[float] = Field(
        alias="aberrantByGene", serialization_alias="aberrant_by_gene", default=None
    )
    padj_rank: Optional[float] = None
    fdr_set: Optional[str] = Field(alias="FDR_set", serialization_alias="fdr_set", default=None)
    fold_change: Optional[float] = Field(
        alias="foldChange", serialization_alias="fold_change", default=None
    )

    @field_validator("chromosome")
    @classmethod
    def strip_chr(cls, chrom: Optional[str]) -> Optional[str]:
        """We store chromosome names without a chr prefix internally."""
        if chrom is None:
            return chrom
        # Remove the literal prefix only: str.lstrip("chr") would strip any leading
        # run of the characters c/h/r and mangle names such as "hs37d5".
        return chrom[len("chr") :] if chrom.startswith("chr") else chrom

    @model_validator(mode="before")
    @classmethod
    def accept_capitalised_aberrant_keys(cls, values):
        """Map the capitalised Outrider headers onto the aliases used by this model.

        The Outrider TSV header spells these columns "AberrantBySample" and
        "AberrantByGene", while the field aliases start with a lower case letter.
        Aliases are matched case sensitively, so without this step the two columns
        would be dropped. Explicitly provided lower case keys take precedence.
        """
        for header, alias in (
            ("AberrantBySample", "aberrantBySample"),
            ("AberrantByGene", "aberrantByGene"),
        ):
            if header in values and alias not in values:
                values[alias] = values[header]
        return values

    @model_validator(mode="before")
    @classmethod
    def ensure_end(cls, values):
        """End is not always set, but sometimes width is.
        Sometimes Imp is given as end. Worst case we default to width 1.

        The fallback (start + width) is only computed when it is needed, so rows
        with a usable end are not rejected because of a missing start or width.
        """
        end = values.get("end")

        if isinstance(end, str) and end.isdigit():
            values["end"] = int(end)
            return values

        if "end" not in values or end == "Imp":  # "Imp": imprecise?
            start = values.get("start")
            if start is None:
                # No start to base a guess on; position is optional, so is end.
                values["end"] = None
            else:
                values["end"] = int(start) + int(values.get("width", 1))

        return values

    @model_validator(mode="before")
    @classmethod
    def genes_become_lists(cls, values):
        """HGNC ids and gene symbols are found one on each line in DROP tsvs.
        Convert to a list with a single member in omics_variants for storage."""

        if "hgnc_id" in values:
            values["hgnc_id"] = [int(values.get("hgnc_id"))]

        if "hgncSymbol" in values:
            values["hgncSymbol"] = [str(values.get("hgncSymbol"))]

        return values

    @model_validator(mode="before")
    @classmethod
    def set_display_name(cls, values) -> dict:
        """Set a free text qualification, depending on the kind of variant."""

        values["display_name"] = "_".join(
            [
                values.get("hgncSymbol"),
                values.get("category"),
                values.get("sub_category"),
                get_qualification(values=values),
                values.get("seqnames"),  # chrom, unserialised
                str(values.get("start")),
                str(values.get("end")),
                values.get("variant_type"),
            ]
        )
        return values

    @model_validator(mode="before")
    @classmethod
    def set_omics_variant_id(cls, values) -> dict:
        """Set OMICS variant id based on the kind of variant."""

        values["omics_variant_id"] = "_".join(
            [
                values.get("seqnames"),  # chrom, unserialised
                str(values.get("start")),
                str(values.get("end")),
                values.get("build"),
                values.get("hgncSymbol"),
                values.get("sub_category"),
                get_qualification(values=values),
                values.get("variant_type"),
            ]
        )
        return values

    @model_validator(mode="before")
    @classmethod
    def set_sample_display_name(cls, values) -> dict:
        """Set a display name, falling back on the sample name or individual id.

        NOTE(review): ``set_display_name`` unconditionally assigns ``display_name``
        as well, so the value set here is presumably overwritten — confirm.
        """
        values["display_name"] = values.get(
            "display_name", values.get("sample_name", values.get("individual_id"))
        )
        return values
216
+
217
+
218
def get_qualification(values: dict) -> str:
    """Get qualification string for ID and display name.

    This string further qualifies the kind of omics event:

    - "expression" outliers are qualified "up" when ``zScore`` is above zero,
      otherwise "down" (a missing ``zScore`` counts as 0).
    - "splicing" outliers are qualified by their ``potentialImpact``.
    - anything else, including a splicing row without ``potentialImpact``, is
      qualified "affected".

    A string is always returned, so callers can safely ``"_".join`` the result.
    """
    sub_category = values.get("sub_category")

    if sub_category == "splicing":
        impact = values.get("potentialImpact")
        # Fall back on the generic qualification rather than returning None
        return "affected" if impact is None else impact

    if sub_category == "expression":
        return "up" if float(values.get("zScore", 0)) > 0 else "down"

    return "affected"
scout/parse/hgnc.py CHANGED
@@ -24,6 +24,7 @@ def parse_hgnc_line(line, header):
24
24
  hgnc_gene["hgnc_symbol"] = hgnc_symbol
25
25
  hgnc_gene["hgnc_id"] = int(raw_info["hgnc_id"].split(":")[-1])
26
26
  hgnc_gene["description"] = raw_info["name"]
27
+ hgnc_gene["location"] = raw_info["location"] # cytoband
27
28
 
28
29
  # We want to have the current symbol as an alias
29
30
  aliases = set([hgnc_symbol, hgnc_symbol.upper()])
@@ -0,0 +1,11 @@
1
+ from typing import Dict, Iterable, List
2
+
3
+ from .drop import parse_omics_tsv
4
+
5
+ OMICS_CATEGORY_PARSER = {"tsv": parse_omics_tsv}
6
+
7
+
8
def parse_omics_file(omics_lines: Iterable[str], omics_file_type: dict) -> List[Dict[str, str]]:
    """Call appropriate parser for omics variants file, depending on the file format anticipated.

    Args:
        omics_lines: the lines of the omics file.
        omics_file_type: file type description; its "format" entry selects the
            parser from OMICS_CATEGORY_PARSER.

    Returns:
        Whatever the selected parser returns for ``omics_lines``.

    Raises:
        KeyError: if no parser is registered for the given format.
    """
    file_format = omics_file_type.get("format")
    try:
        parser = OMICS_CATEGORY_PARSER[file_format]
    except KeyError:
        # Same exception type as a plain lookup, but say what went wrong
        raise KeyError(
            f"No omics parser available for file format {file_format!r}; "
            f"supported formats: {sorted(OMICS_CATEGORY_PARSER)}"
        ) from None
    return parser(omics_lines)
@@ -0,0 +1,19 @@
1
+ from typing import Dict, Iterable, List
2
+
3
+
4
def parse_omics_tsv(lines: Iterable[str]) -> List[Dict[str, str]]:
    """Parse a DROP Outrider or Fraser TSV file.

    The first non-blank line is taken as the tab separated header. Every following
    non-blank line becomes one dict mapping header names to the string values of
    that line. Blank lines are skipped.

    Trailing whitespace, including trailing tabs, is stripped from each line, so
    empty columns at the end of a row are absent from its dict rather than
    present as empty strings.
    """
    omics_infos = []
    header = None

    for line in lines:
        line = line.rstrip()
        if not line:
            # e.g. an empty line at the end of the file; not an outlier row
            continue

        fields = line.split("\t")
        if header is None:
            header = fields
            continue

        omics_infos.append(dict(zip(header, fields)))

    return omics_infos
@@ -22,6 +22,7 @@ def parse_callers(variant, category="snv"):
22
22
  """
23
23
  relevant_callers = CALLERS[category]
24
24
  callers = {caller["id"]: None for caller in relevant_callers}
25
+ callers_keys = set(callers.keys())
25
26
 
26
27
  other_info = variant.INFO.get("FOUND_IN")
27
28
  svdb_origin = variant.INFO.get("svdb_origin")
@@ -30,10 +31,12 @@ def parse_callers(variant, category="snv"):
30
31
  if other_info:
31
32
  for info in other_info.split(","):
32
33
  called_by = info.split("|")[0]
33
- callers[called_by] = "Pass"
34
+ if called_by in callers_keys:
35
+ callers[called_by] = "Pass"
34
36
  elif svdb_origin:
35
37
  for called_by in svdb_origin.split("|"):
36
- callers[called_by] = "Pass"
38
+ if called_by in callers_keys:
39
+ callers[called_by] = "Pass"
37
40
  elif raw_info:
38
41
  info = raw_info.split("-")
39
42
  for call in info:
@@ -47,7 +50,7 @@ def parse_callers(variant, category="snv"):
47
50
  for caller in callers:
48
51
  if caller in call:
49
52
  callers[caller] = "Filtered"
50
- elif call in set(callers.keys()):
53
+ elif call in callers_keys:
51
54
  callers[call] = "Pass"
52
55
 
53
56
  if raw_info or svdb_origin or other_info:
@@ -11,8 +11,14 @@ EXAC_KEYS = ["EXACAF"]
11
11
  EXAC_MAX_KEYS = ["ExAC_MAX_AF", "EXAC_MAX_AF"]
12
12
 
13
13
  # gnomAD has both SNV and SV
14
- GNOMAD_INFO_KEYS = ["GNOMADAF", "GNOMAD_AF", "gnomADg_AF", "gnomad_svAF"]
15
- GNOMAD_INFO_MAX_KEYS = ["gnomADg_AF_POPMAX", "GNOMADAF_popmax", "GNOMADAF_POPMAX", "GNOMADAF_MAX"]
14
+ GNOMAD_INFO_KEYS = ["GNOMADAF", "GNOMAD_AF", "gnomADg_AF", "gnomad_svAF", "gnomad_af"]
15
+ GNOMAD_INFO_MAX_KEYS = [
16
+ "gnomADg_AF_POPMAX",
17
+ "GNOMADAF_popmax",
18
+ "GNOMADAF_POPMAX",
19
+ "GNOMADAF_MAX",
20
+ "gnomad_popmax_af",
21
+ ]
16
22
 
17
23
  # SV
18
24
  CLINGEN_BENIGN_KEYS = [
@@ -69,6 +75,7 @@ def parse_frequencies(variant, transcripts):
69
75
  # These are SV-specific frequencies
70
76
  update_frequency_from_vcf(frequencies, variant, ["left_1000GAF"], "thousand_g_left")
71
77
  update_frequency_from_vcf(frequencies, variant, ["right_1000GAF"], "thousand_g_right")
78
+ update_frequency_from_vcf(frequencies, variant, ["colorsdb_af"], "colorsdb_af")
72
79
 
73
80
  # Search transcripts CSQ if not found in VCF INFO
74
81
  if not frequencies:
@@ -117,6 +124,7 @@ def parse_sv_frequencies(variant: cyvcf2.Variant) -> Dict:
117
124
  update_sv_frequency_from_vcf(sv_frequencies, variant, SWEGEN_KEYS, "swegen")
118
125
  update_sv_frequency_from_vcf(sv_frequencies, variant, DECIPHER_KEYS, "decipher")
119
126
  update_sv_frequency_from_vcf(sv_frequencies, variant, CG_KEYS, "clingen_mip")
127
+ update_sv_frequency_from_vcf(sv_frequencies, variant, ["colorsdb_af"], "colorsdb_af")
120
128
 
121
129
  return sv_frequencies
122
130