@sjcrh/proteinpaint-shared 2.78.0-0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/vcf.js ADDED
@@ -0,0 +1,629 @@
1
+ import { mclass } from './common.js'
2
+ import { dissect_INFO } from './vcf.info.js'
3
+ import { parse_CSQ } from './vcf.csq.js'
4
+ import { parse_ANN } from './vcf.ann.js'
5
+ import { getVariantType } from './vcf.type.js'
6
+
7
+ /*
8
+ Only for parsing vcf files
9
+ is not involved in creating vcf tracks
10
+
11
+ shared between client-server
12
+ */
13
+
14
// Recognizes a symbolic ALT allele such as <*:DEL>; capture group 1 is the text between the angle brackets.
const getallelename = /<(.+)>/

// Reverse lookup table: upper-cased mclass label -> mclass key.
const mclasslabel2key = {}
for (const key in mclass) {
	const label = mclass[key].label
	mclasslabel2key[label.toUpperCase()] = key
}
21
+
22
/**
 * Parse the meta lines and the header line of a VCF file.
 *
 * Lines not starting with "#" are ignored. "##INFO" and "##FORMAT" lines are
 * parsed with tohash(); the line starting with "#C" is treated as the column
 * header and every column after the 9th becomes a sample name.
 *
 * When INFO has a CSQ or ANN entry, the field names embedded in its
 * Description are extracted into info.CSQ.csqheader / info.ANN.annheader as
 * arrays of { name }.
 *
 * @param {string[]} lines - lines of the file, already split by linebreak
 * @returns {Array} [info, format, samples, errors]
 *   info    - {ID: {...}} or null when no INFO line parsed cleanly
 *   format  - {ID: {...}} or null when no FORMAT line parsed cleanly
 *   samples - array of { name }
 *   errors  - array of error strings, or null when there was none
 */
export function vcfparsemeta(lines) {
	let sample = []
	const errlst = []
	const info = {}
	const format = {}
	let hasinfo = false
	let hasformat = false

	for (const line of lines) {
		if (!line.startsWith('#')) {
			continue
		}

		if (line.startsWith('#C')) {
			// header line, sample names start at the 10th column
			sample = line.split('\t').slice(9)
			continue
		}

		if (line.startsWith('##INFO')) {
			// drop the leading "##INFO=<" and the last character of the line
			const e = tohash(line.substring(8, line.length - 1), info)
			if (e) {
				errlst.push('INFO error: ' + e)
			} else {
				hasinfo = true
			}
			continue
		}

		if (line.startsWith('##FORMAT')) {
			// drop the leading "##FORMAT=<" and the last character of the line
			const e = tohash(line.substring(10, line.length - 1), format)
			if (e) {
				errlst.push('FORMAT error: ' + e)
			} else {
				hasformat = true
			}
		}
	}

	// one object per sample, so that more attributes can be attached later
	const sampleobjlst = sample.map(samplename => ({ name: samplename }))

	// reserved INFO fields
	if (info.CSQ) {
		// field names follow " Format: " and are joined by "|"
		// a missing Description is reported as an error instead of throwing
		const formatpart = (info.CSQ.Description || '').split(' Format: ')[1]
		const names = formatpart ? formatpart.split('|') : []
		if (names.length > 1) {
			// keep csq headers as objects to allow configuring show/hide of csq fields
			info.CSQ.csqheader = names.map(name => ({ name }))
		} else {
			errlst.push('unknown format for CSQ header: ' + info.CSQ.Description)
		}
	}

	if (info.ANN) {
		// field names are enclosed in single quotes and joined by " | "
		const quoted = (info.ANN.Description || '').split("'")[1]
		if (quoted) {
			const names = quoted.split(' | ')
			// split() always yields at least one element, so require more than one
			// for the " | " joined check to be meaningful (same rule as CSQ above)
			if (names.length > 1) {
				info.ANN.annheader = names.map(name => ({ name }))
			} else {
				errlst.push('no " | " joined annotation fields for ANN (snpEff annotation): ' + info.ANN.Description)
			}
		} else {
			errlst.push('no single-quote enclosed annotation fields for ANN (snpEff annotation): ' + info.ANN.Description)
		}
	}

	return [hasinfo ? info : null, hasformat ? format : null, sampleobjlst, errlst.length ? errlst : null]
}
118
+
119
/**
 * Parse one tab-delimited VCF data line into one variant object per ALT allele.
 *
 * @param {string} line - a VCF data line
 * @param {object} vcf - as produced from vcfparsemeta()
 *   .nochr   BOOL, when true "chr" is prepended to the chromosome name
 *   .samples [ {name} ]
 *   .info    {}
 *   .format  {}
 * @returns {Array} [error, mlst, altinvalid]
 *   error      - message string or null; on a malformed line mlst and altinvalid are null
 *   mlst       - [ m ], one m per alt allele, with
 *                chr, pos (0-based), name, type, ref, alt, altstr, sampledata[], info, altinfo
 *   altinvalid - array of "<"-leading alt strings that could not be parsed, or null
 */
export function vcfparseline(line, vcf) {
	const lst = line.split('\t')
	if (lst.length < 8) {
		// no good
		return ['line has less than 8 fields', null, null]
	}

	const chrom = lst[0]
	const id = lst[2]
	const refallele = lst[3]
	const altfield = lst[4]
	const infofield = lst[7]

	// explicit radix so that the position is always read as decimal
	const rawpos = Number.parseInt(lst[1], 10)
	if (!Number.isInteger(rawpos)) {
		return ['invalid value for genomic position', null, null]
	}

	const m = {
		vcf_ID: id,
		chr: (vcf.nochr ? 'chr' : '') + chrom,
		pos: rawpos - 1,
		ref: refallele,
		altstr: altfield,
		alleles: [
			{
				/*
				ref allele only a placeholder, to be removed, this array only contains alt alleles
				it must be present while FORMAT is parsed so that GT allele index works
				*/
				allele: refallele,
				sampledata: []
			}
		],
		info: {}, // locus info, do not contain allele info
		name: id == '.' ? null : id
	}

	// parse alt
	const altinvalid = []
	for (const rawalt of altfield.split(',')) {
		const a = {
			ref: m.ref, // may be corrected just below!
			allele: rawalt,
			// keep original allele for matching with csq which hardcodes original allele
			allele_original: rawalt,
			sampledata: [],
			_m: m,
			info: {} // allele info, do not contain locus info
		}
		// pushed before validation so that allele indices stay aligned with the ALT column
		m.alleles.push(a)

		if (rawalt[0] == '<') {
			// symbolic allele, show text within <> as name
			const hit = rawalt.match(getallelename)
			if (!hit) {
				altinvalid.push(rawalt)
				continue
			}
			a.type = hit[1]
			a.allele = hit[1]
			a.issymbolicallele = true
			continue
		}

		// normal nucleotide: trim bases shared by ref and alt
		const [p, ref, alt] = correctRefAlt(m.pos, m.ref, a.allele)
		a.pos = p
		a.ref = ref
		a.allele = alt
	}

	if (lst[8] && lst[9]) {
		parse_FORMAT2(lst, m, vcf)
	}

	/*
	remove ref allele so it only contain alternative alleles
	so that parse_INFO can safely apply Number=A fields to m.alleles
	*/
	m.alleles.shift()

	// info; a "." column means no info, represented as an empty object like the initial m.info
	const tmp = infofield == '.' ? {} : dissect_INFO(infofield)
	let badinfokeys = []

	if (vcf.info) {
		badinfokeys = parse_INFO(tmp, m, vcf)
	} else {
		// vcf meta lines told nothing about INFO, do not parse
		m.info = tmp
	}

	// flatten: one object per alt allele, carrying locus attributes plus allele attributes
	const mlst = []
	for (const a of m.alleles) {
		const m2 = {}
		for (const k in m) {
			if (k != 'alleles') {
				m2[k] = m[k]
			}
		}
		for (const k in a) {
			if (k == 'allele') {
				m2.alt = a[k]
			} else if (k == 'info') {
				m2.altinfo = a[k]
			} else {
				m2[k] = a[k]
			}
		}
		if (!m2.issymbolicallele && m2.alt != 'NON_REF') {
			// ref/alt were already trimmed by correctRefAlt() above
			m2.type = getVariantType(m2.ref, m2.alt)
		}
		mlst.push(m2)
	}

	return [
		badinfokeys.length ? 'unknown info keys: ' + badinfokeys.join(',') : null,
		mlst,
		altinvalid.length > 0 ? altinvalid : null
	]
}
273
+
274
/**
 * Trim bases shared by the ref and alt alleles.
 *
 * Identical trailing bases are removed first, then identical leading bases;
 * each removed leading base advances the position by one. Trimming stops as
 * soon as either allele is down to a single base.
 *
 * @param {number} p - position of the first base of ref
 * @param {string} ref - reference allele
 * @param {string} alt - alternative allele
 * @returns {Array} [position, ref, alt] after trimming
 */
function correctRefAlt(p, ref, alt) {
	// for oligos, always trim the last identical base
	while (ref.length > 1 && alt.length > 1 && ref[ref.length - 1] == alt[alt.length - 1]) {
		ref = ref.slice(0, -1)
		alt = alt.slice(0, -1)
	}
	// move position up as long as first positions are equal
	while (ref.length > 1 && alt.length > 1 && ref[0] == alt[0]) {
		ref = ref.slice(1)
		alt = alt.slice(1)
		p++
	}
	return [p, ref, alt]
}
288
+
289
/**
 * Parse the per-sample columns of one VCF line and attach the values to the
 * alt alleles of m.
 *
 * For every sample that has at least one non-"." value, one entry is pushed to
 * .sampledata[] of every alt allele (m.alleles[1..]); the sample may not
 * actually carry that allele. The entry holds .sampleobj plus one property per
 * FORMAT field that has a value.
 *
 * @param {string[]} lst - tab-split fields of the line; lst[8] is the FORMAT column, samples start at lst[9]
 * @param {object} m - variant under construction
 *   .alleles[0] is the ref allele placeholder, the rest are alt alleles, each with
 *   .ref, .allele, .allele_original, .sampledata[]
 * @param {object} vcf - .samples [ {name} ] and .format {} from the meta lines, both optional
 */
function parse_FORMAT2(lst, m, vcf) {
	const formatfields = lst[9 - 1].split(':')

	// the sampledata entry most recently pushed for alt allele i, i.e. the current sample
	const current = i => {
		const sd = m.alleles[i].sampledata
		return sd[sd.length - 1]
	}

	for (let col = 9; col < lst.length; col++) {
		// for each sample
		const valuelst = lst[col].split(':')

		// if this sample has no data in this line (e.g. ".:."), skip it
		if (valuelst.every(v => v == '.')) {
			continue
		}

		const sampleidx = col - 9
		const header = vcf.samples && vcf.samples[sampleidx]

		/*
		for each alt allele, initialize obj of this sample and store in this allele
		later, iterate over format fields and put in appropriate values
		*/
		for (let i = 1; i < m.alleles.length; i++) {
			const sobj = {}
			if (header) {
				for (const k in header) {
					sobj[k] = header[k]
				}
			} else {
				sobj.name = 'missing_samplename_from_vcf_header'
			}
			m.alleles[i].sampledata.push({ sampleobj: sobj })
		}

		for (let fi = 0; fi < formatfields.length; fi++) {
			// for each field of this sample
			const field = formatfields[fi]
			const value = valuelst[fi]
			if (value === undefined || value == '.') {
				// no value for this field; undefined happens when the sample column
				// has fewer ":" separated values than the FORMAT column declares
				continue
			}

			if (field == 'GT') {
				const splitter = value.indexOf('/') != -1 ? '/' : '|'
				let gtsum = 0 // sum of allele indices, 0 means all ref
				let unknowngt = false // if any is '.', then won't calculate gtallref
				const gtalleles = []
				for (const idxstr of value.split(splitter)) {
					if (idxstr == '.') {
						unknowngt = true
						continue
					}
					const j = Number.parseInt(idxstr, 10)
					if (Number.isNaN(j)) {
						unknowngt = true
						continue
					}
					gtsum += j
					const ale = m.alleles[j]
					if (ale) {
						gtalleles.push(ale.allele)
					}
				}
				const gtallref = !unknowngt && gtsum == 0
				const genotype = gtalleles.join(splitter)

				for (let i = 1; i < m.alleles.length; i++) {
					const ms = current(i)
					ms.GT = value
					ms.genotype = genotype
					if (gtallref) {
						ms.gtallref = true
					}
					// for mds vcf to drop out samples that do not have this alt allele
					ms.__gtalleles = gtalleles
				}
				continue
			}

			// other data fields
			const formatdesc = vcf.format ? vcf.format[field] : null
			if (!formatdesc) {
				// unspecified field, put raw value to all alt alleles
				// NOTE(review): an AD field reaching here stays a raw string, and the
				// allele2readcount loop at the end will then enumerate its characters
				for (let i = 1; i < m.alleles.length; i++) {
					current(i)[field] = value
				}
				continue
			}

			const isinteger = formatdesc.Type == 'Integer'
			const isfloat = formatdesc.Type == 'Float'
			// convert one comma-separated item according to the declared Type
			const cast = str => {
				if (isinteger) return Number.parseInt(str, 10)
				if (isfloat) return Number.parseFloat(str)
				return str
			}

			if ((formatdesc.Number && formatdesc.Number == 'R') || field == 'AD') {
				/*
				per-allele value, including ref
				v4.1 has AD not with "R", must process as R
				*/
				const fvlst = value.split(',').map(cast)
				for (let i = 1; i < m.alleles.length; i++) {
					if (fvlst[i] != undefined) {
						// this allele has value, keyed by this allele's ref/alt (after nt trimming)
						const m2 = m.alleles[i]
						const m2s = current(i)
						m2s[field] = {}
						m2s[field][m2.ref] = fvlst[0]
						m2s[field][m2.allele] = fvlst[i]
					}
				}
				continue
			}

			if (formatdesc.Number && formatdesc.Number == 'A') {
				// per alt-allele value, index 0 belongs to the first alt allele
				const fvlst = value.split(',').map(cast)
				for (let i = 1; i < m.alleles.length; i++) {
					if (fvlst[i - 1] != undefined) {
						const m2 = m.alleles[i]
						const m2s = current(i)
						m2s[field] = {}
						m2s[field][m2.allele] = fvlst[i - 1]
					}
				}
				continue
			}

			// otherwise, append this field as raw string to all alt
			for (let i = 1; i < m.alleles.length; i++) {
				current(i)[field] = value
			}
		}
	}

	// compatible with old ds: make allele2readcount from AD
	for (const a of m.alleles) {
		for (const s of a.sampledata) {
			if (s.AD) {
				s.allele2readcount = {}
				for (const k in s.AD) {
					s.allele2readcount[k] = s.AD[k]
				}
			}
		}
	}
}
471
+
472
/**
 * Parse the body of a ##INFO / ##FORMAT meta line into a key-value object and
 * register it in hash under its ID.
 *
 * The input is a comma-joined list of key=value pairs; a value may be enclosed
 * in double quotes, in which case commas and equal signs inside the quotes are
 * part of the value.
 *
 * @param {string} s - e.g. ID=DP,Number=1,Type=Integer,Description="Total Depth"
 * @param {object} hash - receives hash[ID] = {ID, Number, Type, Description, ...}
 * @returns {string|undefined} error message; 'no ID' when the ID key is missing
 *   (nothing is added to hash in that case), otherwise newline-joined parsing
 *   errors, or undefined when there was none
 */
function tohash(s, hash) {
	const h = {}
	const err = []
	let prev = 0 // start of the key or unquoted value currently being read
	let prevdoublequote = false // the value before the upcoming comma was quoted and is already stored
	let k = null // key waiting for its value

	for (let i = 0; i < s.length; i++) {
		const c = s[i]

		if (c == '"') {
			// quoted value runs up to the next double quote
			const thisstart = i + 1
			let end = s.indexOf('"', thisstart)
			if (end == -1) {
				// closing quote is missing: take the rest of the string instead of scanning forever
				end = s.length
				err.push('unterminated double quote')
			}
			if (k) {
				h[k] = s.substring(thisstart, end)
				k = null
			} else {
				err.push('k undefined before double quotes')
			}
			prevdoublequote = true
			i = end // the loop increment steps over the closing quote
			continue
		}

		if (c == '=') {
			k = s.substring(prev, i)
			prev = i + 1
			continue
		}

		if (c == ',') {
			if (prevdoublequote) {
				prevdoublequote = false
			} else if (k) {
				h[k] = s.substring(prev, i)
				k = null
			} else {
				err.push('k undefined')
			}
			prev = i + 1
		}
	}

	if (k) {
		// last pair is unquoted and has no trailing comma, its value runs to the end of the string
		h[k] = s.substring(prev)
	}

	if (!h.ID) {
		return 'no ID'
	}
	hash[h.ID] = h
	if (err.length) return err.join('\n')
}
527
+
528
/**
 * Fill in m.info{} (locus level) and m.alleles[].info{} (per alt allele) from
 * the key-value pairs of the INFO column, using the INFO meta definitions.
 *
 * m.alleles must contain alt alleles only when this is called, so that
 * Number=A values can be assigned by index.
 *
 * @param {object} tmp - raw INFO key-value pairs of this line
 * @param {object} m - variant; .info{} and .alleles[].info{} are modified
 * @param {object} vcf - .info {} holds the ##INFO definitions (Number, Type, ...)
 * @returns {string[]} keys found in tmp that are not defined in vcf.info
 */
function parse_INFO(tmp, m, vcf) {
	const badinfokeys = []

	for (const key in tmp) {
		const desc = vcf.info[key]
		if (desc == undefined) {
			badinfokeys.push(key)
			continue
		}

		const value = tmp[key]

		////////////////// hard-coded fields

		if (key == 'CSQ') {
			const okay = parse_CSQ(value, vcf.info.CSQ.csqheader, m)
			if (!okay) {
				// keep the raw value when it could not be parsed
				m.info[key] = value
			}
			continue
		}
		if (key == 'ANN') {
			const okay = parse_ANN(value, vcf.info.ANN.annheader, m)
			if (!okay) {
				m.info[key] = value
			}
			continue
		}

		////////////////// end of hardcoded fields

		if (desc.Type == 'Flag' || desc.Number == '0') {
			// flag has no value, its presence is recorded by storing the key itself
			m.info[key] = key
			continue
		}

		const isinteger = desc.Type == 'Integer'
		const isfloat = desc.Type == 'Float'
		// convert one item according to the declared Type; the explicit arrow keeps
		// Array.map() from passing the element index as the radix of parseInt
		const cast = str => {
			if (isinteger) return Number.parseInt(str, 10)
			if (isfloat) return Number.parseFloat(str)
			return str
		}

		if (desc.Number == 'A') {
			// per alt allele, values are in the order of the alt alleles
			const tt = value.split(',')
			for (let j = 0; j < tt.length; j++) {
				if (m.alleles[j]) {
					m.alleles[j].info[key] = cast(tt[j])
				}
			}
			continue
		}

		/*
		FIXME Number=R is not handled separately: m.alleles only contains alt alleles
		and there is no place to keep info of the ref allele,
		such fields fall through and are stored as a list at locus level
		*/

		if (desc.Number == '1') {
			// single value
			m.info[key] = cast(value)
			continue
		}

		if (!value.split) {
			// value is not a string, cannot be split into a list
			continue
		}

		// number of values unknown, "commas are permitted only as delimiters for lists of values"
		// the stored value is always an array
		m.info[key] = value.split(',').map(cast)
	}
	return badinfokeys
}
@@ -0,0 +1,18 @@
1
+ import { mclassdeletion, mclasssnv, mclassmnv, mclassinsertion, mclassnonstandard } from './common.js'
2
+
3
/**
 * Classify a variant by comparing the lengths of its ref and alt alleles.
 *
 * @param {string} ref - reference allele
 * @param {string} alt - alternative allele
 * @returns one of the mclass* constants imported from common.js
 */
export function getVariantType(ref, alt) {
	const reflen = ref.length
	const altlen = alt.length

	if (reflen == 1 && altlen == 1) {
		// single base on both sides: "." as alt means the alt is missing, otherwise snv
		return alt == '.' ? mclassdeletion : mclasssnv
	}
	if (reflen == altlen) {
		return mclassmnv
	}
	// FIXME only when empty length of one allele
	if (reflen < altlen) {
		return mclassinsertion
	}
	if (reflen > altlen) {
		return mclassdeletion
	}
	return mclassnonstandard
}