@sjcrh/proteinpaint-server 2.43.3-0 → 2.46.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dataset/clinvar.hg19.js +53 -52
- package/dataset/clinvar.hg38.js +74 -73
- package/dataset/clinvar.js +164 -47
- package/dataset/termdb.test.js +257 -0
- package/genome/CriGri.js +1859 -27
- package/genome/cgc.js +743 -7
- package/genome/danRer10.js +1108 -46
- package/genome/dm3.js +71 -44
- package/genome/dm6.js +1926 -45
- package/genome/galGal5.js +23522 -46
- package/genome/galGal6.js +512 -46
- package/genome/hg19.js +293 -198
- package/genome/hg38.js +472 -105
- package/genome/hg38.test.js +406 -40
- package/genome/hgvirus.js +45 -20
- package/genome/mm10.js +135 -67
- package/genome/mm9.js +116 -79
- package/genome/rn6.js +1002 -47
- package/package.json +32 -36
- package/routes/_template_.js +30 -0
- package/routes/burden.js +149 -0
- package/routes/dataset.js +266 -0
- package/routes/dsdata.js +127 -0
- package/routes/gdc.maf.js +120 -0
- package/routes/gdc.mafBuild.js +106 -0
- package/routes/gdc.topMutatedGenes.js +465 -0
- package/routes/gene2canonicalisoform.js +41 -0
- package/routes/genelookup.js +52 -0
- package/routes/genomes.js +144 -0
- package/routes/healthcheck.js +30 -0
- package/routes/hicdata.js +98 -0
- package/routes/hicstat.js +55 -0
- package/routes/isoformlst.js +57 -0
- package/routes/ntseq.js +43 -0
- package/routes/pdomain.js +61 -0
- package/routes/snp.js +107 -0
- package/routes/termdb.categories.js +209 -0
- package/routes/termdb.cluster.js +228 -0
- package/routes/termdb.cohort.summary.js +38 -0
- package/routes/termdb.cohorts.js +49 -0
- package/routes/termdb.config.js +201 -0
- package/routes/termdb.getdescrstats.js +102 -0
- package/routes/termdb.getnumericcategories.js +92 -0
- package/routes/termdb.getpercentile.js +108 -0
- package/routes/termdb.getrootterm.js +65 -0
- package/routes/termdb.gettermchildren.js +67 -0
- package/routes/termdb.singleSampleMutation.js +80 -0
- package/routes/termdb.singlecellData.js +46 -0
- package/routes/termdb.singlecellSamples.js +160 -0
- package/routes/termdb.termsbyids.js +59 -0
- package/routes/termdb.topVariablyExpressedGenes.js +171 -0
- package/routes/termdb.violin.js +77 -0
- package/src/app.js +41498 -0
- package/src/serverconfig.js +14 -8
- package/start.js +3 -3
- package/routes/README.md +0 -84
- package/routes/burden.ts +0 -143
- package/routes/gdc.maf.ts +0 -195
- package/routes/gdc.mafBuild.ts +0 -114
- package/routes/gdc.topMutatedGenes.ts +0 -586
- package/routes/genelookup.ts +0 -50
- package/routes/healthcheck.ts +0 -29
- package/routes/hicdata.ts +0 -111
- package/routes/hicstat.ts +0 -55
- package/routes/termdb.categories.ts +0 -245
- package/routes/termdb.cluster.ts +0 -248
- package/routes/termdb.getdescrstats.ts +0 -102
- package/routes/termdb.getnumericcategories.ts +0 -99
- package/routes/termdb.getpercentile.ts +0 -118
- package/routes/termdb.getrootterm.ts +0 -73
- package/routes/termdb.gettermchildren.ts +0 -82
- package/routes/termdb.singleSampleMutation.ts +0 -87
- package/routes/termdb.singlecellData.ts +0 -49
- package/routes/termdb.singlecellSamples.ts +0 -175
- package/routes/termdb.termsbyids.ts +0 -63
- package/routes/termdb.topVariablyExpressedGenes.ts +0 -214
- package/routes/termdb.violin.ts +0 -77
- package/server.js +0 -2
- package/server.js.map +0 -1
- package/shared/common.js +0 -1080
- package/shared/termdb.initbinconfig.js +0 -96
- package/shared/vcf.js +0 -629
|
@@ -1,96 +0,0 @@
|
|
|
1
|
-
/**
 * Initialize a bin configuration for a numeric dataset.
 *
 * @param {number[]} data - numeric data values. When the array holds more than one
 *   distinct value it is sorted in place (ascending).
 * @param {object} [opts] - options
 *   {}: output bin config as JavaScript object (default)
 *   {format: 'string'}: output bin config as JSON string
 * @returns {object|string} a 'custom-bin' config when all values are identical,
 *   otherwise a 'regular-bin' config (JSON-encoded when opts.format is 'string')
 * @throws {string} 'non-numeric values found' when any value is not a finite number;
 *   'options are not in the correct format' when opts.format is present but not 'string'
 */
function initBinConfig(data, opts = {}) {
	// `some` rather than `find`: `find` returns the offending element itself, so a
	// falsy offender (NaN, null, '') would be mistaken for "nothing found"
	if (data.some(d => !Number.isFinite(d))) throw 'non-numeric values found'
	let binConfig
	const s = new Set(data)
	if (s.size === 1) {
		// single unique value in data array
		// prepare custom bin config for 3 bins: first bin for values less than
		// the value, second bin for values equal to the value, and third bin
		// for values greater than the value
		// all data values will fall into the second bin
		const value = [...s][0]
		binConfig = {
			type: 'custom-bin',
			lst: [
				{ stop: value, stopinclusive: false, startunbounded: true, label: '<' + value },
				{ start: value, stop: value, startinclusive: true, stopinclusive: true, label: '=' + value },
				{ start: value, startinclusive: false, stopunbounded: true, label: '>' + value }
			]
		}
	} else {
		// multiple unique values in data array
		// prepare regular bin config

		// compute the bin size for a maximum bin number of 8
		// NOTE: sorts the caller's array in place
		data.sort((a, b) => a - b)
		const l = data.length
		const min = data[0]
		const max = data[l - 1]
		const binSize = (max - min) / 8
		// first bin stop will equal either (minimum + bin size) or (5th percentile), whichever is larger.
		let p5idx = Math.round(l * 0.05) - 1
		if (p5idx < 0) p5idx = 0
		const p5 = data[p5idx]
		const firstBinStop = Math.max(min + binSize, p5)
		// round the bin values
		const [binSize_rnd, firstBinStop_rnd, lastBinStart_rnd, rounding] = roundBinVals(binSize, firstBinStop, max, min)
		// generate the bin configuration
		binConfig = {
			type: 'regular-bin',
			startinclusive: true,
			bin_size: binSize_rnd,
			first_bin: { stop: firstBinStop_rnd }
		}
		// compare against undefined so that a last-bin start of exactly 0 is kept
		if (lastBinStart_rnd !== undefined) binConfig.last_bin = { start: lastBinStart_rnd }
		if (rounding) binConfig.rounding = rounding
	}
	if ('format' in opts) {
		if (opts.format === 'string') {
			return JSON.stringify(binConfig)
		} else {
			throw 'options are not in the correct format'
		}
	} else {
		return binConfig
	}
}

// CommonJS export; guarded so the file can also be evaluated where `module` is not defined
if (typeof module !== 'undefined') module.exports = initBinConfig
|
|
65
|
-
|
|
66
|
-
/*
Round the raw bin size and first-bin stop to display-friendly values.

<binSize>: unrounded bin size
<firstBinStop>: unrounded stop of the first bin
<max>, <min>: largest and smallest data value

Returns [binSize_rnd, firstBinStop_rnd, lastBinStart_rnd, rounding]
- lastBinStart_rnd stays undefined unless max lies beyond the 8th bin
- rounding (e.g. '.1f') stays undefined unless binSize < 1
*/
function roundBinVals(binSize, firstBinStop, max, min) {
	const log = Math.floor(Math.log10(binSize))
	const unit = 1 * 10 ** log
	const fiveUnits = 5 * 10 ** log
	const roundToStep = (value, step) => Math.round(value / step) * step

	let sizeRounded
	let firstStopRounded
	const isSmallBin = binSize >= 0.1 && binSize <= 2
	if (isSmallBin) {
		// small bin sizes snap to the nearest multiple of one unit
		sizeRounded = roundToStep(binSize, unit)
		firstStopRounded = roundToStep(firstBinStop, unit)
	} else {
		// every other bin size snaps to the nearest multiple of five units
		sizeRounded = roundToStep(binSize, fiveUnits)
		firstStopRounded = roundToStep(firstBinStop, fiveUnits)
		// a value that rounded down to zero falls back to a single unit
		if (sizeRounded === 0) sizeRounded = unit
		if (firstStopRounded === 0) firstStopRounded = unit
		if (sizeRounded === fiveUnits && firstStopRounded === unit) firstStopRounded = fiveUnits
	}

	if (firstStopRounded < min) firstStopRounded = firstStopRounded * 2

	// when the data extends past the stop of the 8th bin, pin the start of the
	// last bin so that no more than 8 bins are produced
	let lastStartRounded
	const eighthBinStop = firstStopRounded + sizeRounded * 7
	if (max > eighthBinStop) lastStartRounded = firstStopRounded + sizeRounded * 6

	let rounding
	if (binSize < 1) {
		// fractional bin size: trim results to |log| decimals and report the format
		const digits = Math.abs(log)
		const trim = v => Number(v.toFixed(digits))
		sizeRounded = trim(sizeRounded)
		firstStopRounded = trim(firstStopRounded)
		if (lastStartRounded) lastStartRounded = trim(lastStartRounded)
		rounding = '.' + digits + 'f'
	}

	// normalise negative zero
	if (Object.is(firstStopRounded, -0)) firstStopRounded = 0

	return [sizeRounded, firstStopRounded, lastStartRounded, rounding]
}
|
package/shared/vcf.js
DELETED
|
@@ -1,629 +0,0 @@
|
|
|
1
|
-
import { mclass } from './common'
|
|
2
|
-
import { dissect_INFO } from './vcf.info'
|
|
3
|
-
import { parse_CSQ } from './vcf.csq'
|
|
4
|
-
import { parse_ANN } from './vcf.ann'
|
|
5
|
-
import { getVariantType } from './vcf.type'
|
|
6
|
-
|
|
7
|
-
/*
|
|
8
|
-
Only for parsing vcf files
|
|
9
|
-
is not involved in creating vcf tracks
|
|
10
|
-
|
|
11
|
-
shared between client-server
|
|
12
|
-
*/
|
|
13
|
-
|
|
14
|
-
// captures the text between the angle brackets of a symbolic allele, e.g. <*:DEL>
const getallelename = /<(.+)>/

// reverse lookup: upper-cased mclass label -> mclass key
const mclasslabel2key = {}
for (const classKey in mclass) {
	const upperLabel = mclass[classKey].label.toUpperCase()
	mclasslabel2key[upperLabel] = classKey
}
|
|
21
|
-
|
|
22
|
-
export function vcfparsemeta(lines) {
	/*
	Parse the meta/header section of a vcf file.

	input: array of string, as lines separated by linebreak
		##INFO and ##FORMAT meta lines are parsed into descriptor hashes
		the line starting with "#C" (the #CHROM header) supplies sample names
		every other line is ignored

	return: [ info, format, samples, errors ]
		info     {} keyed by INFO ID, or null when no INFO line parsed cleanly
		format   {} keyed by FORMAT ID, or null when no FORMAT line parsed cleanly
		samples  [ {name} ] one per sample column of the header
		errors   [ str ], or null when there were none
	*/

	const info = {}
	const format = {}
	const errlst = []
	let sample = []
	let hasinfo = false
	let hasformat = false

	for (const line of lines) {
		if (line.startsWith('#C')) {
			// header, sample names start at the 10th column
			sample = line.split('\t').slice(9)
		} else if (line.startsWith('##INFO')) {
			const e = tohash(line.substring(8, line.length - 1), info)
			if (e) errlst.push('INFO error: ' + e)
			else hasinfo = true
		} else if (line.startsWith('##FORMAT')) {
			const e = tohash(line.substring(10, line.length - 1), format)
			if (e) errlst.push('FORMAT error: ' + e)
			else hasformat = true
		}
	}

	// one object per sample, so that more attributes can be attached later
	// (e.g. key4annotation to match with .ds.cohort.annotation)
	const sampleobjlst = sample.map(samplename => ({ name: samplename }))

	// reserved INFO fields

	if (info.CSQ) {
		// field names are the "|"-joined text after " Format: " in the description
		const formatPart = info.CSQ.Description.split(' Format: ')[1]
		const fieldnames = formatPart ? formatPart.split('|') : []
		if (fieldnames.length > 1) {
			// keep csq headers as objects to allow configuring show/hide of csq fields
			info.CSQ.csqheader = fieldnames.map(str => ({ name: str }))
		} else {
			errlst.push('unknown format for CSQ header: ' + info.CSQ.Description)
		}
	}

	if (info.ANN) {
		// field names are the " | "-joined text inside the first pair of single quotes
		const quoted = info.ANN.Description.split("'")[1]
		if (!quoted) {
			errlst.push('no single-quote enclosed annotation fields for ANN (snpEff annotation): ' + info.ANN.Description)
		} else {
			const fieldnames = quoted.split(' | ')
			if (fieldnames.length) {
				info.ANN.annheader = fieldnames.map(str => ({ name: str }))
			} else {
				errlst.push('no " | " joined annotation fields for ANN (snpEff annotation): ' + info.ANN.Description)
			}
		}
	}

	return [hasinfo ? info : null, hasformat ? format : null, sampleobjlst, errlst.length ? errlst : null]
}
|
|
118
|
-
|
|
119
|
-
export function vcfparseline(line, vcf) {
	/*
	Parse one tab-delimited vcf data line.

	vcf, samples/info is generated by vcfparsemeta()
		.nochr BOOL   when set, "chr" is prepended to the chromosome name
		.samples [ {} ]
			.name
		.info {}
		.format {}

	return: [ error, mlst, altinvalid ]
		error       STR or null
		mlst        [ m ] one m per alt allele (null on early error)
			chr
			pos
			name
			type
			ref
			alt
			altstr
			sampledata []
			altinfo
		altinvalid  [] of unparsable symbolic alt alleles, or null
	*/

	const lst = line.split('\t')
	if (lst.length < 8) return ['line has less than 8 fields', null, null]

	const [chrom, posField, idField, refallele, altField, , , infoField, formatField, firstSampleField] = lst

	const rawpos = Number.parseInt(posField)
	if (!Number.isInteger(rawpos)) return ['invalid value for genomic position', null, null]

	const m = {
		vcf_ID: idField,
		chr: (vcf.nochr ? 'chr' : '') + chrom,
		pos: rawpos - 1,
		ref: refallele,
		altstr: altField,
		// index 0 holds the ref allele purely as a placeholder so that GT allele
		// indices line up while FORMAT is parsed; it is removed again below
		alleles: [{ allele: refallele, sampledata: [] }],
		info: {}, // locus info, do not contain allele info
		name: idField == '.' ? null : idField
	}

	// parse alt
	const altinvalid = []
	for (const rawAlt of altField.split(',')) {
		const a = {
			ref: m.ref, // may be corrected just below!
			allele: rawAlt,
			// keep original allele for matching with csq which hardcodes original allele
			allele_original: rawAlt,
			sampledata: [],
			_m: m,
			info: {} // allele info, do not contain locus info
		}
		m.alleles.push(a)

		if (rawAlt[0] != '<') {
			// normal nucleotide: trim bases shared between ref and alt
			const [p, ref, alt] = correctRefAlt(m.pos, m.ref, a.allele)
			a.pos = p
			a.ref = ref
			a.allele = alt
			continue
		}

		// symbolic allele, show text within <> as name
		// FIXME match INFO
		const hit = rawAlt.match(getallelename)
		if (!hit) {
			altinvalid.push(rawAlt)
			continue
		}
		a.type = hit[1]
		a.allele = hit[1]
		a.issymbolicallele = true
	}

	if (formatField && firstSampleField) parse_FORMAT2(lst, m, vcf)

	// remove ref allele so it only contain alternative alleles
	// so that parse_INFO can safely apply Number=A fields to m.alleles
	m.alleles.shift()

	// info
	const tmp = infoField == '.' ? [] : dissect_INFO(infoField)
	const badinfokeys = vcf.info ? parse_INFO(tmp, m, vcf) : []
	if (!vcf.info) {
		// vcf meta lines told nothing about INFO, do not parse
		m.info = tmp
	}

	// flatten: one record per alt allele, locus attributes first, then allele attributes
	const mlst = m.alleles.map(a => {
		const m2 = {}
		for (const k in m) {
			if (k != 'alleles') m2[k] = m[k]
		}
		for (const k in a) {
			switch (k) {
				case 'allele':
					m2.alt = a[k]
					break
				case 'info':
					m2.altinfo = a[k]
					break
				default:
					m2[k] = a[k]
			}
		}
		if (!m2.issymbolicallele && m2.alt != 'NON_REF') {
			m2.type = getVariantType(m2.ref, m2.alt)
		}
		return m2
	})

	return [
		badinfokeys.length ? 'unknown info keys: ' + badinfokeys.join(',') : null,
		mlst,
		altinvalid.length > 0 ? altinvalid : null
	]
}
|
|
273
|
-
|
|
274
|
-
/*
Trim bases shared by ref and alt, always leaving at least one base in each.

<p>: position of the first ref base
returns [p, ref, alt], with p advanced by the number of leading bases removed
*/
function correctRefAlt(p, ref, alt) {
	// for oligos, always trim identical bases from the end first
	let refEnd = ref.length
	let altEnd = alt.length
	while (refEnd > 1 && altEnd > 1 && ref[refEnd - 1] == alt[altEnd - 1]) {
		refEnd--
		altEnd--
	}

	// then trim identical bases from the start, moving the position up each time
	let start = 0
	while (refEnd - start > 1 && altEnd - start > 1 && ref[start] == alt[start]) {
		start++
		p++
	}

	return [p, ref.slice(start, refEnd), alt.slice(start, altEnd)]
}
|
|
288
|
-
|
|
289
|
-
function parse_FORMAT2(lst, m, vcf) {
	/*
	Fill m.alleles[].sampledata[] from the FORMAT column and the sample columns.

	lst: tab-split fields of one vcf line; lst[8] is FORMAT, lst[9..] are samples
	vcf: .samples[] and .format{} as generated by vcfparsemeta(), both optional

	m.alleles[0] is ref allele

	each allele:
		.ref
		.allele
		.allele_original
		.sampledata[] blank array

	For each sample that has any data, one entry is appended to the sampledata[]
	of every alt allele (note that the sample may not actually carry that allele):
		.sampleobj   copy of vcf.samples[i], or a placeholder name
		.GT, .genotype, .gtallref, .__gtalleles   from the GT field
		.<field>     other FORMAT fields; Number=R (and AD) and Number=A fields
		             become {allele: value}, anything else is kept as the raw string
		.allele2readcount   copy of .AD, for compatibility with old ds
	*/
	const formatfields = lst[9 - 1].split(':')

	// entry of the sample currently being processed
	const current = allele => allele.sampledata[allele.sampledata.length - 1]

	for (let _sampleidx = 9; _sampleidx < lst.length; _sampleidx++) {
		// for each sample

		const valuelst = lst[_sampleidx].split(':')
		// tell if this sample have any data in this line (variant), if .:., then skip
		if (valuelst.every(v => v == '.')) continue

		const sampleidx = _sampleidx - 9
		const headersample = vcf.samples && vcf.samples[sampleidx]

		// for each alt allele, initialize obj of this sample and store in this allele
		// later, to iterate over format fields and put in appropriate values
		for (let ai = 1; ai < m.alleles.length; ai++) {
			const sobj = {}
			if (headersample) {
				for (const k in headersample) {
					sobj[k] = headersample[k]
				}
			} else {
				sobj.name = 'missing_samplename_from_vcf_header'
			}
			m.alleles[ai].sampledata.push({ sampleobj: sobj })
		}

		for (let fi = 0; fi < formatfields.length; fi++) {
			// for each field of this sample

			const field = formatfields[fi]
			const value = valuelst[fi]
			// trailing sub-fields may be dropped from a sample column, leaving
			// `value` undefined; treat that the same as an explicit '.'
			if (value === undefined || value == '.') continue

			if (field == 'GT') {
				const splitter = value.indexOf('/') != -1 ? '/' : '|'
				let gtsum = 0 // for calculating gtallref=true, old
				let unknowngt = false // if any is '.', then won't calculate gtallref
				const gtalleles = []
				for (const idxstr of value.split(splitter)) {
					if (idxstr == '.') {
						unknowngt = true
						continue
					}
					const j = Number.parseInt(idxstr, 10)
					if (Number.isNaN(j)) {
						unknowngt = true
						continue
					}
					gtsum += j
					const ale = m.alleles[j]
					if (ale) gtalleles.push(ale.allele)
				}
				const gtallref = !unknowngt && gtsum == 0
				const genotype = gtalleles.join(splitter)
				for (let ai = 1; ai < m.alleles.length; ai++) {
					const ms = current(m.alleles[ai])
					ms.GT = value
					ms.genotype = genotype
					if (gtallref) ms.gtallref = true
					// for mds vcf to drop out samples that do not have this alt allele
					ms.__gtalleles = gtalleles
				}
				continue
			}

			// other data fields
			const formatdesc = vcf.format ? vcf.format[field] : null
			if (!formatdesc) {
				// unspecified field, put to all alt alleles
				for (let ai = 1; ai < m.alleles.length; ai++) {
					current(m.alleles[ai])[field] = value
				}
				continue
			}

			const isinteger = formatdesc.Type == 'Integer'
			const isfloat = formatdesc.Type == 'Float'
			const convert = raw => {
				if (isinteger) return Number.parseInt(raw, 10)
				if (isfloat) return Number.parseFloat(raw)
				return raw
			}

			if (formatdesc.Number == 'R' || field == 'AD') {
				// per-allele value, including ref
				// v4.1 has AD not with "R", must process as R
				const fvlst = value.split(',').map(convert)
				for (let ai = 1; ai < m.alleles.length; ai++) {
					if (fvlst[ai] == undefined) continue
					// this allele has value
					const m2 = m.alleles[ai]
					// use this allele's ref/alt (after nt trimming)
					const byallele = {}
					byallele[m2.ref] = fvlst[0]
					byallele[m2.allele] = fvlst[ai]
					current(m2)[field] = byallele
				}
				continue
			}

			if (formatdesc.Number == 'A') {
				// per alt-allele value
				const fvlst = value.split(',').map(convert)
				for (let ai = 1; ai < m.alleles.length; ai++) {
					if (fvlst[ai - 1] == undefined) continue
					// this allele has value
					const m2 = m.alleles[ai]
					// use this allele's alt (after nt trimming)
					const byallele = {}
					byallele[m2.allele] = fvlst[ai - 1]
					current(m2)[field] = byallele
				}
				continue
			}

			// otherwise, append this field to all alt
			for (let ai = 1; ai < m.alleles.length; ai++) {
				current(m.alleles[ai])[field] = value
			}
		}
	}

	// compatible with old ds: make allele2readcount from AD
	for (const a of m.alleles) {
		for (const s of a.sampledata) {
			// AD is a raw string when the header does not declare it; only the
			// {allele: count} object form can be copied meaningfully
			if (s.AD && typeof s.AD == 'object') {
				s.allele2readcount = {}
				for (const k in s.AD) {
					s.allele2readcount[k] = s.AD[k]
				}
			}
		}
	}
}
|
|
471
|
-
|
|
472
|
-
function tohash(s, hash) {
	/*
	Parse the comma-joined key=value body of one meta line (INFO or FORMAT),
	e.g.  ID=DP,Number=1,Type=Integer,Description="Total Depth"

	s: the text between "<" and ">" of the meta line
	hash: on success the parsed {key: value} object is stored as hash[ID]

	A value may be enclosed in double quotes, in which case commas and
	equal signs inside the quotes are part of the value.

	return: error message STR, or undefined when there is no error
		'no ID' when the line has no ID key (nothing is stored in that case)
	*/
	const h = {}
	const err = []
	let prev = 0
	let prevdoublequote = false
	let k = null

	for (let i = 0; i < s.length; i++) {
		const c = s[i]

		if (c == '"') {
			// quoted value: runs up to the next double quote
			const thisstart = i + 1
			let end = s.indexOf('"', thisstart)
			if (end == -1) {
				// no closing quote: take the rest of the string instead of scanning forever
				err.push('unterminated double quotes')
				end = s.length
			}
			if (k) {
				h[k] = s.substring(thisstart, end)
				k = null
			} else {
				err.push('k undefined before double quotes')
			}
			prevdoublequote = true
			// resume after the closing quote (the loop increment steps past it)
			i = end
			continue
		}

		if (c == '=') {
			k = s.substring(prev, i)
			prev = i + 1
			continue
		}

		if (c == ',') {
			if (prevdoublequote) {
				// value was already stored when the quotes were read
				prevdoublequote = false
			} else if (k) {
				h[k] = s.substring(prev, i)
				k = null
			} else {
				err.push('k undefined')
			}
			prev = i + 1
		}
	}

	// last key whose unquoted value runs to the end of the string
	if (k) {
		h[k] = s.substring(prev)
	}

	if (h.ID) {
		hash[h.ID] = h
	} else {
		return 'no ID'
	}
	if (err.length) return err.join('\n')
}
|
|
527
|
-
|
|
528
|
-
function parse_INFO(tmp, m, vcf) {
	/*
	this function fills in both m.info{} and m.alleles[].info{}

	tmp: {key: value} of the INFO column of one line
	m: .info{} receives locus-level values,
	   .alleles[] holds alt alleles only and receives Number=A values in .info{}
	vcf: .info{} holds the INFO descriptors from the meta lines

	the m.alleles[] will later be converted to [m], each carrying one alt allele
	each m will have .info{} for locus info, and .altinfo{} for alt allele info

	return: [ key ] of INFO keys that have no descriptor in vcf.info
	*/

	const badinfokeys = []

	for (const key in tmp) {
		const desc = vcf.info[key]
		if (desc == undefined) {
			badinfokeys.push(key)
			continue
		}

		const value = tmp[key]

		////////////////// hard-coded fields

		if (key == 'CSQ') {
			const okay = parse_CSQ(value, vcf.info.CSQ.csqheader, m)
			if (!okay) {
				// keep the raw text when it could not be parsed
				m.info[key] = value
			}
			continue
		}
		if (key == 'ANN') {
			const okay = parse_ANN(value, vcf.info.ANN.annheader, m)
			if (!okay) {
				m.info[key] = value
			}
			continue
		}

		////////////////// end of hardcoded fields

		if (desc.Type == 'Flag') {
			// flag has no value
			m.info[key] = key
			continue
		}

		const __number = desc.Number
		const isinteger = desc.Type == 'Integer'
		const isfloat = desc.Type == 'Float'
		// explicit one-argument wrappers: handing Number.parseInt straight to
		// Array.prototype.map would pass the element index as the radix
		const convert = raw => {
			if (isinteger) return Number.parseInt(raw, 10)
			if (isfloat) return Number.parseFloat(raw)
			return raw
		}

		if (__number == '0') {
			// no value, should be a Flag
			m.info[key] = key
			continue
		}

		if (__number == 'A') {
			// per alt allele
			if (!value.split) {
				// not a string, cannot be split into per-allele values
				continue
			}
			const tt = value.split(',')
			for (let j = 0; j < tt.length; j++) {
				if (m.alleles[j]) {
					m.alleles[j].info[key] = convert(tt[j])
				}
			}
			continue
		}

		if (__number == 'R') {
			/*
			FIXME "R" is not considered, m.alleles only contain alt, which .info{} for each
			the current datastructure does not support info for ref allele!
			falls through and is stored on the locus as a list
			*/
		}

		if (__number == '1') {
			// single value
			m.info[key] = convert(value)
			continue
		}

		if (!value.split) {
			// unknown error
			continue
		}

		// number of values unknown, "commas are permitted only as delimiters for lists of values"
		// value is always array!!
		m.info[key] = value.split(',').map(convert)
	}
	return badinfokeys
}
|