@datagrok/bio 1.2.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/detectors.js +200 -1
- package/dist/package-test.js +406 -87
- package/dist/package.js +351 -9
- package/package.json +3 -3
- package/src/package-test.ts +7 -4
- package/src/package.ts +9 -0
- package/src/tests/{Palettes.test.ts → Palettes-test.ts} +2 -2
- package/src/tests/WebLogo-test.ts +127 -0
- package/src/tests/detectors-test.ts +166 -0
- package/{test-Bio-3109311545e4-fcd6a7f5.html → test-Bio-69a4761f6044-51a4ab35.html} +2 -2
- package/src/tests/WebLogo.test.ts +0 -132
package/detectors.js
CHANGED
|
@@ -4,6 +4,205 @@
|
|
|
4
4
|
* See also: https://datagrok.ai/help/develop/how-to/define-semantic-type-detectors
|
|
5
5
|
* The class name is comprised of <PackageName> and the `PackageDetectors` suffix.
|
|
6
6
|
* Follow this naming convention to ensure that your detectors are properly loaded.
|
|
7
|
+
*
|
|
8
|
+
* TODO: Use detectors from WebLogo pickUp.. methods
|
|
7
9
|
*/
|
|
8
|
-
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BioPackageDetectors extends DG.Package {
  /** Semantic type returned by {@link detectMacromolecule}. */
  static semType = 'MACROMOLECULE';

  /** Units tag values for FASTA sequences (SEQ / MSA) over peptide (PT) and nucleotide (NT) alphabets. */
  static Units = {
    FastaSeqPt: 'fasta:SEQ:PT', FastaSeqNt: 'fasta:SEQ:NT', FastaMsaPt: 'fasta:MSA:PT', FastaMsaNt: 'fasta:MSA:NT',
  };

  /** Single-letter monomer set used as the 'PT' alphabet candidate. */
  static AminoacidsFastaAlphabet = new Set([
    'G', 'L', 'Y', 'S', 'E', 'Q', 'D', 'N', 'F', 'A',
    'K', 'R', 'H', 'C', 'V', 'P', 'W', 'I', 'M', 'T',
  ]);

  /** Single-letter monomer set used as the 'NT' alphabet candidate. */
  static NucleotidesFastaAlphabet = new Set(['A', 'C', 'G', 'T']);

  //tags: semTypeDetector
  //input: column col
  //output: string semType
  detectMacromolecule(col) {
    // To collect alphabet freq three strategies can be used:
    // as chars, as fasta (single or within square brackets), as with the separator.
    //
    // Side effect: sets the DG.TAGS.UNITS tag of `col` to `<format>:<seqType>:<alphabet>`.
    // NOTE(review): every path returns semType, so any column passed in is tagged
    // as a macromolecule (possibly with alphabet 'UN') - confirm this is intended.

    const alphabetCandidates = [
      ['NT', BioPackageDetectors.NucleotidesFastaAlphabet],
      ['PT', BioPackageDetectors.AminoacidsFastaAlphabet],
    ];
    // Sequences with this many monomers or fewer do not contribute to frequencies.
    const minLength = 5;

    // TODO: Detect HELM sequence
    // TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
    const statsAsChars = BioPackageDetectors.getStats(col, minLength, BioPackageDetectors.splitterAsChars);

    let format;
    let seqType;
    let alphabet;
    if (statsAsChars.sameLength) {
      // All values have the same number of characters: treat as fasta with '-' as the gap symbol.
      format = 'fasta';
      seqType = 'SEQ.MSA';
      alphabet = BioPackageDetectors.detectAlphabet(statsAsChars.freq, alphabetCandidates, '-');
    } else {
      const sep = BioPackageDetectors.detectSeparator(statsAsChars.freq);
      const gapSymbol = sep ? '' : '-';
      const splitter = sep ? BioPackageDetectors.getSplitterWithSeparator(sep) : BioPackageDetectors.splitterAsFasta;
      const stats = BioPackageDetectors.getStats(col, minLength, splitter);

      format = sep ? 'separator' : 'fasta';
      seqType = stats.sameLength ? 'SEQ.MSA' : 'SEQ';

      // TODO: If separator detected, then extra efforts to detect alphabet are allowed.
      alphabet = BioPackageDetectors.detectAlphabet(stats.freq, alphabetCandidates, gapSymbol);
    }

    col.setTag(DG.TAGS.UNITS, `${format}:${seqType}:${alphabet}`);
    return BioPackageDetectors.semType;
  }

  /** Picks the most frequent symbol and reports it as the separator when its count,
   * relative to the summed count of all other symbols, exceeds 3.5 / (number of distinct symbols).
   * Does not use any splitting strategies, estimates just by single characters.
   * @param freq object mapping a symbol to its occurrence count
   * @returns the separator symbol, or null when none qualifies or freq is empty
   * */
  static detectSeparator(freq) {
    // !!! But there is a caveat because exceptionally frequent char can be a gap symbol in MSA.
    // !!! What is the difference between the gap symbol and separator symbol in stats terms?

    const entries = Object.entries(freq);
    // Nothing was counted (e.g. all sequences were too short): there is no candidate at all.
    if (entries.length === 0) {
      return null;
    }

    // First symbol holding the maximal count wins (ties resolved by insertion order).
    let sep = entries[0][0];
    let sepFreq = entries[0][1];
    for (const [symbol, count] of entries) {
      if (count > sepFreq) {
        sep = symbol;
        sepFreq = count;
      }
    }

    let otherSumFreq = 0;
    for (const [symbol, count] of entries) {
      if (symbol !== sep) {
        otherSumFreq += count;
      }
    }

    const freqThreshold = 3.5 * (1 / entries.length);
    // NOTE: with a single distinct symbol otherSumFreq is 0, the ratio is Infinity
    // and that symbol is reported as the separator.
    return sepFreq / otherSumFreq > freqThreshold ? sep : null;
  }

  /** Stats of sequences with specified splitter func.
   * @param seqCol column-like object; its `categories` are iterated as the sequences
   * @param minLength sequences with this many monomers or fewer are excluded from `freq`
   *                  (they still take part in the `sameLength` check)
   * @param splitter function turning a sequence into an array of monomers
   * @returns { freq, sameLength } monomer counts and whether all sequences split to equal length
   * */
  static getStats(seqCol, minLength, splitter) {
    const freq = {};
    let sameLength = true;
    let firstLength = null;

    for (const seq of seqCol.categories) {
      const mSeq = splitter(seq);

      if (firstLength === null) {
        firstLength = mSeq.length;
      } else if (mSeq.length !== firstLength) {
        sameLength = false;
      }

      if (mSeq.length > minLength) {
        for (const m of mSeq) {
          // Own-property check: monomers named like inherited members ('constructor', ...)
          // must start from 0 rather than from the inherited value.
          const prev = Object.prototype.hasOwnProperty.call(freq, m) ? freq[m] : 0;
          freq[m] = prev + 1;
        }
      }
    }
    return {freq: freq, sameLength: sameLength};
  }

  /** Detects alphabet for freq by freq similarity to alphabet monomer set.
   * @param freq       frequencies of monomers in sequence set
   * @param candidates an array of pairs [name, monomer set]
   * @param gapSymbol  symbol excluded from the similarity computation
   * @returns name of the first candidate with the highest similarity when that similarity
   *          is above 0.65, otherwise 'UN'
   * */
  static detectAlphabet(freq, candidates, gapSymbol) {
    const minSimilarity = 0.65;

    let bestName = 'UN';
    let bestSim = -Infinity;
    for (const [name, alphabet] of candidates) {
      const sim = BioPackageDetectors.getAlphabetSimilarity(freq, alphabet, gapSymbol);
      // Strict '>' keeps the earliest candidate on ties.
      if (sim > bestSim) {
        bestSim = sim;
        bestName = name;
      }
    }
    return bestSim > minSimilarity ? bestName : 'UN';
  }

  /** Cosine similarity between the monomer counts and the 0/1 membership vector of an alphabet,
   * computed over the union of observed monomers and alphabet monomers, gap symbol excluded.
   * @param freq      object mapping a monomer to its occurrence count
   * @param alphabet  Set of monomers of the candidate alphabet
   * @param gapSymbol symbol removed before comparison
   * @returns cosine in [0, 1]; 0 when either vector is all zeros
   * */
  static getAlphabetSimilarity(freq, alphabet, gapSymbol) {
    const keys = new Set([...Object.keys(freq), ...alphabet]);
    keys.delete(gapSymbol);

    const freqA = [];
    const alphabetA = [];
    for (const m of keys) {
      freqA.push(Object.prototype.hasOwnProperty.call(freq, m) ? freq[m] : 0);
      alphabetA.push(alphabet.has(m) ? 1 : 0);
    }

    /* There were a few ideas: chi-squared, pearson correlation (variance?), scalar product */
    const norm = BioPackageDetectors.vectorLength(freqA) * BioPackageDetectors.vectorLength(alphabetA);
    // Zero vector (nothing counted): avoid 0 / 0 = NaN, report "no similarity".
    if (norm === 0) {
      return 0;
    }
    return BioPackageDetectors.vectorDotProduct(freqA, alphabetA) / norm;
  }

  /** Euclidean length of a numeric vector. */
  static vectorLength(v) {
    let sqrSum = 0;
    for (const x of v) {
      sqrSum += x * x;
    }
    return Math.sqrt(sqrSum);
  }

  /** Dot product of two numeric vectors.
   * @throws Error when the vectors differ in length
   * */
  static vectorDotProduct(v1, v2) {
    if (v1.length !== v2.length) {
      throw new Error('The dimensionality of the vectors must match');
    }
    let prod = 0;
    for (let i = 0; i < v1.length; i++) {
      prod += v1[i] * v2[i];
    }
    return prod;
  }

  /** For trivial checks split by single chars*/
  static splitterAsChars(seq) {
    return seq.split('');
  }

  /** Builds a splitter that splits a sequence by the given separator. */
  static getSplitterWithSeparator(sep) {
    return (seq) => seq.split(sep);
  }

  // Multichar monomer names in square brackets, single char monomers or gap symbol
  static monomerRe = /\[(\w+)\]|(\w)|(-)/g;

  /** Split sequence for single character monomers, square brackets multichar monomer names or gap symbol.
   * A bracketed monomer is replaced by its short synonym from aaSynonyms, or by an empty string
   * (with a debug log message) when no synonym is known.
   * */
  static splitterAsFasta(seq) {
    const matches = seq.toString().matchAll(BioPackageDetectors.monomerRe);
    return Array.from(matches, (ma) => {
      const m = ma[0];
      if (m.length <= 1) {
        return m;
      }
      if (Object.prototype.hasOwnProperty.call(BioPackageDetectors.aaSynonyms, m)) {
        return BioPackageDetectors.aaSynonyms[m];
      }
      console.debug(`Long monomer '${m}' has not a short synonym.`);
      return '';
    });
  }

  /** Only some of the synonyms. These were obtained from the clustered oligopeptide dataset. */
  static aaSynonyms = {
    '[MeNle]': 'L', // Nle - norleucine
    '[MeA]': 'A', '[MeG]': 'G', '[MeF]': 'F',
  };
}
|