@datagrok/bio 2.1.2 → 2.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/detectors.js +206 -147
- package/dist/package-test.js +323 -141
- package/dist/package.js +65 -32
- package/dist/vendors-node_modules_datagrok-libraries_ml_src_workers_dimensionality-reducer_js.js +3 -3
- package/package.json +5 -5
- package/scripts/generate_fasta_csv_for_alphabets.R +6 -9
- package/src/__jest__/remote.test.ts +13 -7
- package/src/package-test.ts +4 -3
- package/src/package.ts +31 -21
- package/src/tests/checkInputColumn-tests.ts +1 -1
- package/src/tests/{convert-test.ts → converters-test.ts} +0 -0
- package/src/tests/detectors-benchmark-tests.ts +165 -0
- package/src/tests/{detectors-test.ts → detectors-tests.ts} +18 -18
- package/src/tests/renderers-test.ts +2 -2
- package/src/tests/splitters-test.ts +1 -1
- package/src/tests/{substructure-filter-tests.ts → substructure-filters-tests.ts} +1 -1
- package/src/tests/{test-sequnces-generators.ts → utils/sequences-generators.ts} +0 -0
- package/{test-Bio-62cc009524f3-73ccfff9.html → test-Bio-62cc009524f3-6c978eb5.html} +114 -113
package/detectors.js
CHANGED
|
@@ -8,6 +8,9 @@
|
|
|
8
8
|
* TODO: Use detectors from WebLogo pickUp.. methods
|
|
9
9
|
*/
|
|
10
10
|
|
|
11
|
+
const SEQ_SAMPLE_LIMIT = 100;
|
|
12
|
+
const SEQ_SAMPLE_LENGTH_LIMIT = 500;
|
|
13
|
+
|
|
11
14
|
/** enum type to simplify setting "user-friendly" notation if necessary */
|
|
12
15
|
const NOTATION = {
|
|
13
16
|
FASTA: 'fasta',
|
|
@@ -28,35 +31,37 @@ const ALIGNMENT = {
|
|
|
28
31
|
};
|
|
29
32
|
|
|
30
33
|
/** Class for handling notation units in Macromolecule columns */
|
|
31
|
-
|
|
32
|
-
|
|
34
|
+
const UnitsHandler = {
|
|
35
|
+
TAGS: {
|
|
33
36
|
aligned: 'aligned',
|
|
34
37
|
alphabet: 'alphabet',
|
|
35
38
|
alphabetSize: '.alphabetSize',
|
|
36
39
|
alphabetIsMultichar: '.alphabetIsMultichar',
|
|
37
40
|
separator: 'separator',
|
|
38
|
-
}
|
|
39
|
-
}
|
|
41
|
+
},
|
|
42
|
+
};
|
|
43
|
+
|
|
44
|
+
const isUrlRe = /[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)?/i;
|
|
40
45
|
|
|
41
46
|
class BioPackageDetectors extends DG.Package {
|
|
42
47
|
|
|
43
|
-
|
|
48
|
+
PeptideFastaAlphabet = new Set([
|
|
44
49
|
'G', 'L', 'Y', 'S', 'E', 'Q', 'D', 'N', 'F', 'A',
|
|
45
50
|
'K', 'R', 'H', 'C', 'V', 'P', 'W', 'I', 'M', 'T',
|
|
46
51
|
'MeNle', 'MeA', 'MeG', 'MeF',
|
|
47
52
|
]);
|
|
48
53
|
|
|
49
|
-
|
|
54
|
+
DnaFastaAlphabet = new Set(['A', 'C', 'G', 'T']);
|
|
50
55
|
|
|
51
|
-
|
|
56
|
+
RnaFastaAlphabet = new Set(['A', 'C', 'G', 'U']);
|
|
52
57
|
|
|
53
|
-
|
|
58
|
+
SmilesRawAlphabet = new Set([
|
|
54
59
|
'A', 'B', 'C', 'E', 'F', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'Z',
|
|
55
60
|
'a', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'r', 's', 't', 'u',
|
|
56
61
|
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
|
|
57
62
|
'+', '-', '.', , '/', '\\', '@', '[', ']', '(', ')', '#', '%', '=']);
|
|
58
63
|
|
|
59
|
-
|
|
64
|
+
SmartsRawAlphabet = new Set([
|
|
60
65
|
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
|
|
61
66
|
'!', '#', '$', '&', '(', ')', '*', '+', ',', '-', '.', ':', ';', '=', '@', '~', '[', ']',
|
|
62
67
|
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
|
|
@@ -67,7 +72,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
67
72
|
|
|
68
73
|
/** @param s {String} - string to check
|
|
69
74
|
* @returns {boolean} */
|
|
70
|
-
|
|
75
|
+
isHelm(s) {
|
|
71
76
|
return s.startsWith('PEPTIDE1{') || s.startsWith('CHEM1{') || s.startsWith('BLOB1{') ||
|
|
72
77
|
s.startsWith('RNA1{') || s.startsWith('DNA1{');
|
|
73
78
|
}
|
|
@@ -76,106 +81,123 @@ class BioPackageDetectors extends DG.Package {
|
|
|
76
81
|
//input: column col
|
|
77
82
|
//output: string semType
|
|
78
83
|
detectMacromolecule(col) {
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
84
|
+
const t1 = Date.now();
|
|
85
|
+
try {
|
|
86
|
+
// Fail early
|
|
87
|
+
if (col.type !== DG.TYPE.STRING) return null;
|
|
88
|
+
|
|
89
|
+
const categoriesSample = col.categories.length < SEQ_SAMPLE_LIMIT ? col.categories :
|
|
90
|
+
this.sample(col.categories, SEQ_SAMPLE_LIMIT);
|
|
91
|
+
|
|
92
|
+
// To collect alphabet freq three strategies can be used:
|
|
93
|
+
// as chars, as fasta (single or within square brackets), as with the separator.
|
|
94
|
+
if (
|
|
95
|
+
!(col.categories.length == 1 && !col.categories[0]) && // TODO: Remove with tests for single empty category value
|
|
96
|
+
DG.Detector.sampleCategories(col, (s) => this.isHelm(s), 1, SEQ_SAMPLE_LIMIT)
|
|
97
|
+
) {
|
|
98
|
+
const statsAsHelm = this.getStats(categoriesSample, 2,
|
|
99
|
+
this.getSplitterAsHelm(SEQ_SAMPLE_LENGTH_LIMIT));
|
|
100
|
+
col.setTag(DG.TAGS.UNITS, NOTATION.HELM);
|
|
101
|
+
|
|
102
|
+
// alphabetSize calculated on (sub)sample of data is incorrect
|
|
103
|
+
// const alphabetSize = Object.keys(statsAsHelm.freq).length;
|
|
104
|
+
const alphabetIsMultichar = Object.keys(statsAsHelm.freq).some((m) => m.length > 1);
|
|
105
|
+
// col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
|
|
106
|
+
col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
|
|
95
107
|
|
|
96
|
-
const decoyAlphabets = [
|
|
97
|
-
['SMILES', BioPackageDetectors.SmilesRawAlphabet, 0.30],
|
|
98
|
-
['SMARTS', BioPackageDetectors.SmartsRawAlphabet, 0.45],
|
|
99
|
-
];
|
|
100
|
-
|
|
101
|
-
const candidateAlphabets = [
|
|
102
|
-
[ALPHABET.PT, BioPackageDetectors.PeptideFastaAlphabet, 0.50],
|
|
103
|
-
[ALPHABET.DNA, BioPackageDetectors.DnaFastaAlphabet, 0.55],
|
|
104
|
-
[ALPHABET.RNA, BioPackageDetectors.RnaFastaAlphabet, 0.55],
|
|
105
|
-
];
|
|
106
|
-
|
|
107
|
-
// Check for url column, maybe it is too heavy check
|
|
108
|
-
const isUrlCheck = (s) => {
|
|
109
|
-
let res = true;
|
|
110
|
-
try {
|
|
111
|
-
const url = new URL(s);
|
|
112
|
-
res = true;
|
|
113
|
-
} catch {
|
|
114
|
-
res = false;
|
|
115
|
-
}
|
|
116
|
-
return res;
|
|
117
|
-
};
|
|
118
|
-
const isUrl = DG.Detector.sampleCategories(col, isUrlCheck, 1);
|
|
119
|
-
if (isUrl) return null;
|
|
120
|
-
|
|
121
|
-
// TODO: Detect HELM sequence
|
|
122
|
-
// TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
|
|
123
|
-
const statsAsChars = BioPackageDetectors.getStats(col, 5, BioPackageDetectors.splitterAsChars);
|
|
124
|
-
// if (Object.keys(statsAsChars.freq).length === 0) return;
|
|
125
|
-
|
|
126
|
-
const decoy = BioPackageDetectors.detectAlphabet(statsAsChars.freq, decoyAlphabets, null);
|
|
127
|
-
if (decoy != ALPHABET.UN) return null;
|
|
128
|
-
|
|
129
|
-
if (statsAsChars.sameLength) {
|
|
130
|
-
if (Object.keys(statsAsChars.freq).length > 0) { // require non empty alphabet
|
|
131
|
-
const alphabet = BioPackageDetectors.detectAlphabet(statsAsChars.freq, candidateAlphabets, '-');
|
|
132
|
-
if (alphabet === ALPHABET.UN) return null;
|
|
133
|
-
|
|
134
|
-
const units = NOTATION.FASTA;
|
|
135
|
-
col.setTag(DG.TAGS.UNITS, units);
|
|
136
|
-
col.setTag(UnitsHandler.TAGS.aligned, ALIGNMENT.SEQ_MSA);
|
|
137
|
-
col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
|
|
138
108
|
return DG.SEMTYPE.MACROMOLECULE;
|
|
139
109
|
}
|
|
140
|
-
|
|
141
|
-
const
|
|
110
|
+
|
|
111
|
+
const decoyAlphabets = [
|
|
112
|
+
['SMILES', this.SmilesRawAlphabet, 0.30],
|
|
113
|
+
['SMARTS', this.SmartsRawAlphabet, 0.43],
|
|
114
|
+
];
|
|
115
|
+
|
|
116
|
+
const candidateAlphabets = [
|
|
117
|
+
[ALPHABET.PT, this.PeptideFastaAlphabet, 0.50],
|
|
118
|
+
[ALPHABET.DNA, this.DnaFastaAlphabet, 0.55],
|
|
119
|
+
[ALPHABET.RNA, this.RnaFastaAlphabet, 0.55],
|
|
120
|
+
];
|
|
121
|
+
|
|
122
|
+
// Check for url column, maybe it is too heavy check
|
|
123
|
+
const isUrlCheck = (s) => {
|
|
124
|
+
let res = true;
|
|
125
|
+
try {
|
|
126
|
+
const url = new URL(s);
|
|
127
|
+
res = true;
|
|
128
|
+
} catch {
|
|
129
|
+
res = false;
|
|
130
|
+
}
|
|
131
|
+
return res;
|
|
132
|
+
// return isUrlRe.test(s);
|
|
133
|
+
};
|
|
134
|
+
const isUrl = categoriesSample.every((v) => { return !v || isUrlCheck(v); });
|
|
135
|
+
if (isUrl) return null;
|
|
136
|
+
|
|
137
|
+
// TODO: Detect HELM sequence
|
|
138
|
+
// TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
|
|
139
|
+
const statsAsChars = this.getStats(categoriesSample, 5,
|
|
140
|
+
this.getSplitterAsChars(SEQ_SAMPLE_LENGTH_LIMIT));
|
|
141
|
+
// if (Object.keys(statsAsChars.freq).length === 0) return;
|
|
142
|
+
|
|
143
|
+
const decoy = this.detectAlphabet(statsAsChars.freq, decoyAlphabets, null);
|
|
144
|
+
if (decoy != ALPHABET.UN) return null;
|
|
145
|
+
|
|
146
|
+
const separator = this.detectSeparator(statsAsChars.freq);
|
|
147
|
+
const units = separator ? NOTATION.SEPARATOR : NOTATION.FASTA;
|
|
142
148
|
const gapSymbol = separator ? '' : '-';
|
|
143
|
-
const splitter = separator ?
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
if (
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
149
|
+
const splitter = separator ? this.getSplitterWithSeparator(separator, SEQ_SAMPLE_LENGTH_LIMIT) :
|
|
150
|
+
this.getSplitterAsFasta(SEQ_SAMPLE_LENGTH_LIMIT);
|
|
151
|
+
|
|
152
|
+
col.setTag(DG.TAGS.UNITS, units);
|
|
153
|
+
if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
|
|
154
|
+
|
|
155
|
+
if (statsAsChars.sameLength) {
|
|
156
|
+
if (Object.keys(statsAsChars.freq).length > 0) { // require non empty alphabet
|
|
157
|
+
const stats = this.getStats(categoriesSample, 5, splitter);
|
|
158
|
+
const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, '-');
|
|
159
|
+
if (alphabet === ALPHABET.UN) return null;
|
|
160
|
+
|
|
161
|
+
col.setTag(UnitsHandler.TAGS.aligned, ALIGNMENT.SEQ_MSA);
|
|
162
|
+
col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
|
|
163
|
+
return DG.SEMTYPE.MACROMOLECULE;
|
|
164
|
+
}
|
|
165
|
+
} else {
|
|
166
|
+
const stats = this.getStats(categoriesSample, 5, splitter);
|
|
167
|
+
// Empty monomer alphabet is not allowed
|
|
168
|
+
if (Object.keys(stats.freq).length === 0) return null;
|
|
169
|
+
// Long monomer names for sequences with separators have constraints
|
|
170
|
+
if (separator && this.checkForbiddenWithSeparators(stats.freq)) return null;
|
|
171
|
+
|
|
172
|
+
const aligned = stats.sameLength ? ALIGNMENT.SEQ_MSA : ALIGNMENT.SEQ;
|
|
173
|
+
|
|
174
|
+
// TODO: If separator detected, then extra efforts to detect alphabet are allowed.
|
|
175
|
+
const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol);
|
|
176
|
+
|
|
177
|
+
// const forbidden = this.checkForbiddenWoSeparator(stats.freq);
|
|
178
|
+
if (separator || alphabet != 'UN') {
|
|
179
|
+
col.setTag(UnitsHandler.TAGS.aligned, aligned);
|
|
180
|
+
col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
|
|
181
|
+
if (alphabet === ALPHABET.UN) {
|
|
182
|
+
// alphabetSize calculated on (sub)sample of data is incorrect
|
|
183
|
+
// const alphabetSize = Object.keys(stats.freq).length;
|
|
184
|
+
const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
|
|
185
|
+
// col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
|
|
186
|
+
col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
|
|
187
|
+
}
|
|
188
|
+
return DG.SEMTYPE.MACROMOLECULE;
|
|
169
189
|
}
|
|
170
|
-
return DG.SEMTYPE.MACROMOLECULE;
|
|
171
190
|
}
|
|
191
|
+
} finally {
|
|
192
|
+
const t2 = Date.now();
|
|
193
|
+
console.debug('Bio: detectMacromolecule() ' + `ET = ${t2 - t1} ms.`);
|
|
172
194
|
}
|
|
173
195
|
}
|
|
174
196
|
|
|
175
197
|
/** Detects the most frequent char with a rate of at least 0.15 of others in sum.
|
|
176
198
|
* Does not use any splitting strategies, estimates just by single characters.
|
|
177
199
|
* */
|
|
178
|
-
|
|
200
|
+
detectSeparator(freq) {
|
|
179
201
|
// To detect a separator we analyse col's sequences character frequencies.
|
|
180
202
|
// If there is an exceptionally frequent symbol, then we will call it the separator.
|
|
181
203
|
// The most frequent symbol should occur with a rate of at least 0.15
|
|
@@ -190,8 +212,8 @@ class BioPackageDetectors extends DG.Package {
|
|
|
190
212
|
const cleanFreq = Object.assign({}, ...Object.entries(freq)
|
|
191
213
|
.filter(([m, f]) =>
|
|
192
214
|
!noSeparatorChemRe.test(m) && !noSeparatorAlphaDigitRe.test(m) && !noSeparatorBracketsRe.test(m) &&
|
|
193
|
-
!
|
|
194
|
-
!
|
|
215
|
+
!this.PeptideFastaAlphabet.has(m) &&
|
|
216
|
+
!this.DnaFastaAlphabet.has(m))
|
|
195
217
|
.map(([m, f]) => ({[m]: f})));
|
|
196
218
|
if (Object.keys(cleanFreq).length == 0) return null;
|
|
197
219
|
|
|
@@ -208,24 +230,24 @@ class BioPackageDetectors extends DG.Package {
|
|
|
208
230
|
/** With a separator, spaces are nor allowed in monomer names.
|
|
209
231
|
* The monomer name/label cannot contain digits only.
|
|
210
232
|
*/
|
|
211
|
-
|
|
233
|
+
checkForbiddenWithSeparators(freq) {
|
|
212
234
|
const forbiddenRe = /[ ]|^\d+$/i;
|
|
213
235
|
return Object.keys(freq).filter((m) => forbiddenRe.test(m)).length > 0;
|
|
214
236
|
}
|
|
215
237
|
|
|
216
238
|
// /** Without a separator, special symbols or digits are not allowed as monomers. */
|
|
217
|
-
//
|
|
239
|
+
// checkForbiddenWoSeparator(freq) {
|
|
218
240
|
// const forbiddenRe = /[\d!@#$%^&*()_+\-=\[\]{};':"\\|,.<>\/?]/i;
|
|
219
241
|
// return Object.keys(freq).filter((m) => forbiddenRe.test(m)).length > 0;
|
|
220
242
|
// }
|
|
221
243
|
|
|
222
244
|
/** Stats of sequences with specified splitter func, returns { freq, sameLength } */
|
|
223
|
-
|
|
245
|
+
getStats(values, minLength, splitter) {
|
|
224
246
|
const freq = {};
|
|
225
247
|
let sameLength = true;
|
|
226
248
|
let firstLength = null;
|
|
227
249
|
|
|
228
|
-
for (const seq of
|
|
250
|
+
for (const seq of values) {
|
|
229
251
|
const mSeq = splitter(seq);
|
|
230
252
|
|
|
231
253
|
if (firstLength == null) {
|
|
@@ -250,9 +272,9 @@ class BioPackageDetectors extends DG.Package {
|
|
|
250
272
|
* @param freq frequencies of monomers in sequence set
|
|
251
273
|
* @param candidates an array of pairs [name, monomer set]
|
|
252
274
|
* */
|
|
253
|
-
|
|
275
|
+
detectAlphabet(freq, candidates, gapSymbol) {
|
|
254
276
|
const candidatesSims = candidates.map((c) => {
|
|
255
|
-
const sim =
|
|
277
|
+
const sim = this.getAlphabetSimilarity(freq, c[1], gapSymbol);
|
|
256
278
|
return [c[0], c[1], c[2], freq, sim];
|
|
257
279
|
});
|
|
258
280
|
|
|
@@ -267,7 +289,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
267
289
|
return alphabetName;
|
|
268
290
|
}
|
|
269
291
|
|
|
270
|
-
|
|
292
|
+
getAlphabetSimilarity(freq, alphabet, gapSymbol) {
|
|
271
293
|
const keys = new Set([...new Set(Object.keys(freq)), ...alphabet]);
|
|
272
294
|
keys.delete(gapSymbol);
|
|
273
295
|
|
|
@@ -278,11 +300,11 @@ class BioPackageDetectors extends DG.Package {
|
|
|
278
300
|
alphabetA.push(alphabet.has(m) ? 10 : -20 /* penalty for character outside alphabet set*/);
|
|
279
301
|
}
|
|
280
302
|
/* There were a few ideas: chi-squared, pearson correlation (variance?), scalar product */
|
|
281
|
-
const cos =
|
|
303
|
+
const cos = this.vectorDotProduct(freqA, alphabetA) / (this.vectorLength(freqA) * this.vectorLength(alphabetA));
|
|
282
304
|
return cos;
|
|
283
305
|
}
|
|
284
306
|
|
|
285
|
-
|
|
307
|
+
vectorLength(v) {
|
|
286
308
|
let sqrSum = 0;
|
|
287
309
|
for (let i = 0; i < v.length; i++) {
|
|
288
310
|
sqrSum += v[i] * v[i];
|
|
@@ -290,7 +312,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
290
312
|
return Math.sqrt(sqrSum);
|
|
291
313
|
}
|
|
292
314
|
|
|
293
|
-
|
|
315
|
+
vectorDotProduct(v1, v2) {
|
|
294
316
|
if (v1.length != v2.length) {
|
|
295
317
|
throw Error('The dimensionality of the vectors must match');
|
|
296
318
|
}
|
|
@@ -302,62 +324,99 @@ class BioPackageDetectors extends DG.Package {
|
|
|
302
324
|
}
|
|
303
325
|
|
|
304
326
|
/** For trivial checks split by single chars*/
|
|
305
|
-
|
|
306
|
-
return seq
|
|
327
|
+
getSplitterAsChars(lengthLimit) {
|
|
328
|
+
return function(seq) {
|
|
329
|
+
return seq.split('', lengthLimit);
|
|
330
|
+
}.bind(this);
|
|
307
331
|
}
|
|
308
332
|
|
|
309
|
-
|
|
333
|
+
getSplitterWithSeparator(separator, lengthLimit) {
|
|
310
334
|
return function(seq) {
|
|
311
|
-
|
|
312
|
-
|
|
335
|
+
// if (!!lengthLimit) {
|
|
336
|
+
// const res = new Array(lengthLimit);
|
|
337
|
+
// let pos = 0, count = 0;
|
|
338
|
+
// while (pos < seq.length && count < lengthLimit) {
|
|
339
|
+
// const newPos = seq.indexOf(separator, pos);
|
|
340
|
+
// res[count] = seq.substring(pos, newPos);
|
|
341
|
+
// count++;
|
|
342
|
+
// pos = newPos;
|
|
343
|
+
// }
|
|
344
|
+
//
|
|
345
|
+
// return res.slice(0, count);
|
|
346
|
+
// } else {
|
|
347
|
+
return seq.split(separator, lengthLimit);
|
|
348
|
+
// }
|
|
349
|
+
}.bind(this);
|
|
313
350
|
}
|
|
314
351
|
|
|
315
352
|
// Multichar monomer names in square brackets, single char monomers or gap symbol
|
|
316
|
-
|
|
353
|
+
monomerRe = /\[(\w+)\]|(\w)|(-)/g;
|
|
317
354
|
|
|
318
355
|
/** Split sequence for single character monomers, square brackets multichar monomer names or gap symbol. */
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
356
|
+
getSplitterAsFasta(lengthLimit) {
|
|
357
|
+
return function(seq) {
|
|
358
|
+
const res = wu(seq.toString().matchAll(this.monomerRe))
|
|
359
|
+
.take(lengthLimit)
|
|
360
|
+
.map((ma) => {
|
|
361
|
+
let mRes;
|
|
362
|
+
const m = ma[0];
|
|
363
|
+
if (m.length > 1) {
|
|
364
|
+
mRes = ma[1];
|
|
365
|
+
} else {
|
|
366
|
+
mRes = m;
|
|
367
|
+
}
|
|
368
|
+
return mRes;
|
|
369
|
+
}).toArray();
|
|
330
370
|
|
|
331
|
-
|
|
371
|
+
return res;
|
|
372
|
+
}.bind(this);
|
|
332
373
|
}
|
|
333
374
|
|
|
334
375
|
/** Only some of the synonyms. These were obtained from the clustered oligopeptide dataset. */
|
|
335
|
-
|
|
376
|
+
aaSynonyms = {
|
|
336
377
|
'[MeNle]': 'L', // Nle - norleucine
|
|
337
378
|
'[MeA]': 'A', '[MeG]': 'G', '[MeF]': 'F',
|
|
338
379
|
};
|
|
339
380
|
|
|
340
|
-
|
|
341
|
-
|
|
381
|
+
helmRe = /(PEPTIDE1|DNA1|RNA1)\{([^}]+)}/g;
|
|
382
|
+
helmPp1Re = /\[([^\[\]]+)]/g;
|
|
342
383
|
|
|
343
384
|
/** Splits Helm string to monomers, but does not replace monomer names to other notation (e.g. for RNA). */
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
385
|
+
getSplitterAsHelm(lengthLimit) {
|
|
386
|
+
return function(seq) {
|
|
387
|
+
this.helmRe.lastIndex = 0;
|
|
388
|
+
const ea = this.helmRe.exec(seq.toString());
|
|
389
|
+
const inSeq = ea ? ea[2] : null;
|
|
390
|
+
|
|
391
|
+
const mmPostProcess = (mm) => {
|
|
392
|
+
this.helmPp1Re.lastIndex = 0;
|
|
393
|
+
const pp1M = this.helmPp1Re.exec(mm);
|
|
394
|
+
if (pp1M && pp1M.length >= 2) {
|
|
395
|
+
return pp1M[1];
|
|
396
|
+
} else {
|
|
397
|
+
return mm;
|
|
398
|
+
}
|
|
399
|
+
};
|
|
400
|
+
|
|
401
|
+
const mmList = inSeq ? inSeq.split('.') : [];
|
|
402
|
+
const mmListRes = mmList.map(mmPostProcess);
|
|
403
|
+
return mmListRes;
|
|
404
|
+
}.bind(this);
|
|
405
|
+
}
|
|
406
|
+
|
|
407
|
+
sample(src, n) {
|
|
408
|
+
if (src.length < n) {
|
|
409
|
+
throw new Error('Sample source is less than n requested.');
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
const idxSet = new Set();
|
|
413
|
+
while (idxSet.size < n) {
|
|
414
|
+
const idx = Math.floor(Math.random() * src.length);
|
|
415
|
+
if (!idxSet.has(idx)) {
|
|
416
|
+
idxSet.add(idx);
|
|
356
417
|
}
|
|
357
|
-
}
|
|
418
|
+
}
|
|
358
419
|
|
|
359
|
-
|
|
360
|
-
const mmListRes = mmList.map(mmPostProcess);
|
|
361
|
-
return mmListRes;
|
|
420
|
+
return [...idxSet].map((idx) => src[idx]);
|
|
362
421
|
}
|
|
363
422
|
}
|