@datagrok/bio 2.1.4 → 2.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/detectors.js +191 -158
- package/dist/package-test.js +281 -152
- package/dist/package.js +38 -19
- package/files/tests/SPGI-derived.csv +320 -0
- package/package.json +3 -3
- package/scripts/generate_fasta_csv_for_alphabets.R +6 -9
- package/src/__jest__/remote.test.ts +13 -7
- package/src/package-test.ts +4 -3
- package/src/package.ts +29 -21
- package/src/tests/{convert-test.ts → converters-test.ts} +0 -0
- package/src/tests/detectors-benchmark-tests.ts +165 -0
- package/src/tests/{detectors-test.ts → detectors-tests.ts} +19 -1
- package/src/tests/renderers-test.ts +1 -6
- package/src/tests/splitters-test.ts +0 -5
- package/src/tests/{substructure-filter-tests.ts → substructure-filters-tests.ts} +1 -1
- package/src/tests/{test-sequnces-generators.ts → utils/sequences-generators.ts} +0 -0
- package/{test-Bio-62cc009524f3-db2d0836.html → test-Bio-62cc009524f3-9c526574.html} +111 -107
package/detectors.js
CHANGED
|
@@ -8,7 +8,8 @@
|
|
|
8
8
|
* TODO: Use detectors from WebLogo pickUp.. methods
|
|
9
9
|
*/
|
|
10
10
|
|
|
11
|
-
const
|
|
11
|
+
const SEQ_SAMPLE_LIMIT = 100;
|
|
12
|
+
const SEQ_SAMPLE_LENGTH_LIMIT = 500;
|
|
12
13
|
|
|
13
14
|
/** enum type to simplify setting "user-friendly" notation if necessary */
|
|
14
15
|
const NOTATION = {
|
|
@@ -30,35 +31,37 @@ const ALIGNMENT = {
|
|
|
30
31
|
};
|
|
31
32
|
|
|
32
33
|
/** Class for handling notation units in Macromolecule columns */
|
|
33
|
-
|
|
34
|
-
|
|
34
|
+
const UnitsHandler = {
|
|
35
|
+
TAGS: {
|
|
35
36
|
aligned: 'aligned',
|
|
36
37
|
alphabet: 'alphabet',
|
|
37
38
|
alphabetSize: '.alphabetSize',
|
|
38
39
|
alphabetIsMultichar: '.alphabetIsMultichar',
|
|
39
40
|
separator: 'separator',
|
|
40
|
-
}
|
|
41
|
-
}
|
|
41
|
+
},
|
|
42
|
+
};
|
|
43
|
+
|
|
44
|
+
const isUrlRe = /[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)?/i;
|
|
42
45
|
|
|
43
46
|
class BioPackageDetectors extends DG.Package {
|
|
44
47
|
|
|
45
|
-
|
|
48
|
+
PeptideFastaAlphabet = new Set([
|
|
46
49
|
'G', 'L', 'Y', 'S', 'E', 'Q', 'D', 'N', 'F', 'A',
|
|
47
50
|
'K', 'R', 'H', 'C', 'V', 'P', 'W', 'I', 'M', 'T',
|
|
48
51
|
'MeNle', 'MeA', 'MeG', 'MeF',
|
|
49
52
|
]);
|
|
50
53
|
|
|
51
|
-
|
|
54
|
+
DnaFastaAlphabet = new Set(['A', 'C', 'G', 'T']);
|
|
52
55
|
|
|
53
|
-
|
|
56
|
+
RnaFastaAlphabet = new Set(['A', 'C', 'G', 'U']);
|
|
54
57
|
|
|
55
|
-
|
|
58
|
+
// Characters that may occur in SMILES strings; used as a decoy alphabet so that
// chemical notations are not mistaken for macromolecule sequences.
// The literal must not contain empty slots (", ,"): an array hole would add
// `undefined` as a member of the Set.
SmilesRawAlphabet = new Set([
  'A', 'B', 'C', 'E', 'F', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'Z',
  'a', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'r', 's', 't', 'u',
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
  '+', '-', '.', '/', '\\', '@', '[', ']', '(', ')', '#', '%', '=']);
|
|
60
63
|
|
|
61
|
-
|
|
64
|
+
SmartsRawAlphabet = new Set([
|
|
62
65
|
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
|
|
63
66
|
'!', '#', '$', '&', '(', ')', '*', '+', ',', '-', '.', ':', ';', '=', '@', '~', '[', ']',
|
|
64
67
|
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
|
|
@@ -69,7 +72,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
69
72
|
|
|
70
73
|
/** @param s {String} - string to check
|
|
71
74
|
* @returns {boolean} */
|
|
72
|
-
|
|
75
|
+
isHelm(s) {
|
|
73
76
|
return s.startsWith('PEPTIDE1{') || s.startsWith('CHEM1{') || s.startsWith('BLOB1{') ||
|
|
74
77
|
s.startsWith('RNA1{') || s.startsWith('DNA1{');
|
|
75
78
|
}
|
|
@@ -78,114 +81,123 @@ class BioPackageDetectors extends DG.Package {
|
|
|
78
81
|
//input: column col
|
|
79
82
|
//output: string semType
|
|
80
83
|
detectMacromolecule(col) {
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
84
|
+
const t1 = Date.now();
|
|
85
|
+
try {
|
|
86
|
+
// Fail early
|
|
87
|
+
if (col.type !== DG.TYPE.STRING) return null;
|
|
88
|
+
|
|
89
|
+
const categoriesSample = col.categories.length < SEQ_SAMPLE_LIMIT ? col.categories :
|
|
90
|
+
this.sample(col.categories, SEQ_SAMPLE_LIMIT);
|
|
91
|
+
|
|
92
|
+
// To collect alphabet freq three strategies can be used:
|
|
93
|
+
// as chars, as fasta (single or within square brackets), as with the separator.
|
|
94
|
+
if (
|
|
95
|
+
!(col.categories.length == 1 && !col.categories[0]) && // TODO: Remove with tests for single empty category value
|
|
96
|
+
DG.Detector.sampleCategories(col, (s) => this.isHelm(s), 1, SEQ_SAMPLE_LIMIT)
|
|
97
|
+
) {
|
|
98
|
+
const statsAsHelm = this.getStats(categoriesSample, 2,
|
|
99
|
+
this.getSplitterAsHelm(SEQ_SAMPLE_LENGTH_LIMIT));
|
|
100
|
+
col.setTag(DG.TAGS.UNITS, NOTATION.HELM);
|
|
101
|
+
|
|
102
|
+
// alphabetSize calculated on (sub)sample of data is incorrect
|
|
103
|
+
// const alphabetSize = Object.keys(statsAsHelm.freq).length;
|
|
104
|
+
const alphabetIsMultichar = Object.keys(statsAsHelm.freq).some((m) => m.length > 1);
|
|
105
|
+
// col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
|
|
106
|
+
col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
|
|
104
107
|
|
|
105
|
-
const decoyAlphabets = [
|
|
106
|
-
['SMILES', BioPackageDetectors.SmilesRawAlphabet, 0.30],
|
|
107
|
-
['SMARTS', BioPackageDetectors.SmartsRawAlphabet, 0.45],
|
|
108
|
-
];
|
|
109
|
-
|
|
110
|
-
const candidateAlphabets = [
|
|
111
|
-
[ALPHABET.PT, BioPackageDetectors.PeptideFastaAlphabet, 0.50],
|
|
112
|
-
[ALPHABET.DNA, BioPackageDetectors.DnaFastaAlphabet, 0.55],
|
|
113
|
-
[ALPHABET.RNA, BioPackageDetectors.RnaFastaAlphabet, 0.55],
|
|
114
|
-
];
|
|
115
|
-
|
|
116
|
-
// Check for url column, maybe it is too heavy check
|
|
117
|
-
const isUrlCheck = (s) => {
|
|
118
|
-
let res = true;
|
|
119
|
-
try {
|
|
120
|
-
const url = new URL(s);
|
|
121
|
-
res = true;
|
|
122
|
-
} catch {
|
|
123
|
-
res = false;
|
|
124
|
-
}
|
|
125
|
-
return res;
|
|
126
|
-
};
|
|
127
|
-
const isUrl = categoriesSample.every((v) => { return !v || isUrlCheck(v); });
|
|
128
|
-
if (isUrl) return null;
|
|
129
|
-
|
|
130
|
-
// TODO: Detect HELM sequence
|
|
131
|
-
// TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
|
|
132
|
-
const statsAsChars = BioPackageDetectors.getStats(categoriesSample, 5, BioPackageDetectors.splitterAsChars);
|
|
133
|
-
// if (Object.keys(statsAsChars.freq).length === 0) return;
|
|
134
|
-
|
|
135
|
-
const decoy = BioPackageDetectors.detectAlphabet(statsAsChars.freq, decoyAlphabets, null);
|
|
136
|
-
if (decoy != ALPHABET.UN) return null;
|
|
137
|
-
|
|
138
|
-
if (statsAsChars.sameLength) {
|
|
139
|
-
if (Object.keys(statsAsChars.freq).length > 0) { // require non empty alphabet
|
|
140
|
-
const alphabet = BioPackageDetectors.detectAlphabet(statsAsChars.freq, candidateAlphabets, '-');
|
|
141
|
-
if (alphabet === ALPHABET.UN) return null;
|
|
142
|
-
|
|
143
|
-
const units = NOTATION.FASTA;
|
|
144
|
-
col.setTag(DG.TAGS.UNITS, units);
|
|
145
|
-
col.setTag(UnitsHandler.TAGS.aligned, ALIGNMENT.SEQ_MSA);
|
|
146
|
-
col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
|
|
147
108
|
return DG.SEMTYPE.MACROMOLECULE;
|
|
148
109
|
}
|
|
149
|
-
|
|
150
|
-
const
|
|
110
|
+
|
|
111
|
+
const decoyAlphabets = [
|
|
112
|
+
['SMILES', this.SmilesRawAlphabet, 0.30],
|
|
113
|
+
['SMARTS', this.SmartsRawAlphabet, 0.43],
|
|
114
|
+
];
|
|
115
|
+
|
|
116
|
+
const candidateAlphabets = [
|
|
117
|
+
[ALPHABET.PT, this.PeptideFastaAlphabet, 0.50],
|
|
118
|
+
[ALPHABET.DNA, this.DnaFastaAlphabet, 0.55],
|
|
119
|
+
[ALPHABET.RNA, this.RnaFastaAlphabet, 0.55],
|
|
120
|
+
];
|
|
121
|
+
|
|
122
|
+
// Check for url column, maybe it is too heavy check
|
|
123
|
+
const isUrlCheck = (s) => {
|
|
124
|
+
let res = true;
|
|
125
|
+
try {
|
|
126
|
+
const url = new URL(s);
|
|
127
|
+
res = true;
|
|
128
|
+
} catch {
|
|
129
|
+
res = false;
|
|
130
|
+
}
|
|
131
|
+
return res;
|
|
132
|
+
// return isUrlRe.test(s);
|
|
133
|
+
};
|
|
134
|
+
const isUrl = categoriesSample.every((v) => { return !v || isUrlCheck(v); });
|
|
135
|
+
if (isUrl) return null;
|
|
136
|
+
|
|
137
|
+
// TODO: Detect HELM sequence
|
|
138
|
+
// TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
|
|
139
|
+
const statsAsChars = this.getStats(categoriesSample, 5,
|
|
140
|
+
this.getSplitterAsChars(SEQ_SAMPLE_LENGTH_LIMIT));
|
|
141
|
+
// if (Object.keys(statsAsChars.freq).length === 0) return;
|
|
142
|
+
|
|
143
|
+
const decoy = this.detectAlphabet(statsAsChars.freq, decoyAlphabets, null);
|
|
144
|
+
if (decoy != ALPHABET.UN) return null;
|
|
145
|
+
|
|
146
|
+
const separator = this.detectSeparator(statsAsChars.freq);
|
|
147
|
+
const units = separator ? NOTATION.SEPARATOR : NOTATION.FASTA;
|
|
151
148
|
const gapSymbol = separator ? '' : '-';
|
|
152
|
-
const splitter = separator ?
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
if (
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
149
|
+
const splitter = separator ? this.getSplitterWithSeparator(separator, SEQ_SAMPLE_LENGTH_LIMIT) :
|
|
150
|
+
this.getSplitterAsFasta(SEQ_SAMPLE_LENGTH_LIMIT);
|
|
151
|
+
|
|
152
|
+
col.setTag(DG.TAGS.UNITS, units);
|
|
153
|
+
if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
|
|
154
|
+
|
|
155
|
+
if (statsAsChars.sameLength) {
|
|
156
|
+
if (Object.keys(statsAsChars.freq).length > 0) { // require non empty alphabet
|
|
157
|
+
const stats = this.getStats(categoriesSample, 5, splitter);
|
|
158
|
+
const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, '-');
|
|
159
|
+
if (alphabet === ALPHABET.UN) return null;
|
|
160
|
+
|
|
161
|
+
col.setTag(UnitsHandler.TAGS.aligned, ALIGNMENT.SEQ_MSA);
|
|
162
|
+
col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
|
|
163
|
+
return DG.SEMTYPE.MACROMOLECULE;
|
|
164
|
+
}
|
|
165
|
+
} else {
|
|
166
|
+
const stats = this.getStats(categoriesSample, 5, splitter);
|
|
167
|
+
// Empty monomer alphabet is not allowed
|
|
168
|
+
if (Object.keys(stats.freq).length === 0) return null;
|
|
169
|
+
// Long monomer names for sequences with separators have constraints
|
|
170
|
+
if (separator && this.checkForbiddenWithSeparators(stats.freq)) return null;
|
|
171
|
+
|
|
172
|
+
const aligned = stats.sameLength ? ALIGNMENT.SEQ_MSA : ALIGNMENT.SEQ;
|
|
173
|
+
|
|
174
|
+
// TODO: If separator detected, then extra efforts to detect alphabet are allowed.
|
|
175
|
+
const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol);
|
|
176
|
+
|
|
177
|
+
// const forbidden = this.checkForbiddenWoSeparator(stats.freq);
|
|
178
|
+
if (separator || alphabet != 'UN') {
|
|
179
|
+
col.setTag(UnitsHandler.TAGS.aligned, aligned);
|
|
180
|
+
col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
|
|
181
|
+
if (alphabet === ALPHABET.UN) {
|
|
182
|
+
// alphabetSize calculated on (sub)sample of data is incorrect
|
|
183
|
+
// const alphabetSize = Object.keys(stats.freq).length;
|
|
184
|
+
const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
|
|
185
|
+
// col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
|
|
186
|
+
col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
|
|
187
|
+
}
|
|
188
|
+
return DG.SEMTYPE.MACROMOLECULE;
|
|
179
189
|
}
|
|
180
|
-
return DG.SEMTYPE.MACROMOLECULE;
|
|
181
190
|
}
|
|
191
|
+
} finally {
|
|
192
|
+
const t2 = Date.now();
|
|
193
|
+
console.debug('Bio: detectMacromolecule() ' + `ET = ${t2 - t1} ms.`);
|
|
182
194
|
}
|
|
183
195
|
}
|
|
184
196
|
|
|
185
197
|
/** Detects the most frequent char with a rate of at least 0.15 of others in sum.
|
|
186
198
|
* Does not use any splitting strategies, estimates just by single characters.
|
|
187
199
|
* */
|
|
188
|
-
|
|
200
|
+
detectSeparator(freq) {
|
|
189
201
|
// To detect a separator we analyse col's sequences character frequencies.
|
|
190
202
|
// If there is an exceptionally frequent symbol, then we will call it the separator.
|
|
191
203
|
// The most frequent symbol should occur with a rate of at least 0.15
|
|
@@ -195,13 +207,13 @@ class BioPackageDetectors extends DG.Package {
|
|
|
195
207
|
// !!! What is the difference between the gap symbol and separator symbol in stats terms?
|
|
196
208
|
// const noSeparatorRe = /[a-z\d]+$/i;
|
|
197
209
|
const noSeparatorChemRe = /[HBCNOFPSKVYI]/i; // Mendeleev's periodic table single char elements
|
|
198
|
-
const noSeparatorAlphaDigitRe = /[\dA-Z,& _]/i; // ..., comma, ampersand, space, underscore
|
|
210
|
+
const noSeparatorAlphaDigitRe = /[\dA-Z,& _\r\n]/i; // ..., comma, ampersand, space, underscore, CR, LF
|
|
199
211
|
const noSeparatorBracketsRe = /[\[\]()<>{}]/i;
|
|
200
212
|
const cleanFreq = Object.assign({}, ...Object.entries(freq)
|
|
201
213
|
.filter(([m, f]) =>
|
|
202
214
|
!noSeparatorChemRe.test(m) && !noSeparatorAlphaDigitRe.test(m) && !noSeparatorBracketsRe.test(m) &&
|
|
203
|
-
!
|
|
204
|
-
!
|
|
215
|
+
!this.PeptideFastaAlphabet.has(m) &&
|
|
216
|
+
!this.DnaFastaAlphabet.has(m))
|
|
205
217
|
.map(([m, f]) => ({[m]: f})));
|
|
206
218
|
if (Object.keys(cleanFreq).length == 0) return null;
|
|
207
219
|
|
|
@@ -218,19 +230,19 @@ class BioPackageDetectors extends DG.Package {
|
|
|
218
230
|
/** With a separator, spaces are nor allowed in monomer names.
|
|
219
231
|
* The monomer name/label cannot contain digits only.
|
|
220
232
|
*/
|
|
221
|
-
|
|
233
|
+
checkForbiddenWithSeparators(freq) {
|
|
222
234
|
const forbiddenRe = /[ ]|^\d+$/i;
|
|
223
235
|
return Object.keys(freq).filter((m) => forbiddenRe.test(m)).length > 0;
|
|
224
236
|
}
|
|
225
237
|
|
|
226
238
|
// /** Without a separator, special symbols or digits are not allowed as monomers. */
|
|
227
|
-
//
|
|
239
|
+
// checkForbiddenWoSeparator(freq) {
|
|
228
240
|
// const forbiddenRe = /[\d!@#$%^&*()_+\-=\[\]{};':"\\|,.<>\/?]/i;
|
|
229
241
|
// return Object.keys(freq).filter((m) => forbiddenRe.test(m)).length > 0;
|
|
230
242
|
// }
|
|
231
243
|
|
|
232
244
|
/** Stats of sequences with specified splitter func, returns { freq, sameLength } */
|
|
233
|
-
|
|
245
|
+
getStats(values, minLength, splitter) {
|
|
234
246
|
const freq = {};
|
|
235
247
|
let sameLength = true;
|
|
236
248
|
let firstLength = null;
|
|
@@ -260,9 +272,9 @@ class BioPackageDetectors extends DG.Package {
|
|
|
260
272
|
* @param freq frequencies of monomers in sequence set
|
|
261
273
|
* @param candidates an array of pairs [name, monomer set]
|
|
262
274
|
* */
|
|
263
|
-
|
|
275
|
+
detectAlphabet(freq, candidates, gapSymbol) {
|
|
264
276
|
const candidatesSims = candidates.map((c) => {
|
|
265
|
-
const sim =
|
|
277
|
+
const sim = this.getAlphabetSimilarity(freq, c[1], gapSymbol);
|
|
266
278
|
return [c[0], c[1], c[2], freq, sim];
|
|
267
279
|
});
|
|
268
280
|
|
|
@@ -277,7 +289,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
277
289
|
return alphabetName;
|
|
278
290
|
}
|
|
279
291
|
|
|
280
|
-
|
|
292
|
+
getAlphabetSimilarity(freq, alphabet, gapSymbol) {
|
|
281
293
|
const keys = new Set([...new Set(Object.keys(freq)), ...alphabet]);
|
|
282
294
|
keys.delete(gapSymbol);
|
|
283
295
|
|
|
@@ -288,11 +300,11 @@ class BioPackageDetectors extends DG.Package {
|
|
|
288
300
|
alphabetA.push(alphabet.has(m) ? 10 : -20 /* penalty for character outside alphabet set*/);
|
|
289
301
|
}
|
|
290
302
|
/* There were a few ideas: chi-squared, pearson correlation (variance?), scalar product */
|
|
291
|
-
const cos =
|
|
303
|
+
const cos = this.vectorDotProduct(freqA, alphabetA) / (this.vectorLength(freqA) * this.vectorLength(alphabetA));
|
|
292
304
|
return cos;
|
|
293
305
|
}
|
|
294
306
|
|
|
295
|
-
|
|
307
|
+
vectorLength(v) {
|
|
296
308
|
let sqrSum = 0;
|
|
297
309
|
for (let i = 0; i < v.length; i++) {
|
|
298
310
|
sqrSum += v[i] * v[i];
|
|
@@ -300,7 +312,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
300
312
|
return Math.sqrt(sqrSum);
|
|
301
313
|
}
|
|
302
314
|
|
|
303
|
-
|
|
315
|
+
vectorDotProduct(v1, v2) {
|
|
304
316
|
if (v1.length != v2.length) {
|
|
305
317
|
throw Error('The dimensionality of the vectors must match');
|
|
306
318
|
}
|
|
@@ -312,66 +324,87 @@ class BioPackageDetectors extends DG.Package {
|
|
|
312
324
|
}
|
|
313
325
|
|
|
314
326
|
/** For trivial checks split by single chars*/
|
|
315
|
-
|
|
316
|
-
return seq
|
|
327
|
+
getSplitterAsChars(lengthLimit) {
|
|
328
|
+
return function(seq) {
|
|
329
|
+
return seq.split('', lengthLimit);
|
|
330
|
+
}.bind(this);
|
|
317
331
|
}
|
|
318
332
|
|
|
319
|
-
|
|
333
|
+
getSplitterWithSeparator(separator, lengthLimit) {
|
|
320
334
|
return function(seq) {
|
|
321
|
-
|
|
322
|
-
|
|
335
|
+
// if (!!lengthLimit) {
|
|
336
|
+
// const res = new Array(lengthLimit);
|
|
337
|
+
// let pos = 0, count = 0;
|
|
338
|
+
// while (pos < seq.length && count < lengthLimit) {
|
|
339
|
+
// const newPos = seq.indexOf(separator, pos);
|
|
340
|
+
// res[count] = seq.substring(pos, newPos);
|
|
341
|
+
// count++;
|
|
342
|
+
// pos = newPos;
|
|
343
|
+
// }
|
|
344
|
+
//
|
|
345
|
+
// return res.slice(0, count);
|
|
346
|
+
// } else {
|
|
347
|
+
return seq.split(separator, lengthLimit);
|
|
348
|
+
// }
|
|
349
|
+
}.bind(this);
|
|
323
350
|
}
|
|
324
351
|
|
|
325
352
|
// Multichar monomer names in square brackets, single char monomers or gap symbol
|
|
326
|
-
|
|
353
|
+
monomerRe = /\[(\w+)\]|(\w)|(-)/g;
|
|
327
354
|
|
|
328
355
|
/** Split sequence for single character monomers, square brackets multichar monomer names or gap symbol. */
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
356
|
+
getSplitterAsFasta(lengthLimit) {
|
|
357
|
+
return function(seq) {
|
|
358
|
+
const res = wu(seq.toString().matchAll(this.monomerRe))
|
|
359
|
+
.take(lengthLimit)
|
|
360
|
+
.map((ma) => {
|
|
361
|
+
let mRes;
|
|
362
|
+
const m = ma[0];
|
|
363
|
+
if (m.length > 1) {
|
|
364
|
+
mRes = ma[1];
|
|
365
|
+
} else {
|
|
366
|
+
mRes = m;
|
|
367
|
+
}
|
|
368
|
+
return mRes;
|
|
369
|
+
}).toArray();
|
|
340
370
|
|
|
341
|
-
|
|
371
|
+
return res;
|
|
372
|
+
}.bind(this);
|
|
342
373
|
}
|
|
343
374
|
|
|
344
375
|
/** Only some of the synonyms. These were obtained from the clustered oligopeptide dataset. */
|
|
345
|
-
|
|
376
|
+
aaSynonyms = {
|
|
346
377
|
'[MeNle]': 'L', // Nle - norleucine
|
|
347
378
|
'[MeA]': 'A', '[MeG]': 'G', '[MeF]': 'F',
|
|
348
379
|
};
|
|
349
380
|
|
|
350
|
-
|
|
351
|
-
|
|
381
|
+
helmRe = /(PEPTIDE1|DNA1|RNA1)\{([^}]+)}/g;
|
|
382
|
+
helmPp1Re = /\[([^\[\]]+)]/g;
|
|
352
383
|
|
|
353
384
|
/** Splits Helm string to monomers, but does not replace monomer names to other notation (e.g. for RNA). */
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
385
|
+
getSplitterAsHelm(lengthLimit) {
|
|
386
|
+
return function(seq) {
|
|
387
|
+
this.helmRe.lastIndex = 0;
|
|
388
|
+
const ea = this.helmRe.exec(seq.toString());
|
|
389
|
+
const inSeq = ea ? ea[2] : null;
|
|
390
|
+
|
|
391
|
+
const mmPostProcess = (mm) => {
|
|
392
|
+
this.helmPp1Re.lastIndex = 0;
|
|
393
|
+
const pp1M = this.helmPp1Re.exec(mm);
|
|
394
|
+
if (pp1M && pp1M.length >= 2) {
|
|
395
|
+
return pp1M[1];
|
|
396
|
+
} else {
|
|
397
|
+
return mm;
|
|
398
|
+
}
|
|
399
|
+
};
|
|
368
400
|
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
401
|
+
const mmList = inSeq ? inSeq.split('.') : [];
|
|
402
|
+
const mmListRes = mmList.map(mmPostProcess);
|
|
403
|
+
return mmListRes;
|
|
404
|
+
}.bind(this);
|
|
372
405
|
}
|
|
373
406
|
|
|
374
|
-
|
|
407
|
+
sample(src, n) {
|
|
375
408
|
if (src.length < n) {
|
|
376
409
|
throw new Error('Sample source is less than n requested.');
|
|
377
410
|
}
|