@datagrok/bio 2.1.2 → 2.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/detectors.js CHANGED
@@ -8,6 +8,9 @@
8
8
  * TODO: Use detectors from WebLogo pickUp.. methods
9
9
  */
10
10
 
11
+ const SEQ_SAMPLE_LIMIT = 100;
12
+ const SEQ_SAMPLE_LENGTH_LIMIT = 500;
13
+
11
14
  /** enum type to simplify setting "user-friendly" notation if necessary */
12
15
  const NOTATION = {
13
16
  FASTA: 'fasta',
@@ -28,35 +31,37 @@ const ALIGNMENT = {
28
31
  };
29
32
 
30
33
  /** Class for handling notation units in Macromolecule columns */
31
- class UnitsHandler {
32
- static TAGS = {
34
+ const UnitsHandler = {
35
+ TAGS: {
33
36
  aligned: 'aligned',
34
37
  alphabet: 'alphabet',
35
38
  alphabetSize: '.alphabetSize',
36
39
  alphabetIsMultichar: '.alphabetIsMultichar',
37
40
  separator: 'separator',
38
- };
39
- }
41
+ },
42
+ };
43
+
44
+ const isUrlRe = /[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)?/i;
40
45
 
41
46
  class BioPackageDetectors extends DG.Package {
42
47
 
43
- static PeptideFastaAlphabet = new Set([
48
+ PeptideFastaAlphabet = new Set([
44
49
  'G', 'L', 'Y', 'S', 'E', 'Q', 'D', 'N', 'F', 'A',
45
50
  'K', 'R', 'H', 'C', 'V', 'P', 'W', 'I', 'M', 'T',
46
51
  'MeNle', 'MeA', 'MeG', 'MeF',
47
52
  ]);
48
53
 
49
- static DnaFastaAlphabet = new Set(['A', 'C', 'G', 'T']);
54
+ DnaFastaAlphabet = new Set(['A', 'C', 'G', 'T']);
50
55
 
51
- static RnaFastaAlphabet = new Set(['A', 'C', 'G', 'U']);
56
+ RnaFastaAlphabet = new Set(['A', 'C', 'G', 'U']);
52
57
 
53
- static SmilesRawAlphabet = new Set([
58
+ SmilesRawAlphabet = new Set([
54
59
  'A', 'B', 'C', 'E', 'F', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'Z',
55
60
  'a', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'r', 's', 't', 'u',
56
61
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
57
62
  '+', '-', '.', , '/', '\\', '@', '[', ']', '(', ')', '#', '%', '=']);
58
63
 
59
- static SmartsRawAlphabet = new Set([
64
+ SmartsRawAlphabet = new Set([
60
65
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
61
66
  '!', '#', '$', '&', '(', ')', '*', '+', ',', '-', '.', ':', ';', '=', '@', '~', '[', ']',
62
67
  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
@@ -67,7 +72,7 @@ class BioPackageDetectors extends DG.Package {
67
72
 
68
73
  /** @param s {String} - string to check
69
74
  * @returns {boolean} */
70
- static isHelm(s) {
75
+ isHelm(s) {
71
76
  return s.startsWith('PEPTIDE1{') || s.startsWith('CHEM1{') || s.startsWith('BLOB1{') ||
72
77
  s.startsWith('RNA1{') || s.startsWith('DNA1{');
73
78
  }
@@ -76,106 +81,123 @@ class BioPackageDetectors extends DG.Package {
76
81
  //input: column col
77
82
  //output: string semType
78
83
  detectMacromolecule(col) {
79
- // To collect alphabet freq three strategies can be used:
80
- // as chars, as fasta (single or within square brackets), as with the separator.
81
- if (
82
- !(col.categories.length == 1 && !col.categories[0]) && // TODO: Remove with tests for single empty category value
83
- DG.Detector.sampleCategories(col, (s) => BioPackageDetectors.isHelm(s), 1)
84
- ) {
85
- const statsAsHelm = BioPackageDetectors.getStats(col, 2, BioPackageDetectors.splitterAsHelm);
86
- col.setTag(DG.TAGS.UNITS, NOTATION.HELM);
87
-
88
- const alphabetSize = Object.keys(statsAsHelm.freq).length;
89
- const alphabetIsMultichar = Object.keys(statsAsHelm.freq).some((m) => m.length > 1);
90
- col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
91
- col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
92
-
93
- return DG.SEMTYPE.MACROMOLECULE;
94
- }
84
+ const t1 = Date.now();
85
+ try {
86
+ // Fail early
87
+ if (col.type !== DG.TYPE.STRING) return null;
88
+
89
+ const categoriesSample = col.categories.length < SEQ_SAMPLE_LIMIT ? col.categories :
90
+ this.sample(col.categories, SEQ_SAMPLE_LIMIT);
91
+
92
+ // To collect alphabet freq three strategies can be used:
93
+ // as chars, as fasta (single or within square brackets), as with the separator.
94
+ if (
95
+ !(col.categories.length == 1 && !col.categories[0]) && // TODO: Remove with tests for single empty category value
96
+ DG.Detector.sampleCategories(col, (s) => this.isHelm(s), 1, SEQ_SAMPLE_LIMIT)
97
+ ) {
98
+ const statsAsHelm = this.getStats(categoriesSample, 2,
99
+ this.getSplitterAsHelm(SEQ_SAMPLE_LENGTH_LIMIT));
100
+ col.setTag(DG.TAGS.UNITS, NOTATION.HELM);
101
+
102
+ // alphabetSize calculated on (sub)sample of data is incorrect
103
+ // const alphabetSize = Object.keys(statsAsHelm.freq).length;
104
+ const alphabetIsMultichar = Object.keys(statsAsHelm.freq).some((m) => m.length > 1);
105
+ // col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
106
+ col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
95
107
 
96
- const decoyAlphabets = [
97
- ['SMILES', BioPackageDetectors.SmilesRawAlphabet, 0.30],
98
- ['SMARTS', BioPackageDetectors.SmartsRawAlphabet, 0.45],
99
- ];
100
-
101
- const candidateAlphabets = [
102
- [ALPHABET.PT, BioPackageDetectors.PeptideFastaAlphabet, 0.50],
103
- [ALPHABET.DNA, BioPackageDetectors.DnaFastaAlphabet, 0.55],
104
- [ALPHABET.RNA, BioPackageDetectors.RnaFastaAlphabet, 0.55],
105
- ];
106
-
107
- // Check for url column, maybe it is too heavy check
108
- const isUrlCheck = (s) => {
109
- let res = true;
110
- try {
111
- const url = new URL(s);
112
- res = true;
113
- } catch {
114
- res = false;
115
- }
116
- return res;
117
- };
118
- const isUrl = DG.Detector.sampleCategories(col, isUrlCheck, 1);
119
- if (isUrl) return null;
120
-
121
- // TODO: Detect HELM sequence
122
- // TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
123
- const statsAsChars = BioPackageDetectors.getStats(col, 5, BioPackageDetectors.splitterAsChars);
124
- // if (Object.keys(statsAsChars.freq).length === 0) return;
125
-
126
- const decoy = BioPackageDetectors.detectAlphabet(statsAsChars.freq, decoyAlphabets, null);
127
- if (decoy != ALPHABET.UN) return null;
128
-
129
- if (statsAsChars.sameLength) {
130
- if (Object.keys(statsAsChars.freq).length > 0) { // require non empty alphabet
131
- const alphabet = BioPackageDetectors.detectAlphabet(statsAsChars.freq, candidateAlphabets, '-');
132
- if (alphabet === ALPHABET.UN) return null;
133
-
134
- const units = NOTATION.FASTA;
135
- col.setTag(DG.TAGS.UNITS, units);
136
- col.setTag(UnitsHandler.TAGS.aligned, ALIGNMENT.SEQ_MSA);
137
- col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
138
108
  return DG.SEMTYPE.MACROMOLECULE;
139
109
  }
140
- } else {
141
- const separator = BioPackageDetectors.detectSeparator(statsAsChars.freq);
110
+
111
+ const decoyAlphabets = [
112
+ ['SMILES', this.SmilesRawAlphabet, 0.30],
113
+ ['SMARTS', this.SmartsRawAlphabet, 0.43],
114
+ ];
115
+
116
+ const candidateAlphabets = [
117
+ [ALPHABET.PT, this.PeptideFastaAlphabet, 0.50],
118
+ [ALPHABET.DNA, this.DnaFastaAlphabet, 0.55],
119
+ [ALPHABET.RNA, this.RnaFastaAlphabet, 0.55],
120
+ ];
121
+
122
+ // Check for url column, maybe it is too heavy check
123
+ const isUrlCheck = (s) => {
124
+ let res = true;
125
+ try {
126
+ const url = new URL(s);
127
+ res = true;
128
+ } catch {
129
+ res = false;
130
+ }
131
+ return res;
132
+ // return isUrlRe.test(s);
133
+ };
134
+ const isUrl = categoriesSample.every((v) => { return !v || isUrlCheck(v); });
135
+ if (isUrl) return null;
136
+
137
+ // TODO: Detect HELM sequence
138
+ // TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
139
+ const statsAsChars = this.getStats(categoriesSample, 5,
140
+ this.getSplitterAsChars(SEQ_SAMPLE_LENGTH_LIMIT));
141
+ // if (Object.keys(statsAsChars.freq).length === 0) return;
142
+
143
+ const decoy = this.detectAlphabet(statsAsChars.freq, decoyAlphabets, null);
144
+ if (decoy != ALPHABET.UN) return null;
145
+
146
+ const separator = this.detectSeparator(statsAsChars.freq);
147
+ const units = separator ? NOTATION.SEPARATOR : NOTATION.FASTA;
142
148
  const gapSymbol = separator ? '' : '-';
143
- const splitter = separator ? BioPackageDetectors.getSplitterWithSeparator(separator) :
144
- BioPackageDetectors.splitterAsFasta;
145
-
146
- const stats = BioPackageDetectors.getStats(col, 5, splitter);
147
- // Empty monomer alphabet is not allowed
148
- if (Object.keys(stats.freq).length === 0) return null;
149
- // Long monomer names for sequences with separators have constraints
150
- if (separator && BioPackageDetectors.checkForbiddenWithSeparators(stats.freq)) return null;
151
-
152
- const format = separator ? NOTATION.SEPARATOR : NOTATION.FASTA;
153
- const aligned = stats.sameLength ? ALIGNMENT.SEQ_MSA : ALIGNMENT.SEQ;
154
-
155
- // TODO: If separator detected, then extra efforts to detect alphabet are allowed.
156
- const alphabet = BioPackageDetectors.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol);
157
-
158
- // const forbidden = BioPackageDetectors.checkForbiddenWoSeparator(stats.freq);
159
- if (separator || alphabet != 'UN') {
160
- col.setTag(DG.TAGS.UNITS, format);
161
- col.setTag(UnitsHandler.TAGS.aligned, aligned);
162
- col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
163
- if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
164
- if (alphabet === ALPHABET.UN) {
165
- const alphabetSize = Object.keys(stats.freq).length;
166
- const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
167
- col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
168
- col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
149
+ const splitter = separator ? this.getSplitterWithSeparator(separator, SEQ_SAMPLE_LENGTH_LIMIT) :
150
+ this.getSplitterAsFasta(SEQ_SAMPLE_LENGTH_LIMIT);
151
+
152
+ col.setTag(DG.TAGS.UNITS, units);
153
+ if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
154
+
155
+ if (statsAsChars.sameLength) {
156
+ if (Object.keys(statsAsChars.freq).length > 0) { // require non empty alphabet
157
+ const stats = this.getStats(categoriesSample, 5, splitter);
158
+ const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, '-');
159
+ if (alphabet === ALPHABET.UN) return null;
160
+
161
+ col.setTag(UnitsHandler.TAGS.aligned, ALIGNMENT.SEQ_MSA);
162
+ col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
163
+ return DG.SEMTYPE.MACROMOLECULE;
164
+ }
165
+ } else {
166
+ const stats = this.getStats(categoriesSample, 5, splitter);
167
+ // Empty monomer alphabet is not allowed
168
+ if (Object.keys(stats.freq).length === 0) return null;
169
+ // Long monomer names for sequences with separators have constraints
170
+ if (separator && this.checkForbiddenWithSeparators(stats.freq)) return null;
171
+
172
+ const aligned = stats.sameLength ? ALIGNMENT.SEQ_MSA : ALIGNMENT.SEQ;
173
+
174
+ // TODO: If separator detected, then extra efforts to detect alphabet are allowed.
175
+ const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol);
176
+
177
+ // const forbidden = this.checkForbiddenWoSeparator(stats.freq);
178
+ if (separator || alphabet != 'UN') {
179
+ col.setTag(UnitsHandler.TAGS.aligned, aligned);
180
+ col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
181
+ if (alphabet === ALPHABET.UN) {
182
+ // alphabetSize calculated on (sub)sample of data is incorrect
183
+ // const alphabetSize = Object.keys(stats.freq).length;
184
+ const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
185
+ // col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
186
+ col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
187
+ }
188
+ return DG.SEMTYPE.MACROMOLECULE;
169
189
  }
170
- return DG.SEMTYPE.MACROMOLECULE;
171
190
  }
191
+ } finally {
192
+ const t2 = Date.now();
193
+ console.debug('Bio: detectMacromolecule() ' + `ET = ${t2 - t1} ms.`);
172
194
  }
173
195
  }
174
196
 
175
197
  /** Detects the most frequent char with a rate of at least 0.15 of others in sum.
176
198
  * Does not use any splitting strategies, estimates just by single characters.
177
199
  * */
178
- static detectSeparator(freq) {
200
+ detectSeparator(freq) {
179
201
  // To detect a separator we analyse col's sequences character frequencies.
180
202
  // If there is an exceptionally frequent symbol, then we will call it the separator.
181
203
  // The most frequent symbol should occur with a rate of at least 0.15
@@ -190,8 +212,8 @@ class BioPackageDetectors extends DG.Package {
190
212
  const cleanFreq = Object.assign({}, ...Object.entries(freq)
191
213
  .filter(([m, f]) =>
192
214
  !noSeparatorChemRe.test(m) && !noSeparatorAlphaDigitRe.test(m) && !noSeparatorBracketsRe.test(m) &&
193
- !BioPackageDetectors.PeptideFastaAlphabet.has(m) &&
194
- !BioPackageDetectors.DnaFastaAlphabet.has(m))
215
+ !this.PeptideFastaAlphabet.has(m) &&
216
+ !this.DnaFastaAlphabet.has(m))
195
217
  .map(([m, f]) => ({[m]: f})));
196
218
  if (Object.keys(cleanFreq).length == 0) return null;
197
219
 
@@ -208,24 +230,24 @@ class BioPackageDetectors extends DG.Package {
208
230
  /** With a separator, spaces are not allowed in monomer names.
209
231
  * The monomer name/label cannot contain digits only.
210
232
  */
211
- static checkForbiddenWithSeparators(freq) {
233
+ checkForbiddenWithSeparators(freq) {
212
234
  const forbiddenRe = /[ ]|^\d+$/i;
213
235
  return Object.keys(freq).filter((m) => forbiddenRe.test(m)).length > 0;
214
236
  }
215
237
 
216
238
  // /** Without a separator, special symbols or digits are not allowed as monomers. */
217
- // static checkForbiddenWoSeparator(freq) {
239
+ // checkForbiddenWoSeparator(freq) {
218
240
  // const forbiddenRe = /[\d!@#$%^&*()_+\-=\[\]{};':"\\|,.<>\/?]/i;
219
241
  // return Object.keys(freq).filter((m) => forbiddenRe.test(m)).length > 0;
220
242
  // }
221
243
 
222
244
  /** Stats of sequences with specified splitter func, returns { freq, sameLength } */
223
- static getStats(seqCol, minLength, splitter) {
245
+ getStats(values, minLength, splitter) {
224
246
  const freq = {};
225
247
  let sameLength = true;
226
248
  let firstLength = null;
227
249
 
228
- for (const seq of seqCol.categories) {
250
+ for (const seq of values) {
229
251
  const mSeq = splitter(seq);
230
252
 
231
253
  if (firstLength == null) {
@@ -250,9 +272,9 @@ class BioPackageDetectors extends DG.Package {
250
272
  * @param freq frequencies of monomers in sequence set
251
273
  * @param candidates an array of triples [name, monomer set, number (presumably the similarity cutoff)]
252
274
  * */
253
- static detectAlphabet(freq, candidates, gapSymbol) {
275
+ detectAlphabet(freq, candidates, gapSymbol) {
254
276
  const candidatesSims = candidates.map((c) => {
255
- const sim = BioPackageDetectors.getAlphabetSimilarity(freq, c[1], gapSymbol);
277
+ const sim = this.getAlphabetSimilarity(freq, c[1], gapSymbol);
256
278
  return [c[0], c[1], c[2], freq, sim];
257
279
  });
258
280
 
@@ -267,7 +289,7 @@ class BioPackageDetectors extends DG.Package {
267
289
  return alphabetName;
268
290
  }
269
291
 
270
- static getAlphabetSimilarity(freq, alphabet, gapSymbol) {
292
+ getAlphabetSimilarity(freq, alphabet, gapSymbol) {
271
293
  const keys = new Set([...new Set(Object.keys(freq)), ...alphabet]);
272
294
  keys.delete(gapSymbol);
273
295
 
@@ -278,11 +300,11 @@ class BioPackageDetectors extends DG.Package {
278
300
  alphabetA.push(alphabet.has(m) ? 10 : -20 /* penalty for character outside alphabet set*/);
279
301
  }
280
302
  /* There were a few ideas: chi-squared, pearson correlation (variance?), scalar product */
281
- const cos = BioPackageDetectors.vectorDotProduct(freqA, alphabetA) / (BioPackageDetectors.vectorLength(freqA) * BioPackageDetectors.vectorLength(alphabetA));
303
+ const cos = this.vectorDotProduct(freqA, alphabetA) / (this.vectorLength(freqA) * this.vectorLength(alphabetA));
282
304
  return cos;
283
305
  }
284
306
 
285
- static vectorLength(v) {
307
+ vectorLength(v) {
286
308
  let sqrSum = 0;
287
309
  for (let i = 0; i < v.length; i++) {
288
310
  sqrSum += v[i] * v[i];
@@ -290,7 +312,7 @@ class BioPackageDetectors extends DG.Package {
290
312
  return Math.sqrt(sqrSum);
291
313
  }
292
314
 
293
- static vectorDotProduct(v1, v2) {
315
+ vectorDotProduct(v1, v2) {
294
316
  if (v1.length != v2.length) {
295
317
  throw Error('The dimensionality of the vectors must match');
296
318
  }
@@ -302,62 +324,99 @@ class BioPackageDetectors extends DG.Package {
302
324
  }
303
325
 
304
326
  /** For trivial checks split by single chars*/
305
- static splitterAsChars(seq) {
306
- return seq.split('');
327
+ getSplitterAsChars(lengthLimit) {
328
+ return function(seq) {
329
+ return seq.split('', lengthLimit);
330
+ }.bind(this);
307
331
  }
308
332
 
309
- static getSplitterWithSeparator(separator) {
333
+ getSplitterWithSeparator(separator, lengthLimit) {
310
334
  return function(seq) {
311
- return seq.split(separator);
312
- };
335
+ // if (!!lengthLimit) {
336
+ // const res = new Array(lengthLimit);
337
+ // let pos = 0, count = 0;
338
+ // while (pos < seq.length && count < lengthLimit) {
339
+ // const newPos = seq.indexOf(separator, pos);
340
+ // res[count] = seq.substring(pos, newPos);
341
+ // count++;
342
+ // pos = newPos;
343
+ // }
344
+ //
345
+ // return res.slice(0, count);
346
+ // } else {
347
+ return seq.split(separator, lengthLimit);
348
+ // }
349
+ }.bind(this);
313
350
  }
314
351
 
315
352
  // Multichar monomer names in square brackets, single char monomers or gap symbol
316
- static monomerRe = /\[(\w+)\]|(\w)|(-)/g;
353
+ monomerRe = /\[(\w+)\]|(\w)|(-)/g;
317
354
 
318
355
  /** Split sequence for single character monomers, square brackets multichar monomer names or gap symbol. */
319
- static splitterAsFasta(seq) {
320
- const res = wu(seq.toString().matchAll(BioPackageDetectors.monomerRe)).map((ma) => {
321
- let mRes;
322
- const m = ma[0];
323
- if (m.length > 1) {
324
- mRes = ma[1];
325
- } else {
326
- mRes = m;
327
- }
328
- return mRes;
329
- }).toArray();
356
+ getSplitterAsFasta(lengthLimit) {
357
+ return function(seq) {
358
+ const res = wu(seq.toString().matchAll(this.monomerRe))
359
+ .take(lengthLimit)
360
+ .map((ma) => {
361
+ let mRes;
362
+ const m = ma[0];
363
+ if (m.length > 1) {
364
+ mRes = ma[1];
365
+ } else {
366
+ mRes = m;
367
+ }
368
+ return mRes;
369
+ }).toArray();
330
370
 
331
- return res;
371
+ return res;
372
+ }.bind(this);
332
373
  }
333
374
 
334
375
  /** Only some of the synonyms. These were obtained from the clustered oligopeptide dataset. */
335
- static aaSynonyms = {
376
+ aaSynonyms = {
336
377
  '[MeNle]': 'L', // Nle - norleucine
337
378
  '[MeA]': 'A', '[MeG]': 'G', '[MeF]': 'F',
338
379
  };
339
380
 
340
- static helmRe = /(PEPTIDE1|DNA1|RNA1)\{([^}]+)}/g;
341
- static helmPp1Re = /\[([^\[\]]+)]/g;
381
+ helmRe = /(PEPTIDE1|DNA1|RNA1)\{([^}]+)}/g;
382
+ helmPp1Re = /\[([^\[\]]+)]/g;
342
383
 
343
384
  /** Splits Helm string to monomers, but does not replace monomer names to other notation (e.g. for RNA). */
344
- static splitterAsHelm(seq) {
345
- BioPackageDetectors.helmRe.lastIndex = 0;
346
- const ea = BioPackageDetectors.helmRe.exec(seq.toString());
347
- const inSeq = ea ? ea[2] : null;
348
-
349
- const mmPostProcess = (mm) => {
350
- BioPackageDetectors.helmPp1Re.lastIndex = 0;
351
- const pp1M = BioPackageDetectors.helmPp1Re.exec(mm);
352
- if (pp1M && pp1M.length >= 2) {
353
- return pp1M[1];
354
- } else {
355
- return mm;
385
+ getSplitterAsHelm(lengthLimit) {
386
+ return function(seq) {
387
+ this.helmRe.lastIndex = 0;
388
+ const ea = this.helmRe.exec(seq.toString());
389
+ const inSeq = ea ? ea[2] : null;
390
+
391
+ const mmPostProcess = (mm) => {
392
+ this.helmPp1Re.lastIndex = 0;
393
+ const pp1M = this.helmPp1Re.exec(mm);
394
+ if (pp1M && pp1M.length >= 2) {
395
+ return pp1M[1];
396
+ } else {
397
+ return mm;
398
+ }
399
+ };
400
+
401
+ const mmList = inSeq ? inSeq.split('.') : [];
402
+ const mmListRes = mmList.map(mmPostProcess);
403
+ return mmListRes;
404
+ }.bind(this);
405
+ }
406
+
407
+ sample(src, n) {
408
+ if (src.length < n) {
409
+ throw new Error('Sample source is less than n requested.');
410
+ }
411
+
412
+ const idxSet = new Set();
413
+ while (idxSet.size < n) {
414
+ const idx = Math.floor(Math.random() * src.length);
415
+ if (!idxSet.has(idx)) {
416
+ idxSet.add(idx);
356
417
  }
357
- };
418
+ }
358
419
 
359
- const mmList = inSeq ? inSeq.split('.') : [];
360
- const mmListRes = mmList.map(mmPostProcess);
361
- return mmListRes;
420
+ return [...idxSet].map((idx) => src[idx]);
362
421
  }
363
422
  }