@datagrok/bio 2.1.1 → 2.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/detectors.js CHANGED
@@ -8,6 +8,8 @@
8
8
  * TODO: Use detectors from WebLogo pickUp.. methods
9
9
  */
10
10
 
11
+ const CATEGORIES_SAMPLE_LIMIT = 100;
12
+
11
13
  /** enum type to simplify setting "user-friendly" notation if necessary */
12
14
  const NOTATION = {
13
15
  FASTA: 'fasta',
@@ -76,18 +78,25 @@ class BioPackageDetectors extends DG.Package {
76
78
  //input: column col
77
79
  //output: string semType
78
80
  detectMacromolecule(col) {
81
+ // Fail early
82
+ if (col.type !== DG.TYPE.STRING) return null;
83
+
84
+ const categoriesSample = col.categories.length < CATEGORIES_SAMPLE_LIMIT ? col.categories :
85
+ BioPackageDetectors.sample(col.categories, CATEGORIES_SAMPLE_LIMIT);
86
+
79
87
  // To collect alphabet freq three strategies can be used:
80
88
  // as chars, as fasta (single or within square brackets), as with the separator.
81
89
  if (
82
90
  !(col.categories.length == 1 && !col.categories[0]) && // TODO: Remove with tests for single empty category value
83
- DG.Detector.sampleCategories(col, (s) => BioPackageDetectors.isHelm(s), 1)
91
+ DG.Detector.sampleCategories(col, (s) => BioPackageDetectors.isHelm(s), 1, CATEGORIES_SAMPLE_LIMIT)
84
92
  ) {
85
- const statsAsHelm = BioPackageDetectors.getStats(col, 2, BioPackageDetectors.splitterAsHelm);
93
+ const statsAsHelm = BioPackageDetectors.getStats(categoriesSample, 2, BioPackageDetectors.splitterAsHelm);
86
94
  col.setTag(DG.TAGS.UNITS, NOTATION.HELM);
87
95
 
88
- const alphabetSize = Object.keys(statsAsHelm.freq).length;
96
+ // alphabetSize calculated on (sub)sample of data is incorrect
97
+ // const alphabetSize = Object.keys(statsAsHelm.freq).length;
89
98
  const alphabetIsMultichar = Object.keys(statsAsHelm.freq).some((m) => m.length > 1);
90
- col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
99
+ // col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
91
100
  col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
92
101
 
93
102
  return DG.SEMTYPE.MACROMOLECULE;
@@ -115,12 +124,12 @@ class BioPackageDetectors extends DG.Package {
115
124
  }
116
125
  return res;
117
126
  };
118
- const isUrl = DG.Detector.sampleCategories(col, isUrlCheck, 1);
127
+ const isUrl = categoriesSample.every((v) => { return !v || isUrlCheck(v); });
119
128
  if (isUrl) return null;
120
129
 
121
130
  // TODO: Detect HELM sequence
122
131
  // TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
123
- const statsAsChars = BioPackageDetectors.getStats(col, 5, BioPackageDetectors.splitterAsChars);
132
+ const statsAsChars = BioPackageDetectors.getStats(categoriesSample, 5, BioPackageDetectors.splitterAsChars);
124
133
  // if (Object.keys(statsAsChars.freq).length === 0) return;
125
134
 
126
135
  const decoy = BioPackageDetectors.detectAlphabet(statsAsChars.freq, decoyAlphabets, null);
@@ -143,7 +152,7 @@ class BioPackageDetectors extends DG.Package {
143
152
  const splitter = separator ? BioPackageDetectors.getSplitterWithSeparator(separator) :
144
153
  BioPackageDetectors.splitterAsFasta;
145
154
 
146
- const stats = BioPackageDetectors.getStats(col, 5, splitter);
155
+ const stats = BioPackageDetectors.getStats(categoriesSample, 5, splitter);
147
156
  // Empty monomer alphabet is not allowed
148
157
  if (Object.keys(stats.freq).length === 0) return null;
149
158
  // Long monomer names for sequences with separators have constraints
@@ -162,9 +171,10 @@ class BioPackageDetectors extends DG.Package {
162
171
  col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
163
172
  if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
164
173
  if (alphabet === ALPHABET.UN) {
165
- const alphabetSize = Object.keys(stats.freq).length;
174
+ // alphabetSize calculated on (sub)sample of data is incorrect
175
+ // const alphabetSize = Object.keys(stats.freq).length;
166
176
  const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
167
- col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
177
+ // col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
168
178
  col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
169
179
  }
170
180
  return DG.SEMTYPE.MACROMOLECULE;
@@ -220,12 +230,12 @@ class BioPackageDetectors extends DG.Package {
220
230
  // }
221
231
 
222
232
  /** Stats of sequences with specified splitter func, returns { freq, sameLength } */
223
- static getStats(seqCol, minLength, splitter) {
233
+ static getStats(values, minLength, splitter) {
224
234
  const freq = {};
225
235
  let sameLength = true;
226
236
  let firstLength = null;
227
237
 
228
- for (const seq of seqCol.categories) {
238
+ for (const seq of values) {
229
239
  const mSeq = splitter(seq);
230
240
 
231
241
  if (firstLength == null) {
@@ -360,4 +370,20 @@ class BioPackageDetectors extends DG.Package {
360
370
  const mmListRes = mmList.map(mmPostProcess);
361
371
  return mmListRes;
362
372
  }
373
+
374
+ static sample(src, n) {
375
+ if (src.length < n) {
376
+ throw new Error('Sample source is less than n requested.');
377
+ }
378
+
379
+ const idxSet = new Set();
380
+ while (idxSet.size < n) {
381
+ const idx = Math.floor(Math.random() * src.length);
382
+ if (!idxSet.has(idx)) {
383
+ idxSet.add(idx);
384
+ }
385
+ }
386
+
387
+ return [...idxSet].map((idx) => src[idx]);
388
+ }
363
389
  }