@datagrok/bio 2.1.2 → 2.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/detectors.js +37 -11
- package/dist/package-test.js +93 -36
- package/dist/package.js +29 -15
- package/dist/vendors-node_modules_datagrok-libraries_ml_src_workers_dimensionality-reducer_js.js +3 -3
- package/package.json +4 -4
- package/src/package.ts +2 -0
- package/src/tests/checkInputColumn-tests.ts +1 -1
- package/src/tests/detectors-test.ts +18 -18
- package/src/tests/renderers-test.ts +6 -1
- package/src/tests/splitters-test.ts +6 -1
- package/{test-Bio-62cc009524f3-73ccfff9.html → test-Bio-62cc009524f3-db2d0836.html} +97 -99
package/detectors.js
CHANGED
|
@@ -8,6 +8,8 @@
|
|
|
8
8
|
* TODO: Use detectors from WebLogo pickUp.. methods
|
|
9
9
|
*/
|
|
10
10
|
|
|
11
|
+
/**
 * Upper bound on the number of column categories inspected by detectMacromolecule.
 * Columns with at least this many categories are randomly subsampled down to this size,
 * and the same value is passed to DG.Detector.sampleCategories for the HELM check.
 */
const CATEGORIES_SAMPLE_LIMIT = 100;
|
|
12
|
+
|
|
11
13
|
/** enum type to simplify setting "user-friendly" notation if necessary */
|
|
12
14
|
const NOTATION = {
|
|
13
15
|
FASTA: 'fasta',
|
|
@@ -76,18 +78,25 @@ class BioPackageDetectors extends DG.Package {
|
|
|
76
78
|
//input: column col
|
|
77
79
|
//output: string semType
|
|
78
80
|
detectMacromolecule(col) {
|
|
81
|
+
// Fail early
|
|
82
|
+
if (col.type !== DG.TYPE.STRING) return null;
|
|
83
|
+
|
|
84
|
+
const categoriesSample = col.categories.length < CATEGORIES_SAMPLE_LIMIT ? col.categories :
|
|
85
|
+
BioPackageDetectors.sample(col.categories, CATEGORIES_SAMPLE_LIMIT);
|
|
86
|
+
|
|
79
87
|
// To collect alphabet freq three strategies can be used:
|
|
80
88
|
// as chars, as fasta (single or within square brackets), as with the separator.
|
|
81
89
|
if (
|
|
82
90
|
!(col.categories.length == 1 && !col.categories[0]) && // TODO: Remove with tests for single empty category value
|
|
83
|
-
DG.Detector.sampleCategories(col, (s) => BioPackageDetectors.isHelm(s), 1)
|
|
91
|
+
DG.Detector.sampleCategories(col, (s) => BioPackageDetectors.isHelm(s), 1, CATEGORIES_SAMPLE_LIMIT)
|
|
84
92
|
) {
|
|
85
|
-
const statsAsHelm = BioPackageDetectors.getStats(
|
|
93
|
+
const statsAsHelm = BioPackageDetectors.getStats(categoriesSample, 2, BioPackageDetectors.splitterAsHelm);
|
|
86
94
|
col.setTag(DG.TAGS.UNITS, NOTATION.HELM);
|
|
87
95
|
|
|
88
|
-
|
|
96
|
+
// alphabetSize calculated on (sub)sample of data is incorrect
|
|
97
|
+
// const alphabetSize = Object.keys(statsAsHelm.freq).length;
|
|
89
98
|
const alphabetIsMultichar = Object.keys(statsAsHelm.freq).some((m) => m.length > 1);
|
|
90
|
-
col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
|
|
99
|
+
// col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
|
|
91
100
|
col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
|
|
92
101
|
|
|
93
102
|
return DG.SEMTYPE.MACROMOLECULE;
|
|
@@ -115,12 +124,12 @@ class BioPackageDetectors extends DG.Package {
|
|
|
115
124
|
}
|
|
116
125
|
return res;
|
|
117
126
|
};
|
|
118
|
-
const isUrl =
|
|
127
|
+
const isUrl = categoriesSample.every((v) => { return !v || isUrlCheck(v); });
|
|
119
128
|
if (isUrl) return null;
|
|
120
129
|
|
|
121
130
|
// TODO: Detect HELM sequence
|
|
122
131
|
// TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
|
|
123
|
-
const statsAsChars = BioPackageDetectors.getStats(
|
|
132
|
+
const statsAsChars = BioPackageDetectors.getStats(categoriesSample, 5, BioPackageDetectors.splitterAsChars);
|
|
124
133
|
// if (Object.keys(statsAsChars.freq).length === 0) return;
|
|
125
134
|
|
|
126
135
|
const decoy = BioPackageDetectors.detectAlphabet(statsAsChars.freq, decoyAlphabets, null);
|
|
@@ -143,7 +152,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
143
152
|
const splitter = separator ? BioPackageDetectors.getSplitterWithSeparator(separator) :
|
|
144
153
|
BioPackageDetectors.splitterAsFasta;
|
|
145
154
|
|
|
146
|
-
const stats = BioPackageDetectors.getStats(
|
|
155
|
+
const stats = BioPackageDetectors.getStats(categoriesSample, 5, splitter);
|
|
147
156
|
// Empty monomer alphabet is not allowed
|
|
148
157
|
if (Object.keys(stats.freq).length === 0) return null;
|
|
149
158
|
// Long monomer names for sequences with separators have constraints
|
|
@@ -162,9 +171,10 @@ class BioPackageDetectors extends DG.Package {
|
|
|
162
171
|
col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
|
|
163
172
|
if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
|
|
164
173
|
if (alphabet === ALPHABET.UN) {
|
|
165
|
-
|
|
174
|
+
// alphabetSize calculated on (sub)sample of data is incorrect
|
|
175
|
+
// const alphabetSize = Object.keys(stats.freq).length;
|
|
166
176
|
const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
|
|
167
|
-
col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
|
|
177
|
+
// col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
|
|
168
178
|
col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
|
|
169
179
|
}
|
|
170
180
|
return DG.SEMTYPE.MACROMOLECULE;
|
|
@@ -220,12 +230,12 @@ class BioPackageDetectors extends DG.Package {
|
|
|
220
230
|
// }
|
|
221
231
|
|
|
222
232
|
/** Stats of sequences with specified splitter func, returns { freq, sameLength } */
|
|
223
|
-
static getStats(
|
|
233
|
+
static getStats(values, minLength, splitter) {
|
|
224
234
|
const freq = {};
|
|
225
235
|
let sameLength = true;
|
|
226
236
|
let firstLength = null;
|
|
227
237
|
|
|
228
|
-
for (const seq of
|
|
238
|
+
for (const seq of values) {
|
|
229
239
|
const mSeq = splitter(seq);
|
|
230
240
|
|
|
231
241
|
if (firstLength == null) {
|
|
@@ -360,4 +370,20 @@ class BioPackageDetectors extends DG.Package {
|
|
|
360
370
|
const mmListRes = mmList.map(mmPostProcess);
|
|
361
371
|
return mmListRes;
|
|
362
372
|
}
|
|
373
|
+
|
|
374
|
+
static sample(src, n) {
|
|
375
|
+
if (src.length < n) {
|
|
376
|
+
throw new Error('Sample source is less than n requested.');
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
const idxSet = new Set();
|
|
380
|
+
while (idxSet.size < n) {
|
|
381
|
+
const idx = Math.floor(Math.random() * src.length);
|
|
382
|
+
if (!idxSet.has(idx)) {
|
|
383
|
+
idxSet.add(idx);
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
return [...idxSet].map((idx) => src[idx]);
|
|
388
|
+
}
|
|
363
389
|
}
|