@datagrok/bio 2.4.2 → 2.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.json +1 -1
- package/detectors.js +21 -17
- package/dist/package-test.js +1 -1
- package/dist/package-test.js.map +1 -1
- package/package.json +1 -1
- package/src/package-test.ts +1 -0
- package/src/tests/detectors-weak-and-likely-tests.ts +129 -0
- package/tsconfig.json +1 -1
package/.eslintrc.json
CHANGED
package/detectors.js
CHANGED
|
@@ -46,7 +46,7 @@ const isUrlRe = /[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9
|
|
|
46
46
|
class BioPackageDetectors extends DG.Package {
|
|
47
47
|
|
|
48
48
|
/** Parts of a column name that make the column more likely to be detected as a Macromolecule. Each part must be in lowercase. */
|
|
49
|
-
|
|
49
|
+
likelyColNamePartList = ['seq', 'msa', 'dna', 'rna', 'fasta', 'helm', 'sense', 'protein'];
|
|
50
50
|
|
|
51
51
|
peptideFastaAlphabet = new Set([
|
|
52
52
|
'G', 'L', 'Y', 'S', 'E', 'Q', 'D', 'N', 'F', 'A',
|
|
@@ -89,9 +89,9 @@ class BioPackageDetectors extends DG.Package {
|
|
|
89
89
|
const t1 = Date.now();
|
|
90
90
|
try {
|
|
91
91
|
const colName = col.name;
|
|
92
|
-
|
|
93
|
-
(requiredColNamePart) => colName.toLowerCase().includes(requiredColNamePart)
|
|
94
|
-
|
|
92
|
+
const colNameLikely = this.likelyColNamePartList.some(
|
|
93
|
+
(requiredColNamePart) => colName.toLowerCase().includes(requiredColNamePart));
|
|
94
|
+
const seqMinLength = colNameLikely ? 3 : 5;
|
|
95
95
|
|
|
96
96
|
// Fail early
|
|
97
97
|
if (col.type !== DG.TYPE.STRING) return null;
|
|
@@ -147,7 +147,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
147
147
|
|
|
148
148
|
// TODO: Detect HELM sequence
|
|
149
149
|
// TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
|
|
150
|
-
const statsAsChars = this.getStats(categoriesSample,
|
|
150
|
+
const statsAsChars = this.getStats(categoriesSample, seqMinLength,
|
|
151
151
|
this.getSplitterAsChars(SEQ_SAMPLE_LENGTH_LIMIT));
|
|
152
152
|
// An empty statsAsChars.freq alphabet means no strings of sufficient length are present in the data
|
|
153
153
|
if (Object.keys(statsAsChars.freq).length === 0) return null;
|
|
@@ -164,17 +164,21 @@ class BioPackageDetectors extends DG.Package {
|
|
|
164
164
|
this.getSplitterAsFasta(SEQ_SAMPLE_LENGTH_LIMIT);
|
|
165
165
|
|
|
166
166
|
if (statsAsChars.sameLength) {
|
|
167
|
-
const stats = this.getStats(categoriesSample,
|
|
168
|
-
const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, '-');
|
|
169
|
-
if (alphabet === ALPHABET.UN) return null;
|
|
167
|
+
const stats = this.getStats(categoriesSample, seqMinLength, splitter);
|
|
168
|
+
const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, '-', colNameLikely);
|
|
169
|
+
if (alphabet === ALPHABET.UN && !colNameLikely) return null;
|
|
170
170
|
|
|
171
171
|
col.setTag(DG.TAGS.UNITS, units);
|
|
172
172
|
if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
|
|
173
173
|
col.setTag(UnitsHandler.TAGS.aligned, ALIGNMENT.SEQ_MSA);
|
|
174
174
|
col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
|
|
175
|
+
if (alphabet === ALPHABET.UN) {
|
|
176
|
+
const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
|
|
177
|
+
col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
|
|
178
|
+
}
|
|
175
179
|
return DG.SEMTYPE.MACROMOLECULE;
|
|
176
180
|
} else {
|
|
177
|
-
const stats = this.getStats(categoriesSample,
|
|
181
|
+
const stats = this.getStats(categoriesSample, seqMinLength, splitter);
|
|
178
182
|
const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
|
|
179
183
|
// Empty monomer alphabet is not allowed
|
|
180
184
|
if (Object.keys(stats.freq).length === 0) return null;
|
|
@@ -189,8 +193,9 @@ class BioPackageDetectors extends DG.Package {
|
|
|
189
193
|
const aligned = stats.sameLength ? ALIGNMENT.SEQ_MSA : ALIGNMENT.SEQ;
|
|
190
194
|
|
|
191
195
|
// TODO: If separator detected, then extra efforts to detect alphabet are allowed.
|
|
192
|
-
const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol);
|
|
193
|
-
|
|
196
|
+
const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol, colNameLikely);
|
|
197
|
+
/* Likely column name allows detecting 'fasta' notation with 'UN' alphabet, 2023-04-13, atanas, askalkin */
|
|
198
|
+
if (units === NOTATION.FASTA && alphabet === ALPHABET.UN && !alphabetIsMultichar && !colNameLikely) return null;
|
|
194
199
|
|
|
195
200
|
// const forbidden = this.checkForbiddenWoSeparator(stats.freq);
|
|
196
201
|
col.setTag(DG.TAGS.UNITS, units);
|
|
@@ -199,9 +204,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
199
204
|
col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
|
|
200
205
|
if (alphabet === ALPHABET.UN) {
|
|
201
206
|
// alphabetSize calculated on (sub)sample of data is incorrect
|
|
202
|
-
// const alphabetSize = Object.keys(stats.freq).length;
|
|
203
207
|
const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
|
|
204
|
-
// col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
|
|
205
208
|
col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
|
|
206
209
|
}
|
|
207
210
|
return DG.SEMTYPE.MACROMOLECULE;
|
|
@@ -287,7 +290,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
287
290
|
sameLength = false;
|
|
288
291
|
}
|
|
289
292
|
|
|
290
|
-
if (mSeq.length
|
|
293
|
+
if (mSeq.length >= minLength) {
|
|
291
294
|
for (const m of mSeq) {
|
|
292
295
|
if (!(m in freq)) freq[m] = 0;
|
|
293
296
|
freq[m] += 1;
|
|
@@ -300,10 +303,11 @@ class BioPackageDetectors extends DG.Package {
|
|
|
300
303
|
/** Detects alphabet for freq by freq similarity to alphabet monomer set.
|
|
301
304
|
* @param freq frequencies of monomers in sequence set
|
|
302
305
|
* @param candidates an array of pairs [name, monomer set]
|
|
303
|
-
*
|
|
304
|
-
|
|
306
|
+
* @param {boolean} colNameLikely Whether the column name suggests the column is more likely to be a Macromolecule
|
|
307
|
+
*/
|
|
308
|
+
detectAlphabet(freq, candidates, gapSymbol, colNameLikely = false) {
|
|
305
309
|
const candidatesSims = candidates.map((c) => {
|
|
306
|
-
const sim = this.getAlphabetSimilarity(freq, c[1], gapSymbol);
|
|
310
|
+
const sim = this.getAlphabetSimilarity(freq, c[1], gapSymbol) + (colNameLikely ? 0.15 : 0);
|
|
307
311
|
return [c[0], c[1], c[2], freq, sim];
|
|
308
312
|
});
|
|
309
313
|
|