@datagrok/bio 2.4.2 → 2.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.eslintrc.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "env": {
3
3
  "browser": true,
4
- "es2021": true
4
+ "es2022": true
5
5
  },
6
6
  "extends": [
7
7
  "google"
package/detectors.js CHANGED
@@ -46,7 +46,7 @@ const isUrlRe = /[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9
46
46
  class BioPackageDetectors extends DG.Package {
47
47
 
48
48
  /** Column name parts that make the column a more likely Macromolecule candidate for the detector (not mandatory). They must be in lowercase. */
49
- requiredColNamePartList = ['seq', 'msa', 'dna', 'rna', 'fasta', 'helm', 'sense', 'protein'];
49
+ likelyColNamePartList = ['seq', 'msa', 'dna', 'rna', 'fasta', 'helm', 'sense', 'protein'];
50
50
 
51
51
  peptideFastaAlphabet = new Set([
52
52
  'G', 'L', 'Y', 'S', 'E', 'Q', 'D', 'N', 'F', 'A',
@@ -89,9 +89,9 @@ class BioPackageDetectors extends DG.Package {
89
89
  const t1 = Date.now();
90
90
  try {
91
91
  const colName = col.name;
92
- if (!this.requiredColNamePartList.some(
93
- (requiredColNamePart) => colName.toLowerCase().includes(requiredColNamePart),
94
- )) return null;
92
+ const colNameLikely = this.likelyColNamePartList.some(
93
+ (requiredColNamePart) => colName.toLowerCase().includes(requiredColNamePart));
94
+ const seqMinLength = colNameLikely ? 3 : 5;
95
95
 
96
96
  // Fail early
97
97
  if (col.type !== DG.TYPE.STRING) return null;
@@ -147,7 +147,7 @@ class BioPackageDetectors extends DG.Package {
147
147
 
148
148
  // TODO: Detect HELM sequence
149
149
  // TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
150
- const statsAsChars = this.getStats(categoriesSample, 5,
150
+ const statsAsChars = this.getStats(categoriesSample, seqMinLength,
151
151
  this.getSplitterAsChars(SEQ_SAMPLE_LENGTH_LIMIT));
152
152
  // Empty statsAsChars.freq alphabet means no strings of sufficient length are present in the data
153
153
  if (Object.keys(statsAsChars.freq).length === 0) return null;
@@ -164,17 +164,21 @@ class BioPackageDetectors extends DG.Package {
164
164
  this.getSplitterAsFasta(SEQ_SAMPLE_LENGTH_LIMIT);
165
165
 
166
166
  if (statsAsChars.sameLength) {
167
- const stats = this.getStats(categoriesSample, 5, splitter);
168
- const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, '-');
169
- if (alphabet === ALPHABET.UN) return null;
167
+ const stats = this.getStats(categoriesSample, seqMinLength, splitter);
168
+ const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, '-', colNameLikely);
169
+ if (alphabet === ALPHABET.UN && !colNameLikely) return null;
170
170
 
171
171
  col.setTag(DG.TAGS.UNITS, units);
172
172
  if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
173
173
  col.setTag(UnitsHandler.TAGS.aligned, ALIGNMENT.SEQ_MSA);
174
174
  col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
175
+ if (alphabet === ALPHABET.UN) {
176
+ const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
177
+ col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
178
+ }
175
179
  return DG.SEMTYPE.MACROMOLECULE;
176
180
  } else {
177
- const stats = this.getStats(categoriesSample, 5, splitter);
181
+ const stats = this.getStats(categoriesSample, seqMinLength, splitter);
178
182
  const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
179
183
  // Empty monomer alphabet is not allowed
180
184
  if (Object.keys(stats.freq).length === 0) return null;
@@ -189,8 +193,9 @@ class BioPackageDetectors extends DG.Package {
189
193
  const aligned = stats.sameLength ? ALIGNMENT.SEQ_MSA : ALIGNMENT.SEQ;
190
194
 
191
195
  // TODO: If separator detected, then extra efforts to detect alphabet are allowed.
192
- const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol);
193
- if (units === NOTATION.FASTA && alphabet === ALPHABET.UN && !alphabetIsMultichar) return null;
196
+ const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol, colNameLikely);
197
+ /* Likely column name allows detecting 'fasta' notation with 'UN' alphabet, 2023-04-13, atanas, askalkin */
198
+ if (units === NOTATION.FASTA && alphabet === ALPHABET.UN && !alphabetIsMultichar && !colNameLikely) return null;
194
199
 
195
200
  // const forbidden = this.checkForbiddenWoSeparator(stats.freq);
196
201
  col.setTag(DG.TAGS.UNITS, units);
@@ -199,9 +204,7 @@ class BioPackageDetectors extends DG.Package {
199
204
  col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
200
205
  if (alphabet === ALPHABET.UN) {
201
206
  // alphabetSize calculated on (sub)sample of data is incorrect
202
- // const alphabetSize = Object.keys(stats.freq).length;
203
207
  const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
204
- // col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
205
208
  col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
206
209
  }
207
210
  return DG.SEMTYPE.MACROMOLECULE;
@@ -287,7 +290,7 @@ class BioPackageDetectors extends DG.Package {
287
290
  sameLength = false;
288
291
  }
289
292
 
290
- if (mSeq.length > minLength) {
293
+ if (mSeq.length >= minLength) {
291
294
  for (const m of mSeq) {
292
295
  if (!(m in freq)) freq[m] = 0;
293
296
  freq[m] += 1;
@@ -300,10 +303,11 @@ class BioPackageDetectors extends DG.Package {
300
303
  /** Detects alphabet for freq by freq similarity to alphabet monomer set.
301
304
  * @param freq frequencies of monomers in sequence set
302
305
  * @param candidates an array of pairs [name, monomer set]
303
- * */
304
- detectAlphabet(freq, candidates, gapSymbol) {
306
+ * @param {boolean} colNameLikely Whether the column name suggests the column is likely a Macromolecule
307
+ */
308
+ detectAlphabet(freq, candidates, gapSymbol, colNameLikely = false) {
305
309
  const candidatesSims = candidates.map((c) => {
306
- const sim = this.getAlphabetSimilarity(freq, c[1], gapSymbol);
310
+ const sim = this.getAlphabetSimilarity(freq, c[1], gapSymbol) + (colNameLikely ? 0.15 : 0);
307
311
  return [c[0], c[1], c[2], freq, sim];
308
312
  });
309
313