@datagrok/bio 2.5.0 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/detectors.js CHANGED
@@ -158,7 +158,7 @@ class BioPackageDetectors extends DG.Package {
158
158
  const decoy = this.detectAlphabet(statsAsChars.freq, decoyAlphabets, null, colNameLikely ? -0.05 : 0);
159
159
  if (decoy !== ALPHABET.UN) return null;
160
160
 
161
- const separator = this.detectSeparator(statsAsChars.freq);
161
+ const separator = this.detectSeparator(statsAsChars.freq, categoriesSample);
162
162
  if (this.checkForbiddenSeparator(separator)) return null;
163
163
 
164
164
  const units = separator ? NOTATION.SEPARATOR : NOTATION.FASTA;
@@ -227,8 +227,10 @@ class BioPackageDetectors extends DG.Package {
227
227
 
228
228
  /** Detects the most frequent char with a rate of at least 0.15 of others in sum.
229
229
  * Does not use any splitting strategies, estimates just by single characters.
230
- * */
231
- detectSeparator(freq) {
230
+ * @param freq Dictionary of characters freqs
231
+ * @param categoriesSample A string array holding a sample of the sequences
232
+ */
233
+ detectSeparator(freq, categoriesSample) {
232
234
  // To detect a separator we analyze col's sequences character frequencies.
233
235
  // If there is an exceptionally frequent symbol, then we will call it the separator.
234
236
  // The most frequent symbol should occur with a rate of at least 0.15
@@ -254,8 +256,24 @@ class BioPackageDetectors extends DG.Package {
254
256
  const sepFreq = freq[sep];
255
257
  const otherSumFreq = Object.entries(freq).filter((kv) => kv[0] !== sep)
256
258
  .map((kv) => kv[1]).reduce((pSum, a) => pSum + a, 0);
257
- const freqThreshold = 3.5 * (1 / Object.keys(freq).length);
258
- return sepFreq / otherSumFreq > freqThreshold ? sep : null;
259
+
260
+ // Splitter with separator test application
261
+ const splitter = this.getSplitterWithSeparator(sep, SEQ_SAMPLE_LENGTH_LIMIT);
262
+ const stats = this.getStats(categoriesSample, 0, splitter);
263
+ // TODO: Test for Gamma/Erlang distribution
264
+ const totalMonomerCount = wu(Object.values(stats.freq)).reduce((sum, a) => sum + a, 0);
265
+ const mLengthAvg = wu.entries(stats.freq)
266
+ .reduce((sum, [m, c]) => sum + m.length * c, 0) / totalMonomerCount;
267
+ const mLengthVarN = Math.sqrt(wu.entries(stats.freq)
268
+ .reduce((sum, [m, c]) => sum + Math.pow(m.length - mLengthAvg, 2) * c, 0) / (totalMonomerCount - 1),
269
+ ) / mLengthAvg;
270
+
271
+ const sepRate = sepFreq / (sepFreq + otherSumFreq);
272
+ const expSepRate = 1 / Object.keys(freq).length; // expected
273
+ // const freqThreshold = (1 / (Math.log2(Object.keys(freq).length) + 2));
274
+
275
+ return (sepRate / expSepRate > 2.2 && mLengthVarN < 0.7) ||
276
+ (sepRate / expSepRate > 4) ? sep : null;
259
277
  }
260
278
 
261
279
  checkForbiddenSeparator(separator) {