@datagrok/bio 1.4.2 → 1.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/detectors.js CHANGED
@@ -42,6 +42,8 @@ class BioPackageDetectors extends DG.Package {
42
42
  if (statsAsChars.sameLength) {
43
43
  if (Object.keys(statsAsChars.freq).length > 0) { // require non empty alphabet
44
44
  const alphabet = BioPackageDetectors.detectAlphabet(statsAsChars.freq, alphabetCandidates, '-');
45
+ if (alphabet === 'UN') return null;
46
+
45
47
  const units = `fasta:SEQ.MSA:${alphabet}`;
46
48
  col.setTag(DG.TAGS.UNITS, units);
47
49
  return BioPackageDetectors.mmSemType;
@@ -52,7 +54,10 @@ class BioPackageDetectors extends DG.Package {
52
54
  const splitter = separator ? BioPackageDetectors.getSplitterWithSeparator(separator) : BioPackageDetectors.splitterAsFasta;
53
55
 
54
56
  const stats = BioPackageDetectors.getStats(col, 5, splitter);
57
+ // Empty monomer alphabet is not allowed
55
58
  if (Object.keys(stats.freq).length === 0) return null;
59
+ // Long monomer names for sequences with separators have constraints
60
+ if (separator && BioPackageDetectors.checkForbiddenWithSeparators(stats.freq)) return null;
56
61
 
57
62
  const format = separator ? 'separator' : 'fasta';
58
63
  const seqType = stats.sameLength ? 'SEQ.MSA' : 'SEQ';
@@ -82,9 +87,11 @@ class BioPackageDetectors extends DG.Package {
82
87
  // !!! But there is a caveat because exceptionally frequent char can be a gap symbol in MSA.
83
88
  // !!! What is the difference between the gap symbol and separator symbol in stats terms?
84
89
  // const noSeparatorRe = /[a-z\d]+$/i;
85
- const noSeparatorRe = /[HBCNOFPSKVYI]/i; // Mendeleev's periodic table single char elements
90
+ const noSeparatorChemRe = /[HBCNOFPSKVYI]/i; // Mendeleev's periodic table single char elements
91
+ const noSeparatorAlphaDigitRe = /[\dA-Z]/i;
86
92
  const cleanFreq = Object.assign({}, ...Object.entries(freq)
87
- .filter(([m, f]) => !noSeparatorRe.test(m) &&
93
+ .filter(([m, f]) => m != ' ' &&
94
+ !noSeparatorChemRe.test(m) && !noSeparatorAlphaDigitRe.test(m) &&
88
95
  !BioPackageDetectors.AminoacidsFastaAlphabet.has(m) &&
89
96
  !BioPackageDetectors.NucleotidesFastaAlphabet.has(m))
90
97
  .map(([m, f]) => ({[m]: f})));
@@ -100,6 +107,12 @@ class BioPackageDetectors extends DG.Package {
100
107
  return sepFreq / otherSumFreq > freqThreshold ? sep : null;
101
108
  }
102
109
 
110
+ /** With a separator, spaces are not allowed in monomer names. */
111
+ static checkForbiddenWithSeparators(freq) {
112
+ const forbiddenRe = /[ ]/i;
113
+ return Object.keys(freq).filter((m) => forbiddenRe.test(m)).length > 0;
114
+ }
115
+
103
116
  /** Without a separator, special symbols or digits are not allowed as monomers. */
104
117
  static checkForbiddenWoSeparator(freq) {
105
118
  const forbiddenRe = /[\d!@#$%^&*()_+\-=\[\]{};':"\\|,.<>\/?]/i;
@@ -193,9 +206,9 @@ class BioPackageDetectors extends DG.Package {
193
206
  return seq.split('');
194
207
  }
195
208
 
196
- static getSplitterWithSeparator(sep) {
209
+ static getSplitterWithSeparator(separator) {
197
210
  return function(seq) {
198
- return seq.split(sep);
211
+ return seq.split(separator);
199
212
  };
200
213
  }
201
214