@datagrok/bio 1.7.8 → 1.7.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/detectors.js CHANGED
@@ -30,7 +30,8 @@ class BioPackageDetectors extends DG.Package {
30
30
  /** @param s {String} - string to check
31
31
  * @returns {boolean} */
32
32
  static isHelm(s) {
33
- return s.startsWith('PEPTIDE1{') || s.startsWith('RNA1{') || s.startsWith('CHEM1{') || s.startsWith('BLOB1{');
33
+ return s.startsWith('PEPTIDE1{') || s.startsWith('CHEM1{') || s.startsWith('BLOB1{') ||
34
+ s.startsWith('RNA1{') || s.startsWith('DNA1{');
34
35
  }
35
36
 
36
37
  //tags: semTypeDetector
@@ -54,6 +55,20 @@ class BioPackageDetectors extends DG.Package {
54
55
  ['RNA', BioPackageDetectors.RnaFastaAlphabet],
55
56
  ];
56
57
 
58
+ // Check whether this is a URL column; this check may be too heavy
59
+ const isUrlCheck = (s) => {
60
+ let res = true;
61
+ try {
62
+ const url = new URL(s);
63
+ res = true;
64
+ } catch {
65
+ res = false;
66
+ }
67
+ return res;
68
+ };
69
+ const isUrl = DG.Detector.sampleCategories(col, isUrlCheck, 1);
70
+ if (isUrl) return null;
71
+
57
72
  // TODO: Detect HELM sequence
58
73
  // TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
59
74
  const statsAsChars = BioPackageDetectors.getStats(col, 5, BioPackageDetectors.splitterAsChars);
@@ -198,7 +213,7 @@ class BioPackageDetectors extends DG.Package {
198
213
  const alphabetA = [];
199
214
  for (const m of keys) {
200
215
  freqA.push(m in freq ? freq[m] : 0);
201
- alphabetA.push(alphabet.has(m) ? 1 : 0);
216
+ alphabetA.push(alphabet.has(m) ? 10 : -10 /* penalty for character outside alphabet set*/);
202
217
  }
203
218
  /* There were a few ideas: chi-squared, Pearson correlation (variance?), scalar product */
204
219
  const cos = BioPackageDetectors.vectorDotProduct(freqA, alphabetA) / (BioPackageDetectors.vectorLength(freqA) * BioPackageDetectors.vectorLength(alphabetA));