@datagrok/bio 1.7.7 → 1.7.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/detectors.js +22 -7
- package/dist/package-test.js +378 -82
- package/dist/package.js +276 -62
- package/files/tests/testActivityCliffs.csv +311 -0
- package/files/tests/testSpgi100.csv +8437 -0
- package/files/tests/testUnichemSources.csv +36 -0
- package/package.json +2 -2
- package/src/package.ts +74 -0
- package/src/tests/convert-test.ts +55 -22
- package/src/tests/detectors-test.ts +45 -6
- package/src/tests/splitters-test.ts +19 -0
- package/src/utils/cell-renderer.ts +114 -1
- package/src/utils/constants.ts +2 -1
- package/src/utils/convert.ts +17 -3
- package/src/utils/multiple-sequence-alignment.ts +2 -1
- package/src/utils/utils.ts +10 -8
- package/{test-Bio-34f75e5127b8-7c42ea4b.html → test-Bio-34f75e5127b8-f542cbde.html} +22 -10
package/detectors.js
CHANGED
|
@@ -23,14 +23,15 @@ class BioPackageDetectors extends DG.Package {
|
|
|
23
23
|
static RnaFastaAlphabet = new Set(['A', 'C', 'G', 'U']);
|
|
24
24
|
|
|
25
25
|
static SmilesRawAlphabet = new Set([
|
|
26
|
-
'
|
|
27
|
-
'1', '2', '3', '4', '5', '6', '7',
|
|
26
|
+
'B', 'C', 'c', 'E', 'H', 'L', 'M', 'N', 'O', 'S', 'F', '(', ')',
|
|
27
|
+
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
|
|
28
28
|
'+', '-', '@', '[', ']', '/', '\\', '#', '=']);
|
|
29
29
|
|
|
30
30
|
/** @param s {String} - string to check
|
|
31
31
|
* @returns {boolean} */
|
|
32
32
|
static isHelm(s) {
|
|
33
|
-
return s.startsWith('PEPTIDE1{') || s.startsWith('
|
|
33
|
+
return s.startsWith('PEPTIDE1{') || s.startsWith('CHEM1{') || s.startsWith('BLOB1{') ||
|
|
34
|
+
s.startsWith('RNA1{') || s.startsWith('DNA1{');
|
|
34
35
|
}
|
|
35
36
|
|
|
36
37
|
//tags: semTypeDetector
|
|
@@ -54,11 +55,25 @@ class BioPackageDetectors extends DG.Package {
|
|
|
54
55
|
['RNA', BioPackageDetectors.RnaFastaAlphabet],
|
|
55
56
|
];
|
|
56
57
|
|
|
58
|
+
// Check for url column, maybe it is too heavy check
|
|
59
|
+
const isUrlCheck = (s) => {
|
|
60
|
+
let res = true;
|
|
61
|
+
try {
|
|
62
|
+
const url = new URL(s);
|
|
63
|
+
res = true;
|
|
64
|
+
} catch {
|
|
65
|
+
res = false;
|
|
66
|
+
}
|
|
67
|
+
return res;
|
|
68
|
+
};
|
|
69
|
+
const isUrl = DG.Detector.sampleCategories(col, isUrlCheck, 1);
|
|
70
|
+
if (isUrl) return null;
|
|
71
|
+
|
|
57
72
|
// TODO: Detect HELM sequence
|
|
58
73
|
// TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
|
|
59
74
|
const statsAsChars = BioPackageDetectors.getStats(col, 5, BioPackageDetectors.splitterAsChars);
|
|
60
75
|
|
|
61
|
-
const decoy = BioPackageDetectors.detectAlphabet(statsAsChars.freq, decoyAlphabets, null, 0.
|
|
76
|
+
const decoy = BioPackageDetectors.detectAlphabet(statsAsChars.freq, decoyAlphabets, null, 0.35);
|
|
62
77
|
if (decoy != 'UN') return null;
|
|
63
78
|
|
|
64
79
|
if (statsAsChars.sameLength) {
|
|
@@ -110,10 +125,10 @@ class BioPackageDetectors extends DG.Package {
|
|
|
110
125
|
// !!! What is the difference between the gap symbol and separator symbol in stats terms?
|
|
111
126
|
// const noSeparatorRe = /[a-z\d]+$/i;
|
|
112
127
|
const noSeparatorChemRe = /[HBCNOFPSKVYI]/i; // Mendeleev's periodic table single char elements
|
|
113
|
-
const noSeparatorAlphaDigitRe = /[\dA-Z]/i;
|
|
128
|
+
const noSeparatorAlphaDigitRe = /[\dA-Z,& _]/i; // ..., comma, ampersand, space, underscore
|
|
114
129
|
const noSeparatorBracketsRe = /[\[\]()<>{}]/i;
|
|
115
130
|
const cleanFreq = Object.assign({}, ...Object.entries(freq)
|
|
116
|
-
.filter(([m, f]) =>
|
|
131
|
+
.filter(([m, f]) =>
|
|
117
132
|
!noSeparatorChemRe.test(m) && !noSeparatorAlphaDigitRe.test(m) && !noSeparatorBracketsRe.test(m) &&
|
|
118
133
|
!BioPackageDetectors.PeptideFastaAlphabet.has(m) &&
|
|
119
134
|
!BioPackageDetectors.DnaFastaAlphabet.has(m))
|
|
@@ -198,7 +213,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
198
213
|
const alphabetA = [];
|
|
199
214
|
for (const m of keys) {
|
|
200
215
|
freqA.push(m in freq ? freq[m] : 0);
|
|
201
|
-
alphabetA.push(alphabet.has(m) ?
|
|
216
|
+
alphabetA.push(alphabet.has(m) ? 10 : -10 /* penalty for character outside alphabet set*/);
|
|
202
217
|
}
|
|
203
218
|
/* There were a few ideas: chi-squared, pearson correlation (variance?), scalar product */
|
|
204
219
|
const cos = BioPackageDetectors.vectorDotProduct(freqA, alphabetA) / (BioPackageDetectors.vectorLength(freqA) * BioPackageDetectors.vectorLength(alphabetA));
|