@datagrok/bio 1.5.9 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/detectors.js +17 -5
- package/dist/package-test.js +616 -496
- package/dist/package.js +404 -473
- package/files/samples/sample_FASTA.csv +0 -1
- package/files/samples/sample_FASTA.fasta +0 -3
- package/files/samples/sample_FASTA_DNA.csv +101 -0
- package/files/samples/sample_FASTA_PT.csv +101 -0
- package/files/samples/sample_FASTA_RNA.csv +101 -0
- package/files/samples/testSmiles2.csv +12248 -0
- package/package.json +2 -2
- package/scripts/generate_fasta_csv_for_alphabets.R +70 -0
- package/src/package-test.ts +1 -0
- package/src/package.ts +105 -20
- package/src/tests/convert-test.ts +8 -8
- package/src/tests/detectors-test.ts +15 -3
- package/src/tests/renderer-test.ts +40 -18
- package/src/utils/cell-renderer.ts +47 -75
- package/src/utils/convert.ts +10 -14
- package/src/utils/multiple-sequence-alignment.ts +4 -2
- package/src/utils/notation-converter.ts +215 -55
- package/{test-Bio-34f75e5127b8-b47d4664.html → test-Bio-34f75e5127b8-7af21e5d.html} +17 -21
- package/src/utils/chem-palette.ts +0 -280
- package/src/utils/misc.ts +0 -29
package/detectors.js
CHANGED
|
@@ -22,6 +22,10 @@ class BioPackageDetectors extends DG.Package {
|
|
|
22
22
|
|
|
23
23
|
static RnaFastaAlphabet = new Set(['A', 'C', 'G', 'U']);
|
|
24
24
|
|
|
25
|
+
static SmilesRawAlphabet = new Set([
|
|
26
|
+
'O', 'C', 'c', 'N', 'S', 'F', '(', ')',
|
|
27
|
+
'1', '2', '3', '4', '5', '6', '7',
|
|
28
|
+
'+', '-', '@', '[', ']', '/', '\\', '#', '=']);
|
|
25
29
|
|
|
26
30
|
/** @param s {String} - string to check
|
|
27
31
|
* @returns {boolean} */
|
|
@@ -40,7 +44,11 @@ class BioPackageDetectors extends DG.Package {
|
|
|
40
44
|
return BioPackageDetectors.mmSemType;
|
|
41
45
|
}
|
|
42
46
|
|
|
43
|
-
const
|
|
47
|
+
const decoyAlphabets = [
|
|
48
|
+
['SMILES', BioPackageDetectors.SmilesRawAlphabet],
|
|
49
|
+
];
|
|
50
|
+
|
|
51
|
+
const candidateAlphabets = [
|
|
44
52
|
['PT', BioPackageDetectors.PeptideFastaAlphabet],
|
|
45
53
|
['DNA', BioPackageDetectors.DnaFastaAlphabet],
|
|
46
54
|
['RNA', BioPackageDetectors.RnaFastaAlphabet],
|
|
@@ -49,9 +57,13 @@ class BioPackageDetectors extends DG.Package {
|
|
|
49
57
|
// TODO: Detect HELM sequence
|
|
50
58
|
// TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
|
|
51
59
|
const statsAsChars = BioPackageDetectors.getStats(col, 5, BioPackageDetectors.splitterAsChars);
|
|
60
|
+
|
|
61
|
+
const decoy = BioPackageDetectors.detectAlphabet(statsAsChars.freq, decoyAlphabets, null, 0.5);
|
|
62
|
+
if (decoy != 'UN') return null;
|
|
63
|
+
|
|
52
64
|
if (statsAsChars.sameLength) {
|
|
53
65
|
if (Object.keys(statsAsChars.freq).length > 0) { // require non empty alphabet
|
|
54
|
-
const alphabet = BioPackageDetectors.detectAlphabet(statsAsChars.freq,
|
|
66
|
+
const alphabet = BioPackageDetectors.detectAlphabet(statsAsChars.freq, candidateAlphabets, '-');
|
|
55
67
|
if (alphabet === 'UN') return null;
|
|
56
68
|
|
|
57
69
|
const units = `fasta:SEQ.MSA:${alphabet}`;
|
|
@@ -73,7 +85,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
73
85
|
const seqType = stats.sameLength ? 'SEQ.MSA' : 'SEQ';
|
|
74
86
|
|
|
75
87
|
// TODO: If separator detected, then extra efforts to detect alphabet are allowed.
|
|
76
|
-
const alphabet = BioPackageDetectors.detectAlphabet(stats.freq,
|
|
88
|
+
const alphabet = BioPackageDetectors.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol);
|
|
77
89
|
|
|
78
90
|
// const forbidden = BioPackageDetectors.checkForbiddenWoSeparator(stats.freq);
|
|
79
91
|
if (separator || alphabet != 'UN') {
|
|
@@ -160,7 +172,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
160
172
|
* @param freq frequencies of monomers in sequence set
|
|
161
173
|
* @param candidates an array of pairs [name, monomer set]
|
|
162
174
|
* */
|
|
163
|
-
static detectAlphabet(freq, candidates, gapSymbol) {
|
|
175
|
+
static detectAlphabet(freq, candidates, gapSymbol, cut = 0.55) {
|
|
164
176
|
const candidatesSims = candidates.map((c) => {
|
|
165
177
|
const sim = BioPackageDetectors.getAlphabetSimilarity(freq, c[1], gapSymbol);
|
|
166
178
|
return [c[0], c[1], freq, sim];
|
|
@@ -168,7 +180,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
168
180
|
|
|
169
181
|
let alphabetName;
|
|
170
182
|
const maxSim = Math.max(...candidatesSims.map((cs) => cs[3]));
|
|
171
|
-
if (maxSim >
|
|
183
|
+
if (maxSim > cut) {
|
|
172
184
|
const sim = candidatesSims.find((cs) => cs[3] == maxSim);
|
|
173
185
|
alphabetName = sim[0];
|
|
174
186
|
} else {
|