@datagrok/bio 1.4.2 → 1.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/detectors.js +17 -4
- package/dist/package-test.js +862 -635
- package/dist/package.js +664 -584
- package/dist/vendors-node_modules_datagrok-libraries_ml_src_workers_dimensionality-reducer_js.js +1665 -1651
- package/files/sample_MSA.csv +541 -0
- package/files/samples/id.csv +313 -0
- package/package.json +7 -6
- package/setup.cmd +10 -1
- package/src/package-test.ts +1 -0
- package/src/package.ts +70 -25
- package/src/tests/activity-cliffs-tests.ts +49 -0
- package/src/tests/detectors-test.ts +138 -34
- package/src/tests/sequence-space-test.ts +21 -19
- package/src/tests/utils.ts +9 -3
- package/src/utils/convert.ts +8 -9
- package/src/utils/multiple-sequence-alignment.ts +1 -1
- package/src/utils/sequence-activity-cliffs.ts +36 -0
- package/src/utils/sequence-space.ts +30 -30
package/detectors.js
CHANGED
|
@@ -42,6 +42,8 @@ class BioPackageDetectors extends DG.Package {
|
|
|
42
42
|
if (statsAsChars.sameLength) {
|
|
43
43
|
if (Object.keys(statsAsChars.freq).length > 0) { // require non empty alphabet
|
|
44
44
|
const alphabet = BioPackageDetectors.detectAlphabet(statsAsChars.freq, alphabetCandidates, '-');
|
|
45
|
+
if (alphabet === 'UN') return null;
|
|
46
|
+
|
|
45
47
|
const units = `fasta:SEQ.MSA:${alphabet}`;
|
|
46
48
|
col.setTag(DG.TAGS.UNITS, units);
|
|
47
49
|
return BioPackageDetectors.mmSemType;
|
|
@@ -52,7 +54,10 @@ class BioPackageDetectors extends DG.Package {
|
|
|
52
54
|
const splitter = separator ? BioPackageDetectors.getSplitterWithSeparator(separator) : BioPackageDetectors.splitterAsFasta;
|
|
53
55
|
|
|
54
56
|
const stats = BioPackageDetectors.getStats(col, 5, splitter);
|
|
57
|
+
// Empty monomer alphabet is not allowed
|
|
55
58
|
if (Object.keys(stats.freq).length === 0) return null;
|
|
59
|
+
// Long monomer names for sequences with separators have constraints
|
|
60
|
+
if (separator && BioPackageDetectors.checkForbiddenWithSeparators(stats.freq)) return null;
|
|
56
61
|
|
|
57
62
|
const format = separator ? 'separator' : 'fasta';
|
|
58
63
|
const seqType = stats.sameLength ? 'SEQ.MSA' : 'SEQ';
|
|
@@ -82,9 +87,11 @@ class BioPackageDetectors extends DG.Package {
|
|
|
82
87
|
// !!! But there is a caveat because exceptionally frequent char can be a gap symbol in MSA.
|
|
83
88
|
// !!! What is the difference between the gap symbol and separator symbol in stats terms?
|
|
84
89
|
// const noSeparatorRe = /[a-z\d]+$/i;
|
|
85
|
-
const
|
|
90
|
+
const noSeparatorChemRe = /[HBCNOFPSKVYI]/i; // Mendeleev's periodic table single char elements
|
|
91
|
+
const noSeparatorAlphaDigitRe = /[\dA-Z]/i;
|
|
86
92
|
const cleanFreq = Object.assign({}, ...Object.entries(freq)
|
|
87
|
-
.filter(([m, f]) =>
|
|
93
|
+
.filter(([m, f]) => m != ' ' &&
|
|
94
|
+
!noSeparatorChemRe.test(m) && !noSeparatorAlphaDigitRe.test(m) &&
|
|
88
95
|
!BioPackageDetectors.AminoacidsFastaAlphabet.has(m) &&
|
|
89
96
|
!BioPackageDetectors.NucleotidesFastaAlphabet.has(m))
|
|
90
97
|
.map(([m, f]) => ({[m]: f})));
|
|
@@ -100,6 +107,12 @@ class BioPackageDetectors extends DG.Package {
|
|
|
100
107
|
return sepFreq / otherSumFreq > freqThreshold ? sep : null;
|
|
101
108
|
}
|
|
102
109
|
|
|
110
|
+
/** With a separator, spaces are not allowed in monomer names. */
|
|
111
|
+
static checkForbiddenWithSeparators(freq) {
|
|
112
|
+
const forbiddenRe = /[ ]/i;
|
|
113
|
+
return Object.keys(freq).filter((m) => forbiddenRe.test(m)).length > 0;
|
|
114
|
+
}
|
|
115
|
+
|
|
103
116
|
/** Without a separator, special symbols or digits are not allowed as monomers. */
|
|
104
117
|
static checkForbiddenWoSeparator(freq) {
|
|
105
118
|
const forbiddenRe = /[\d!@#$%^&*()_+\-=\[\]{};':"\\|,.<>\/?]/i;
|
|
@@ -193,9 +206,9 @@ class BioPackageDetectors extends DG.Package {
|
|
|
193
206
|
return seq.split('');
|
|
194
207
|
}
|
|
195
208
|
|
|
196
|
-
static getSplitterWithSeparator(
|
|
209
|
+
static getSplitterWithSeparator(separator) {
|
|
197
210
|
return function(seq) {
|
|
198
|
-
return seq.split(
|
|
211
|
+
return seq.split(separator);
|
|
199
212
|
};
|
|
200
213
|
}
|
|
201
214
|
|