@datagrok-libraries/bio 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/sequence-encoder.js +155 -0
- package/src/sequence-encoder.ts +0 -177
package/package.json
CHANGED
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
import { assert } from '@datagrok-libraries/utils/src/operations';
|
|
2
|
+
class SideChainScales {
|
|
3
|
+
static getAvailableScales() {
|
|
4
|
+
return Object.entries(this.scales).map(([k, _]) => k);
|
|
5
|
+
}
|
|
6
|
+
static getScale(name) {
|
|
7
|
+
assert(!(this.scales[name] === undefined), `Scale '${name}' was not found.`);
|
|
8
|
+
return this.scales[name];
|
|
9
|
+
}
|
|
10
|
+
}
|
|
11
|
+
SideChainScales.scales = {
|
|
12
|
+
// Wimley-White interfacial hydrophobicity scale
|
|
13
|
+
'WimleyWhite': {
|
|
14
|
+
'-': 0,
|
|
15
|
+
'A': 0.17,
|
|
16
|
+
'C': -0.24,
|
|
17
|
+
'D': -0.07,
|
|
18
|
+
'E': -0.01,
|
|
19
|
+
'F': -1.13,
|
|
20
|
+
'G': 0.01,
|
|
21
|
+
'H': 0.17,
|
|
22
|
+
'I': -0.31,
|
|
23
|
+
'K': 0.99,
|
|
24
|
+
'L': -0.56,
|
|
25
|
+
'M': -0.23,
|
|
26
|
+
'N': 0.42,
|
|
27
|
+
'P': 0.45,
|
|
28
|
+
'Q': 0.58,
|
|
29
|
+
'R': 0.81,
|
|
30
|
+
'S': 0.13,
|
|
31
|
+
'T': 0.14,
|
|
32
|
+
'V': 0.07,
|
|
33
|
+
'W': -1.85,
|
|
34
|
+
'Y': -0.94,
|
|
35
|
+
},
|
|
36
|
+
'categorial': {
|
|
37
|
+
'-': 0,
|
|
38
|
+
'A': 1,
|
|
39
|
+
'C': 2,
|
|
40
|
+
'D': 3,
|
|
41
|
+
'E': 4,
|
|
42
|
+
'F': 5,
|
|
43
|
+
'G': 6,
|
|
44
|
+
'H': 7,
|
|
45
|
+
'I': 8,
|
|
46
|
+
'K': 9,
|
|
47
|
+
'L': 10,
|
|
48
|
+
'M': 11,
|
|
49
|
+
'N': 12,
|
|
50
|
+
'P': 13,
|
|
51
|
+
'Q': 14,
|
|
52
|
+
'R': 15,
|
|
53
|
+
'S': 16,
|
|
54
|
+
'T': 17,
|
|
55
|
+
'V': 18,
|
|
56
|
+
'W': 19,
|
|
57
|
+
'Y': 20,
|
|
58
|
+
},
|
|
59
|
+
};
|
|
60
|
+
/**
|
|
61
|
+
* Class for categorial encoding/decoding of an aligned amino acid residue sequence.
|
|
62
|
+
*
|
|
63
|
+
* @export
|
|
64
|
+
* @class AlignedSequenceEncoder
|
|
65
|
+
*/
|
|
66
|
+
export class AlignedSequenceEncoder {
|
|
67
|
+
constructor(scale = 'categorial') {
|
|
68
|
+
this.aa2num = SideChainScales.getScale(scale);
|
|
69
|
+
this.num2aa = {};
|
|
70
|
+
Object.entries(this.aa2num).forEach(([k, v]) => (this.num2aa[v] = k));
|
|
71
|
+
}
|
|
72
|
+
/**
|
|
73
|
+
* Truncates the NH2 and -COOH termini of the given sequence.
|
|
74
|
+
*
|
|
75
|
+
* @static
|
|
76
|
+
* @param {string} seq The sequence provided.
|
|
77
|
+
* @return {string} Truncated sequence.
|
|
78
|
+
* @memberof AlignedSequenceEncoder
|
|
79
|
+
*/
|
|
80
|
+
static _truncateSequence(seq) {
|
|
81
|
+
let start = 0;
|
|
82
|
+
let end = seq.length;
|
|
83
|
+
const termina = ['NH2', 'COOH'];
|
|
84
|
+
if (seq.startsWith(termina[0])) {
|
|
85
|
+
const l = termina[0].length; // Cut only 'NH2' without following '-'.
|
|
86
|
+
assert(seq[l] == '-', `Wrong sequence format: ${termina[0]} without following '-' in '${seq}'.`);
|
|
87
|
+
start = l;
|
|
88
|
+
}
|
|
89
|
+
if (seq.endsWith(termina[1])) {
|
|
90
|
+
const l = termina[1].length + 1; // Cut both 'COOH' and precending '-'.
|
|
91
|
+
assert(seq[end - l] == '-', `Wrong sequence format: ${termina[1]} without '-' precending in '${seq}'.`);
|
|
92
|
+
end -= l;
|
|
93
|
+
}
|
|
94
|
+
return seq.substring(start, end);
|
|
95
|
+
}
|
|
96
|
+
/**
|
|
97
|
+
* Cuts auxiliary hyphens before a residue.
|
|
98
|
+
*
|
|
99
|
+
* @static
|
|
100
|
+
* @param {string} seq The sequence to process.
|
|
101
|
+
* @return {string} Processed sequence.
|
|
102
|
+
* @memberof AlignedSequenceEncoder
|
|
103
|
+
*/
|
|
104
|
+
static _dropDefises(seq) {
|
|
105
|
+
return seq.replace(/(-)([^-]+)/g, '$2');
|
|
106
|
+
}
|
|
107
|
+
/**
|
|
108
|
+
* Truncates the termini and removes auxiliary hyphens.
|
|
109
|
+
*
|
|
110
|
+
* @static
|
|
111
|
+
* @param {string} sequence The sequence to be processed.
|
|
112
|
+
* @return {string} Result of cleaning.
|
|
113
|
+
* @memberof AlignedSequenceEncoder
|
|
114
|
+
*/
|
|
115
|
+
static clean(sequence) {
|
|
116
|
+
return AlignedSequenceEncoder._dropDefises(AlignedSequenceEncoder._truncateSequence(sequence));
|
|
117
|
+
}
|
|
118
|
+
/**
|
|
119
|
+
* Categorial encoding of the provided sequence.
|
|
120
|
+
*
|
|
121
|
+
* @param {string} sequence The sequence.
|
|
122
|
+
* @return {number[]} Encoded vector.
|
|
123
|
+
* @memberof AlignedSequenceEncoder
|
|
124
|
+
*/
|
|
125
|
+
encode(sequence) {
|
|
126
|
+
const nItems = sequence.length;
|
|
127
|
+
const values = new Array(nItems).fill(0);
|
|
128
|
+
for (let i = 0; i < nItems; ++i) {
|
|
129
|
+
const char = sequence[i];
|
|
130
|
+
assert(char in this.aa2num, `Unknown char '${char}' found in sequence '${sequence}'`);
|
|
131
|
+
values[i] = this.encodeLettter(char);
|
|
132
|
+
}
|
|
133
|
+
return values;
|
|
134
|
+
}
|
|
135
|
+
encodeLettter(letter) {
|
|
136
|
+
return this.aa2num[letter];
|
|
137
|
+
}
|
|
138
|
+
/**
|
|
139
|
+
* Decodes the encoded vector back into a sequence.
|
|
140
|
+
*
|
|
141
|
+
* @param {number[]} value The vector encoded.
|
|
142
|
+
* @return {string} Decoded sequence.
|
|
143
|
+
* @memberof AlignedSequenceEncoder
|
|
144
|
+
*/
|
|
145
|
+
decode(value) {
|
|
146
|
+
let s = '';
|
|
147
|
+
for (let i = 0; i < value.length; ++i) {
|
|
148
|
+
const code = value[i];
|
|
149
|
+
assert(code in this.num2aa, `Unknown code '${code}' found in vector '${value}'`);
|
|
150
|
+
s += this.num2aa[code];
|
|
151
|
+
}
|
|
152
|
+
return s;
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoic2VxdWVuY2UtZW5jb2Rlci5qcyIsInNvdXJjZVJvb3QiOiIiLCJzb3VyY2VzIjpbInNlcXVlbmNlLWVuY29kZXIudHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6IkFBQUEsT0FBTyxFQUFDLE1BQU0sRUFBQyxNQUFNLDBDQUEwQyxDQUFDO0FBS2hFLE1BQU0sZUFBZTtJQW1EbkIsTUFBTSxDQUFDLGtCQUFrQjtRQUN2QixPQUFPLE1BQU0sQ0FBQyxPQUFPLENBQUMsSUFBSSxDQUFDLE1BQU0sQ0FBQyxDQUFDLEdBQUcsQ0FBQyxDQUFDLENBQUMsQ0FBQyxFQUFFLENBQUMsQ0FBQyxFQUFFLEVBQUUsQ0FBQyxDQUFDLENBQUMsQ0FBQztJQUN4RCxDQUFDO0lBRUQsTUFBTSxDQUFDLFFBQVEsQ0FBQyxJQUFZO1FBQzFCLE1BQU0sQ0FBQyxDQUFDLENBQUMsSUFBSSxDQUFDLE1BQU0sQ0FBQyxJQUFJLENBQUMsS0FBSyxTQUFTLENBQUMsRUFBRSxVQUFVLElBQUksa0JBQWtCLENBQUMsQ0FBQztRQUM3RSxPQUFPLElBQUksQ0FBQyxNQUFNLENBQUMsSUFBSSxDQUFDLENBQUM7SUFDM0IsQ0FBQzs7QUF6RE0sc0JBQU0sR0FBNkI7SUFDeEMsZ0RBQWdEO0lBQ2hELGFBQWEsRUFBRTtRQUNiLEdBQUcsRUFBRSxDQUFDO1FBQ04sR0FBRyxFQUFFLElBQUk7UUFDVCxHQUFHLEVBQUUsQ0FBQyxJQUFJO1FBQ1YsR0FBRyxFQUFFLENBQUMsSUFBSTtRQUNWLEdBQUcsRUFBRSxDQUFDLElBQUk7UUFDVixHQUFHLEVBQUUsQ0FBQyxJQUFJO1FBQ1YsR0FBRyxFQUFFLElBQUk7UUFDVCxHQUFHLEVBQUUsSUFBSTtRQUNULEdBQUcsRUFBRSxDQUFDLElBQUk7UUFDVixHQUFHLEVBQUUsSUFBSTtRQUNULEdBQUcsRUFBRSxDQUFDLElBQUk7UUFDVixHQUFHLEVBQUUsQ0FBQyxJQUFJO1FBQ1YsR0FBRyxFQUFFLElBQUk7UUFDVCxHQUFHLEVBQUUsSUFBSTtRQUNULEdBQUcsRUFBRSxJQUFJO1FBQ1QsR0FBRyxFQUFFLElBQUk7UUFDVCxHQUFHLEVBQUUsSUFBSTtRQUNULEdBQUcsRUFBRSxJQUFJO1FBQ1QsR0FBRyxFQUFFLElBQUk7UUFDVCxHQUFHLEVBQUUsQ0FBQyxJQUFJO1FBQ1YsR0FBRyxFQUFFLENBQUMsSUFBSTtLQUNYO0lBQ0QsWUFBWSxFQUFFO1FBQ1osR0FBRyxFQUFFLENBQUM7UUFDTixHQUFHLEVBQUUsQ0FBQztRQUNOLEdBQUcsRUFBRSxDQUFDO1FBQ04sR0FBRyxFQUFFLENBQUM7UUFDTixHQUFHLEVBQUUsQ0FBQztRQUNOLEdBQUcsRUFBRSxDQUFDO1FBQ04sR0FBRyxFQUFFLENBQUM7UUFDTixHQUFHLEVBQUUsQ0FBQztRQUNOLEdBQUcsRUFBRSxDQUFDO1FBQ04sR0FBRyxFQUFFLENBQUM7UUFDTixHQUFHLEVBQUUsRUFBRTtRQUNQLEdBQUcsRUFBRSxFQUFFO1FBQ1AsR0FBRyxFQUFFLEVBQUU7UUFDUCxHQUFHLEVBQUUsRUFBRTtRQUNQLEdBQUcsRUFBRSxFQUFFO1FBQ1AsR0FBRyxFQUFFLEVBQUU7UUFDUCxHQUFHLEVBQUUsRUFBRTtRQUNQLEdBQUcsRUFBRSxFQUFFO1FBQ1AsR0FBRyxFQUFFLEVBQUU7UUFDUCxHQUFHLEVBQUUsRUFBRTtRQUNQLE
dBQUcsRUFBRSxFQUFFO0tBQ1I7Q0FDRixDQUFDO0FBWUo7Ozs7O0dBS0c7QUFDSCxNQUFNLE9BQU8sc0JBQXNCO0lBSWpDLFlBQVksUUFBZ0IsWUFBWTtRQUN0QyxJQUFJLENBQUMsTUFBTSxHQUFHLGVBQWUsQ0FBQyxRQUFRLENBQUMsS0FBSyxDQUFDLENBQUM7UUFDOUMsSUFBSSxDQUFDLE1BQU0sR0FBRyxFQUFFLENBQUM7UUFDakIsTUFBTSxDQUFDLE9BQU8sQ0FBQyxJQUFJLENBQUMsTUFBTSxDQUFDLENBQUMsT0FBTyxDQUFDLENBQUMsQ0FBQyxDQUFDLEVBQUUsQ0FBQyxDQUFDLEVBQUUsRUFBRSxDQUFDLENBQUMsSUFBSSxDQUFDLE1BQU0sQ0FBQyxDQUFDLENBQUMsR0FBRyxDQUFDLENBQUMsQ0FBQyxDQUFDO0lBQ3hFLENBQUM7SUFFRDs7Ozs7OztTQU9LO0lBQ0wsTUFBTSxDQUFDLGlCQUFpQixDQUFDLEdBQVc7UUFDbEMsSUFBSSxLQUFLLEdBQUcsQ0FBQyxDQUFDO1FBQ2QsSUFBSSxHQUFHLEdBQUcsR0FBRyxDQUFDLE1BQU0sQ0FBQztRQUNyQixNQUFNLE9BQU8sR0FBRyxDQUFDLEtBQUssRUFBRSxNQUFNLENBQUMsQ0FBQztRQUVoQyxJQUFJLEdBQUcsQ0FBQyxVQUFVLENBQUMsT0FBTyxDQUFDLENBQUMsQ0FBQyxDQUFDLEVBQUU7WUFDOUIsTUFBTSxDQUFDLEdBQUcsT0FBTyxDQUFDLENBQUMsQ0FBQyxDQUFDLE1BQU0sQ0FBQyxDQUFDLHdDQUF3QztZQUNyRSxNQUFNLENBQUMsR0FBRyxDQUFDLENBQUMsQ0FBQyxJQUFJLEdBQUcsRUFBRSwwQkFBMEIsT0FBTyxDQUFDLENBQUMsQ0FBQyw4QkFBOEIsR0FBRyxJQUFJLENBQUMsQ0FBQztZQUNqRyxLQUFLLEdBQUcsQ0FBQyxDQUFDO1NBQ1g7UUFDRCxJQUFJLEdBQUcsQ0FBQyxRQUFRLENBQUMsT0FBTyxDQUFDLENBQUMsQ0FBQyxDQUFDLEVBQUU7WUFDNUIsTUFBTSxDQUFDLEdBQUcsT0FBTyxDQUFDLENBQUMsQ0FBQyxDQUFDLE1BQU0sR0FBQyxDQUFDLENBQUMsQ0FBQyxzQ0FBc0M7WUFDckUsTUFBTSxDQUFDLEdBQUcsQ0FBQyxHQUFHLEdBQUMsQ0FBQyxDQUFDLElBQUksR0FBRyxFQUFFLDBCQUEwQixPQUFPLENBQUMsQ0FBQyxDQUFDLCtCQUErQixHQUFHLElBQUksQ0FBQyxDQUFDO1lBQ3RHLEdBQUcsSUFBSSxDQUFDLENBQUM7U0FDVjtRQUNELE9BQU8sR0FBRyxDQUFDLFNBQVMsQ0FBQyxLQUFLLEVBQUUsR0FBRyxDQUFDLENBQUM7SUFDbkMsQ0FBQztJQUVEOzs7Ozs7O1NBT0s7SUFDTCxNQUFNLENBQUMsWUFBWSxDQUFDLEdBQVc7UUFDN0IsT0FBTyxHQUFHLENBQUMsT0FBTyxDQUFDLGFBQWEsRUFBRSxJQUFJLENBQUMsQ0FBQztJQUMxQyxDQUFDO0lBRUQ7Ozs7Ozs7U0FPSztJQUNMLE1BQU0sQ0FBQyxLQUFLLENBQUMsUUFBZ0I7UUFDM0IsT0FBTyxzQkFBc0IsQ0FBQyxZQUFZLENBQUMsc0JBQXNCLENBQUMsaUJBQWlCLENBQUMsUUFBUSxDQUFDLENBQUMsQ0FBQztJQUNqRyxDQUFDO0lBRUQ7Ozs7OztTQU1LO0lBQ0UsTUFBTSxDQUFDLFFBQWdCO1FBQzVCLE1BQU0sTUFBTSxHQUFHLFFBQVEsQ0FBQyxNQUFNLENBQUM7UUFDL0IsTUFBTSxNQUFNLEdBQUcsSUFBSSxLQU
FLLENBQUMsTUFBTSxDQUFDLENBQUMsSUFBSSxDQUFDLENBQUMsQ0FBQyxDQUFDO1FBRXpDLEtBQUssSUFBSSxDQUFDLEdBQUcsQ0FBQyxFQUFFLENBQUMsR0FBRyxNQUFNLEVBQUUsRUFBRSxDQUFDLEVBQUU7WUFDL0IsTUFBTSxJQUFJLEdBQUcsUUFBUSxDQUFDLENBQUMsQ0FBQyxDQUFDO1lBRXpCLE1BQU0sQ0FBQyxJQUFJLElBQUksSUFBSSxDQUFDLE1BQU0sRUFBRSxpQkFBaUIsSUFBSSx3QkFBd0IsUUFBUSxHQUFHLENBQUMsQ0FBQztZQUV0RixNQUFNLENBQUMsQ0FBQyxDQUFDLEdBQUcsSUFBSSxDQUFDLGFBQWEsQ0FBQyxJQUFJLENBQUMsQ0FBQztTQUN0QztRQUNELE9BQU8sTUFBTSxDQUFDO0lBQ2hCLENBQUM7SUFFTSxhQUFhLENBQUMsTUFBYztRQUNqQyxPQUFPLElBQUksQ0FBQyxNQUFNLENBQUMsTUFBTSxDQUFDLENBQUM7SUFDN0IsQ0FBQztJQUVEOzs7Ozs7U0FNSztJQUNFLE1BQU0sQ0FBQyxLQUFlO1FBQzNCLElBQUksQ0FBQyxHQUFXLEVBQUUsQ0FBQztRQUVuQixLQUFLLElBQUksQ0FBQyxHQUFHLENBQUMsRUFBRSxDQUFDLEdBQUcsS0FBSyxDQUFDLE1BQU0sRUFBRSxFQUFFLENBQUMsRUFBRTtZQUNyQyxNQUFNLElBQUksR0FBRyxLQUFLLENBQUMsQ0FBQyxDQUFDLENBQUM7WUFFdEIsTUFBTSxDQUFDLElBQUksSUFBSSxJQUFJLENBQUMsTUFBTSxFQUFFLGlCQUFpQixJQUFJLHNCQUFzQixLQUFLLEdBQUcsQ0FBQyxDQUFDO1lBRWpGLENBQUMsSUFBSSxJQUFJLENBQUMsTUFBTSxDQUFDLElBQUksQ0FBQyxDQUFDO1NBQ3hCO1FBQ0QsT0FBTyxDQUFDLENBQUM7SUFDWCxDQUFDO0NBQ0YiLCJzb3VyY2VzQ29udGVudCI6WyJpbXBvcnQge2Fzc2VydH0gZnJvbSAnQGRhdGFncm9rLWxpYnJhcmllcy91dGlscy9zcmMvb3BlcmF0aW9ucyc7XHJcblxyXG50eXBlIFNpZGVDaGFpblNjYWxlID0ge1tuYW1lOiBzdHJpbmddOiBudW1iZXJ9O1xyXG50eXBlIFNpZGVDaGFpblNjYWxlQ29sbGVjdGlvbiA9IHtbbmFtZTogc3RyaW5nXTogU2lkZUNoYWluU2NhbGV9O1xyXG5cclxuY2xhc3MgU2lkZUNoYWluU2NhbGVzIHtcclxuICBzdGF0aWMgc2NhbGVzOiBTaWRlQ2hhaW5TY2FsZUNvbGxlY3Rpb24gPSB7XHJcbiAgICAvLyBXaW1sZXktV2hpdGUgaW50ZXJmYWNpYWwgaHlkcm9waG9iaWNpdHkgc2NhbGVcclxuICAgICdXaW1sZXlXaGl0ZSc6IHtcclxuICAgICAgJy0nOiAwLFxyXG4gICAgICAnQSc6IDAuMTcsXHJcbiAgICAgICdDJzogLTAuMjQsXHJcbiAgICAgICdEJzogLTAuMDcsIC8vIEFzcC06IDEuMjNcclxuICAgICAgJ0UnOiAtMC4wMSwgLy8gR2x1LTogMi4wMlxyXG4gICAgICAnRic6IC0xLjEzLCAvL1xyXG4gICAgICAnRyc6IDAuMDEsXHJcbiAgICAgICdIJzogMC4xNywgLy8gSGlzKzogMC45NlxyXG4gICAgICAnSSc6IC0wLjMxLFxyXG4gICAgICAnSyc6IDAuOTksIC8vIEx5cytcclxuICAgICAgJ0wnOiAtMC41NixcclxuICAgICAgJ00nOiAtMC4yMyxcclxuICAgICAgJ04nOiAwLjQyLFxyXG4gICAgICAnUCc6ID
AuNDUsXHJcbiAgICAgICdRJzogMC41OCxcclxuICAgICAgJ1InOiAwLjgxLCAvLyBBcmcrXHJcbiAgICAgICdTJzogMC4xMyxcclxuICAgICAgJ1QnOiAwLjE0LFxyXG4gICAgICAnVic6IDAuMDcsXHJcbiAgICAgICdXJzogLTEuODUsXHJcbiAgICAgICdZJzogLTAuOTQsXHJcbiAgICB9LFxyXG4gICAgJ2NhdGVnb3JpYWwnOiB7XHJcbiAgICAgICctJzogMCxcclxuICAgICAgJ0EnOiAxLFxyXG4gICAgICAnQyc6IDIsXHJcbiAgICAgICdEJzogMyxcclxuICAgICAgJ0UnOiA0LFxyXG4gICAgICAnRic6IDUsXHJcbiAgICAgICdHJzogNixcclxuICAgICAgJ0gnOiA3LFxyXG4gICAgICAnSSc6IDgsXHJcbiAgICAgICdLJzogOSxcclxuICAgICAgJ0wnOiAxMCxcclxuICAgICAgJ00nOiAxMSxcclxuICAgICAgJ04nOiAxMixcclxuICAgICAgJ1AnOiAxMyxcclxuICAgICAgJ1EnOiAxNCxcclxuICAgICAgJ1InOiAxNSxcclxuICAgICAgJ1MnOiAxNixcclxuICAgICAgJ1QnOiAxNyxcclxuICAgICAgJ1YnOiAxOCxcclxuICAgICAgJ1cnOiAxOSxcclxuICAgICAgJ1knOiAyMCxcclxuICAgIH0sXHJcbiAgfTtcclxuXHJcbiAgc3RhdGljIGdldEF2YWlsYWJsZVNjYWxlcygpOiBzdHJpbmdbXSB7XHJcbiAgICByZXR1cm4gT2JqZWN0LmVudHJpZXModGhpcy5zY2FsZXMpLm1hcCgoW2ssIF9dKSA9PiBrKTtcclxuICB9XHJcblxyXG4gIHN0YXRpYyBnZXRTY2FsZShuYW1lOiBzdHJpbmcpOiBTaWRlQ2hhaW5TY2FsZSB7XHJcbiAgICBhc3NlcnQoISh0aGlzLnNjYWxlc1tuYW1lXSA9PT0gdW5kZWZpbmVkKSwgYFNjYWxlICcke25hbWV9JyB3YXMgbm90IGZvdW5kLmApO1xyXG4gICAgcmV0dXJuIHRoaXMuc2NhbGVzW25hbWVdO1xyXG4gIH1cclxufVxyXG5cclxuLyoqXHJcbiAqIENsYXNzIHRvIGNhdGVnb3JpYWwgZW5jb2RlL2RlY29kZSBhbGlnbmVkIGFtaW5vIGFjaWQgcmVzaWR1ZXMgc2VxdWVuY2UuXHJcbiAqXHJcbiAqIEBleHBvcnRcclxuICogQGNsYXNzIEFsaWduZWRTZXF1ZW5jZUVuY29kZXJcclxuICovXHJcbmV4cG9ydCBjbGFzcyBBbGlnbmVkU2VxdWVuY2VFbmNvZGVyIHtcclxuICBwcm90ZWN0ZWQgYWEybnVtOiBTaWRlQ2hhaW5TY2FsZTtcclxuICBwcm90ZWN0ZWQgbnVtMmFhOiB7W2NvZGU6IG51bWJlcl06IHN0cmluZ307XHJcblxyXG4gIGNvbnN0cnVjdG9yKHNjYWxlOiBzdHJpbmcgPSAnY2F0ZWdvcmlhbCcpIHtcclxuICAgIHRoaXMuYWEybnVtID0gU2lkZUNoYWluU2NhbGVzLmdldFNjYWxlKHNjYWxlKTtcclxuICAgIHRoaXMubnVtMmFhID0ge307XHJcbiAgICBPYmplY3QuZW50cmllcyh0aGlzLmFhMm51bSkuZm9yRWFjaCgoW2ssIHZdKSA9PiAodGhpcy5udW0yYWFbdl0gPSBrKSk7XHJcbiAgfVxyXG5cclxuICAvKipcclxuICAgICAqIFRydW5jYXRlIE5IMiBhbmQgLUNPT0ggdGVybWluYWxzIG9mIHRoZSBnaXZlbiBzZXF1ZW5jZS5cclxuICAgICAqXHJcbiAgICAgKiBAc3RhdGljXHJcbiAgICAgKiBAcGFyYW
0ge3N0cmluZ30gc2VxIFRoZSBzZXF1ZW5jZSBwcm92aWRlZC5cclxuICAgICAqIEByZXR1cm4ge3N0cmluZ30gVHJ1bmNhdGVkIHNlcXVlbmNlLlxyXG4gICAgICogQG1lbWJlcm9mIEFsaWduZWRTZXF1ZW5jZUVuY29kZXJcclxuICAgICAqL1xyXG4gIHN0YXRpYyBfdHJ1bmNhdGVTZXF1ZW5jZShzZXE6IHN0cmluZyk6IHN0cmluZyB7XHJcbiAgICBsZXQgc3RhcnQgPSAwO1xyXG4gICAgbGV0IGVuZCA9IHNlcS5sZW5ndGg7XHJcbiAgICBjb25zdCB0ZXJtaW5hID0gWydOSDInLCAnQ09PSCddO1xyXG5cclxuICAgIGlmIChzZXEuc3RhcnRzV2l0aCh0ZXJtaW5hWzBdKSkge1xyXG4gICAgICBjb25zdCBsID0gdGVybWluYVswXS5sZW5ndGg7IC8vIEN1dCBvbmx5ICdOSDInIHdpdGhvdXQgZm9sbG93aW5nICctJy5cclxuICAgICAgYXNzZXJ0KHNlcVtsXSA9PSAnLScsIGBXcm9uZyBzZXF1ZW5jZSBmb3JtYXQ6ICR7dGVybWluYVswXX0gd2l0aG91dCBmb2xsb3dpbmcgJy0nIGluICcke3NlcX0nLmApO1xyXG4gICAgICBzdGFydCA9IGw7XHJcbiAgICB9XHJcbiAgICBpZiAoc2VxLmVuZHNXaXRoKHRlcm1pbmFbMV0pKSB7XHJcbiAgICAgIGNvbnN0IGwgPSB0ZXJtaW5hWzFdLmxlbmd0aCsxOyAvLyBDdXQgYm90aCAnQ09PSCcgYW5kIHByZWNlbmRpbmcgJy0nLlxyXG4gICAgICBhc3NlcnQoc2VxW2VuZC1sXSA9PSAnLScsIGBXcm9uZyBzZXF1ZW5jZSBmb3JtYXQ6ICR7dGVybWluYVsxXX0gd2l0aG91dCAnLScgcHJlY2VuZGluZyBpbiAnJHtzZXF9Jy5gKTtcclxuICAgICAgZW5kIC09IGw7XHJcbiAgICB9XHJcbiAgICByZXR1cm4gc2VxLnN1YnN0cmluZyhzdGFydCwgZW5kKTtcclxuICB9XHJcblxyXG4gIC8qKlxyXG4gICAgICogQ3V0cyBhdXhpbGlhcnkgZGVmaXNlcyBiZWZvcmUgYSByZXNpZHVlLlxyXG4gICAgICpcclxuICAgICAqIEBzdGF0aWNcclxuICAgICAqIEBwYXJhbSB7c3RyaW5nfSBzZXEgVGhlIHNlcXVlbmNlIHRvIHByb2Nlc3MuXHJcbiAgICAgKiBAcmV0dXJuIHtzdHJpbmd9IFByb2Nlc3NlZCBzZXF1ZW5jZS5cclxuICAgICAqIEBtZW1iZXJvZiBBbGlnbmVkU2VxdWVuY2VFbmNvZGVyXHJcbiAgICAgKi9cclxuICBzdGF0aWMgX2Ryb3BEZWZpc2VzKHNlcTogc3RyaW5nKTogc3RyaW5nIHtcclxuICAgIHJldHVybiBzZXEucmVwbGFjZSgvKC0pKFteLV0rKS9nLCAnJDInKTtcclxuICB9XHJcblxyXG4gIC8qKlxyXG4gICAgICogUGVyZm9ybXMgdHJ1bmNhdGlvbiBhbmQgY3V0dGluZyBhdXhpbGlhcnkgZGVmaXNlcy5cclxuICAgICAqXHJcbiAgICAgKiBAc3RhdGljXHJcbiAgICAgKiBAcGFyYW0ge3N0cmluZ30gc2VxdWVuY2UgVGhlIHNlcXVlbmNlIHdvcmsgdW5kZXIgcHJvY2Vzcy5cclxuICAgICAqIEByZXR1cm4ge3N0cmluZ30gUmVzdWx0IG9mIGNsZWFuaW5nLlxyXG4gICAgICogQG1lbWJlcm9mIEFsaWduZWRTZXF1ZW5jZUVuY29kZXJcclxuICAgICAqL1xyXG4gIHN0YXRpYyBjbGVhbihzZXF1ZW5jZTogc3
RyaW5nKTogc3RyaW5nIHtcclxuICAgIHJldHVybiBBbGlnbmVkU2VxdWVuY2VFbmNvZGVyLl9kcm9wRGVmaXNlcyhBbGlnbmVkU2VxdWVuY2VFbmNvZGVyLl90cnVuY2F0ZVNlcXVlbmNlKHNlcXVlbmNlKSk7XHJcbiAgfVxyXG5cclxuICAvKipcclxuICAgICAqIENhdGVnb3JpYWwgZW5jb2RlIG9mIHRoZSBzZXF1ZW5jZSBwcm92aWRlZC5cclxuICAgICAqXHJcbiAgICAgKiBAcGFyYW0ge3N0cmluZ30gc2VxdWVuY2UgVGhlIHNlcXVlbmNlLlxyXG4gICAgICogQHJldHVybiB7bnVtYmVyW119IEVuY29kZWQgdmVjdG9yLlxyXG4gICAgICogQG1lbWJlcm9mIEFsaWduZWRTZXF1ZW5jZUVuY29kZXJcclxuICAgICAqL1xyXG4gIHB1YmxpYyBlbmNvZGUoc2VxdWVuY2U6IHN0cmluZyk6IG51bWJlcltdIHtcclxuICAgIGNvbnN0IG5JdGVtcyA9IHNlcXVlbmNlLmxlbmd0aDtcclxuICAgIGNvbnN0IHZhbHVlcyA9IG5ldyBBcnJheShuSXRlbXMpLmZpbGwoMCk7XHJcblxyXG4gICAgZm9yIChsZXQgaSA9IDA7IGkgPCBuSXRlbXM7ICsraSkge1xyXG4gICAgICBjb25zdCBjaGFyID0gc2VxdWVuY2VbaV07XHJcblxyXG4gICAgICBhc3NlcnQoY2hhciBpbiB0aGlzLmFhMm51bSwgYFVua25vd24gY2hhciAnJHtjaGFyfScgZm91bmQgaW4gc2VxdWVuY2UgJyR7c2VxdWVuY2V9J2ApO1xyXG5cclxuICAgICAgdmFsdWVzW2ldID0gdGhpcy5lbmNvZGVMZXR0dGVyKGNoYXIpO1xyXG4gICAgfVxyXG4gICAgcmV0dXJuIHZhbHVlcztcclxuICB9XHJcblxyXG4gIHB1YmxpYyBlbmNvZGVMZXR0dGVyKGxldHRlcjogc3RyaW5nKTogbnVtYmVyIHtcclxuICAgIHJldHVybiB0aGlzLmFhMm51bVtsZXR0ZXJdO1xyXG4gIH1cclxuXHJcbiAgLyoqXHJcbiAgICAgKiBEZWNvZGUgdGhlIGVuY29kZWQgdmVjdG9yIGludG8gdGhlIHNlcXVlbmNlIGJhY2suXHJcbiAgICAgKlxyXG4gICAgICogQHBhcmFtIHtudW1iZXJbXX0gdmFsdWUgVGhlIHZlY3RvciBlbmNvZGVkLlxyXG4gICAgICogQHJldHVybiB7c3RyaW5nfSBEZWNvZGVkIHNlcXVlbmNlLlxyXG4gICAgICogQG1lbWJlcm9mIEFsaWduZWRTZXF1ZW5jZUVuY29kZXJcclxuICAgICAqL1xyXG4gIHB1YmxpYyBkZWNvZGUodmFsdWU6IG51bWJlcltdKTogc3RyaW5nIHtcclxuICAgIGxldCBzOiBzdHJpbmcgPSAnJztcclxuXHJcbiAgICBmb3IgKGxldCBpID0gMDsgaSA8IHZhbHVlLmxlbmd0aDsgKytpKSB7XHJcbiAgICAgIGNvbnN0IGNvZGUgPSB2YWx1ZVtpXTtcclxuXHJcbiAgICAgIGFzc2VydChjb2RlIGluIHRoaXMubnVtMmFhLCBgVW5rbm93biBjb2RlICcke2NvZGV9JyBmb3VuZCBpbiB2ZWN0b3IgJyR7dmFsdWV9J2ApO1xyXG5cclxuICAgICAgcyArPSB0aGlzLm51bTJhYVtjb2RlXTtcclxuICAgIH1cclxuICAgIHJldHVybiBzO1xyXG4gIH1cclxufVxyXG4iXX0=
|
package/src/sequence-encoder.ts
DELETED
|
@@ -1,177 +0,0 @@
|
|
|
1
|
-
import {assert} from '@datagrok-libraries/utils/src/operations';
|
|
2
|
-
|
|
3
|
-
type SideChainScale = {[name: string]: number};
|
|
4
|
-
type SideChainScaleCollection = {[name: string]: SideChainScale};
|
|
5
|
-
|
|
6
|
-
class SideChainScales {
|
|
7
|
-
static scales: SideChainScaleCollection = {
|
|
8
|
-
// Wimley-White interfacial hydrophobicity scale
|
|
9
|
-
'WimleyWhite': {
|
|
10
|
-
'-': 0,
|
|
11
|
-
'A': 0.17,
|
|
12
|
-
'C': -0.24,
|
|
13
|
-
'D': -0.07, // Asp-: 1.23
|
|
14
|
-
'E': -0.01, // Glu-: 2.02
|
|
15
|
-
'F': -1.13, //
|
|
16
|
-
'G': 0.01,
|
|
17
|
-
'H': 0.17, // His+: 0.96
|
|
18
|
-
'I': -0.31,
|
|
19
|
-
'K': 0.99, // Lys+
|
|
20
|
-
'L': -0.56,
|
|
21
|
-
'M': -0.23,
|
|
22
|
-
'N': 0.42,
|
|
23
|
-
'P': 0.45,
|
|
24
|
-
'Q': 0.58,
|
|
25
|
-
'R': 0.81, // Arg+
|
|
26
|
-
'S': 0.13,
|
|
27
|
-
'T': 0.14,
|
|
28
|
-
'V': 0.07,
|
|
29
|
-
'W': -1.85,
|
|
30
|
-
'Y': -0.94,
|
|
31
|
-
},
|
|
32
|
-
'categorial': {
|
|
33
|
-
'-': 0,
|
|
34
|
-
'A': 1,
|
|
35
|
-
'C': 2,
|
|
36
|
-
'D': 3,
|
|
37
|
-
'E': 4,
|
|
38
|
-
'F': 5,
|
|
39
|
-
'G': 6,
|
|
40
|
-
'H': 7,
|
|
41
|
-
'I': 8,
|
|
42
|
-
'K': 9,
|
|
43
|
-
'L': 10,
|
|
44
|
-
'M': 11,
|
|
45
|
-
'N': 12,
|
|
46
|
-
'P': 13,
|
|
47
|
-
'Q': 14,
|
|
48
|
-
'R': 15,
|
|
49
|
-
'S': 16,
|
|
50
|
-
'T': 17,
|
|
51
|
-
'V': 18,
|
|
52
|
-
'W': 19,
|
|
53
|
-
'Y': 20,
|
|
54
|
-
},
|
|
55
|
-
};
|
|
56
|
-
|
|
57
|
-
static getAvailableScales(): string[] {
|
|
58
|
-
return Object.entries(this.scales).map(([k, _]) => k);
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
static getScale(name: string): SideChainScale {
|
|
62
|
-
assert(!(this.scales[name] === undefined), `Scale '${name}' was not found.`);
|
|
63
|
-
return this.scales[name];
|
|
64
|
-
}
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
/**
|
|
68
|
-
* Class for categorial encoding/decoding of an aligned amino acid residue sequence.
|
|
69
|
-
*
|
|
70
|
-
* @export
|
|
71
|
-
* @class AlignedSequenceEncoder
|
|
72
|
-
*/
|
|
73
|
-
export class AlignedSequenceEncoder {
|
|
74
|
-
protected aa2num: SideChainScale;
|
|
75
|
-
protected num2aa: {[code: number]: string};
|
|
76
|
-
|
|
77
|
-
constructor(scale: string = 'categorial') {
|
|
78
|
-
this.aa2num = SideChainScales.getScale(scale);
|
|
79
|
-
this.num2aa = {};
|
|
80
|
-
Object.entries(this.aa2num).forEach(([k, v]) => (this.num2aa[v] = k));
|
|
81
|
-
}
|
|
82
|
-
|
|
83
|
-
/**
|
|
84
|
-
* Truncates the NH2 and -COOH termini of the given sequence.
|
|
85
|
-
*
|
|
86
|
-
* @static
|
|
87
|
-
* @param {string} seq The sequence provided.
|
|
88
|
-
* @return {string} Truncated sequence.
|
|
89
|
-
* @memberof AlignedSequenceEncoder
|
|
90
|
-
*/
|
|
91
|
-
static _truncateSequence(seq: string): string {
|
|
92
|
-
let start = 0;
|
|
93
|
-
let end = seq.length;
|
|
94
|
-
const termina = ['NH2', 'COOH'];
|
|
95
|
-
|
|
96
|
-
if (seq.startsWith(termina[0])) {
|
|
97
|
-
const l = termina[0].length; // Cut only 'NH2' without following '-'.
|
|
98
|
-
assert(seq[l] == '-', `Wrong sequence format: ${termina[0]} without following '-' in '${seq}'.`);
|
|
99
|
-
start = l;
|
|
100
|
-
}
|
|
101
|
-
if (seq.endsWith(termina[1])) {
|
|
102
|
-
const l = termina[1].length+1; // Cut both 'COOH' and precending '-'.
|
|
103
|
-
assert(seq[end-l] == '-', `Wrong sequence format: ${termina[1]} without '-' precending in '${seq}'.`);
|
|
104
|
-
end -= l;
|
|
105
|
-
}
|
|
106
|
-
return seq.substring(start, end);
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
/**
|
|
110
|
-
* Cuts auxiliary hyphens before a residue.
|
|
111
|
-
*
|
|
112
|
-
* @static
|
|
113
|
-
* @param {string} seq The sequence to process.
|
|
114
|
-
* @return {string} Processed sequence.
|
|
115
|
-
* @memberof AlignedSequenceEncoder
|
|
116
|
-
*/
|
|
117
|
-
static _dropDefises(seq: string): string {
|
|
118
|
-
return seq.replace(/(-)([^-]+)/g, '$2');
|
|
119
|
-
}
|
|
120
|
-
|
|
121
|
-
/**
|
|
122
|
-
* Truncates the termini and removes auxiliary hyphens.
|
|
123
|
-
*
|
|
124
|
-
* @static
|
|
125
|
-
* @param {string} sequence The sequence to be processed.
|
|
126
|
-
* @return {string} Result of cleaning.
|
|
127
|
-
* @memberof AlignedSequenceEncoder
|
|
128
|
-
*/
|
|
129
|
-
static clean(sequence: string): string {
|
|
130
|
-
return AlignedSequenceEncoder._dropDefises(AlignedSequenceEncoder._truncateSequence(sequence));
|
|
131
|
-
}
|
|
132
|
-
|
|
133
|
-
/**
|
|
134
|
-
* Categorial encoding of the provided sequence.
|
|
135
|
-
*
|
|
136
|
-
* @param {string} sequence The sequence.
|
|
137
|
-
* @return {number[]} Encoded vector.
|
|
138
|
-
* @memberof AlignedSequenceEncoder
|
|
139
|
-
*/
|
|
140
|
-
public encode(sequence: string): number[] {
|
|
141
|
-
const nItems = sequence.length;
|
|
142
|
-
const values = new Array(nItems).fill(0);
|
|
143
|
-
|
|
144
|
-
for (let i = 0; i < nItems; ++i) {
|
|
145
|
-
const char = sequence[i];
|
|
146
|
-
|
|
147
|
-
assert(char in this.aa2num, `Unknown char '${char}' found in sequence '${sequence}'`);
|
|
148
|
-
|
|
149
|
-
values[i] = this.encodeLettter(char);
|
|
150
|
-
}
|
|
151
|
-
return values;
|
|
152
|
-
}
|
|
153
|
-
|
|
154
|
-
public encodeLettter(letter: string): number {
|
|
155
|
-
return this.aa2num[letter];
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
/**
|
|
159
|
-
* Decodes the encoded vector back into a sequence.
|
|
160
|
-
*
|
|
161
|
-
* @param {number[]} value The vector encoded.
|
|
162
|
-
* @return {string} Decoded sequence.
|
|
163
|
-
* @memberof AlignedSequenceEncoder
|
|
164
|
-
*/
|
|
165
|
-
public decode(value: number[]): string {
|
|
166
|
-
let s: string = '';
|
|
167
|
-
|
|
168
|
-
for (let i = 0; i < value.length; ++i) {
|
|
169
|
-
const code = value[i];
|
|
170
|
-
|
|
171
|
-
assert(code in this.num2aa, `Unknown code '${code}' found in vector '${value}'`);
|
|
172
|
-
|
|
173
|
-
s += this.num2aa[code];
|
|
174
|
-
}
|
|
175
|
-
return s;
|
|
176
|
-
}
|
|
177
|
-
}
|