@datagrok/bio 2.1.12 → 2.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -12
- package/css/helm.css +10 -0
- package/detectors.js +83 -59
- package/dist/package-test.js +2 -13168
- package/dist/package-test.js.map +1 -0
- package/dist/package.js +2 -10560
- package/dist/package.js.map +1 -0
- package/dockerfiles/Dockerfile +86 -0
- package/files/icons/composition-analysis.svg +17 -0
- package/files/icons/sequence-diversity-viewer.svg +4 -0
- package/files/icons/sequence-similarity-viewer.svg +4 -0
- package/files/icons/vdregions-viewer.svg +22 -0
- package/files/icons/weblogo-viewer.svg +7 -0
- package/files/tests/testUrl.csv +11 -0
- package/files/tests/toAtomicLevelTest.csv +4 -0
- package/package.json +24 -25
- package/src/analysis/sequence-activity-cliffs.ts +11 -9
- package/src/analysis/sequence-search-base-viewer.ts +2 -1
- package/src/analysis/sequence-similarity-viewer.ts +3 -3
- package/src/analysis/sequence-space.ts +2 -1
- package/src/calculations/monomerLevelMols.ts +4 -4
- package/src/package-test.ts +9 -2
- package/src/package.ts +215 -131
- package/src/substructure-search/substructure-search.ts +19 -16
- package/src/tests/Palettes-test.ts +1 -1
- package/src/tests/WebLogo-positions-test.ts +113 -57
- package/src/tests/_first-tests.ts +9 -0
- package/src/tests/activity-cliffs-tests.ts +8 -7
- package/src/tests/activity-cliffs-utils.ts +17 -9
- package/src/tests/bio-tests.ts +4 -5
- package/src/tests/checkInputColumn-tests.ts +1 -1
- package/src/tests/converters-test.ts +52 -17
- package/src/tests/detectors-benchmark-tests.ts +3 -2
- package/src/tests/detectors-tests.ts +177 -172
- package/src/tests/fasta-export-tests.ts +1 -1
- package/src/tests/monomer-libraries-tests.ts +34 -0
- package/src/tests/pepsea-tests.ts +21 -0
- package/src/tests/renderers-test.ts +21 -19
- package/src/tests/sequence-space-test.ts +6 -4
- package/src/tests/similarity-diversity-tests.ts +4 -4
- package/src/tests/splitters-test.ts +4 -5
- package/src/tests/substructure-filters-tests.ts +23 -1
- package/src/tests/utils/sequences-generators.ts +1 -1
- package/src/tests/utils.ts +2 -1
- package/src/tests/viewers.ts +16 -0
- package/src/utils/cell-renderer.ts +88 -35
- package/src/utils/constants.ts +7 -6
- package/src/utils/convert.ts +8 -2
- package/src/utils/monomer-lib.ts +174 -0
- package/src/utils/multiple-sequence-alignment.ts +44 -20
- package/src/utils/pepsea.ts +78 -0
- package/src/utils/save-as-fasta.ts +2 -1
- package/src/utils/ui-utils.ts +15 -3
- package/src/viewers/vd-regions-viewer.ts +113 -72
- package/src/viewers/web-logo-viewer.ts +1031 -0
- package/src/widgets/bio-substructure-filter.ts +38 -24
- package/tsconfig.json +71 -72
- package/webpack.config.js +4 -11
- package/dist/vendors-node_modules_datagrok-libraries_ml_src_workers_dimensionality-reducer_js.js +0 -9039
package/README.md
CHANGED
|
@@ -4,7 +4,7 @@ Bio is a bioinformatics support [package](https://datagrok.ai/help/develop/devel
|
|
|
4
4
|
[Datagrok](https://datagrok.ai) platform with an extensive toolset supporting SAR analysis for small molecules
|
|
5
5
|
and antibodies.
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
## Notations
|
|
8
8
|
|
|
9
9
|
[@datagrok/bio](https://github.com/datagrok-ai/public/tree/master/packages/Bio) can ingest data in multiple file
|
|
10
10
|
formats (such as fasta or csv) and multiple notations for natural and modified residues, aligned and non-aligned forms,
|
|
@@ -18,7 +18,7 @@ See:
|
|
|
18
18
|
* [detectMacromolecule()](../Bio/detectors.js)
|
|
19
19
|
* [class NotationConverter](../../libraries/bio/src/utils/notation-converter.ts)
|
|
20
20
|
|
|
21
|
-
|
|
21
|
+
## Atomic-Level structures from sequences
|
|
22
22
|
|
|
23
23
|
For linear sequences, the linear form (see the illustration below) of molecules is reproduced. This is useful
|
|
24
24
|
for better visual inspection of sequence and duplex comparison. Structure at atomic level could be saved in available
|
|
@@ -34,11 +34,10 @@ See:
|
|
|
34
34
|
|
|
35
35
|
* [getMolfilesFromSeq()](./src/utils/atomic-works.ts)
|
|
36
36
|
|
|
37
|
-
|
|
37
|
+
## MSA
|
|
38
38
|
|
|
39
39
|
For multiple-sequence alignment, Datagrok uses the “kalign” that relies on Wu-Manber string-matching algorithm
|
|
40
|
-
[Lassmann, Timo. _Kalign 3: multiple sequence alignment of large data sets._ **Bioinformatics** (2019).
|
|
41
|
-
https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btz795/30314127/btz795.pdf)].
|
|
40
|
+
[Lassmann, Timo. _Kalign 3: multiple sequence alignment of large data sets._ **Bioinformatics** (2019).pdf](https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btz795/30314127/btz795.pdf).
|
|
42
41
|
“kalign” is suited for sequences containing only natural monomers. Sequences of a particular column can be analyzed
|
|
43
42
|
using MSA algorithm available at the top menu. Aligned sequences can be inspected for base composition
|
|
44
43
|
at the position of MSA result.
|
|
@@ -52,7 +51,7 @@ See:
|
|
|
52
51
|
|
|
53
52
|
TODO: MSA with PepSeA
|
|
54
53
|
|
|
55
|
-
|
|
54
|
+
## Splitting to monomers
|
|
56
55
|
|
|
57
56
|
Splitting to monomers allows splitting aligned sequences in separate monomers.
|
|
58
57
|
|
|
@@ -62,7 +61,7 @@ See:
|
|
|
62
61
|
|
|
63
62
|
* [splitAlignedSequences()](../../libraries/bio/src/utils/splitter.ts)
|
|
64
63
|
|
|
65
|
-
|
|
64
|
+
## Web Logo
|
|
66
65
|
|
|
67
66
|
Web Logo visualizes a graphical representation of multiple sequence alignment (amino acids or nucleotides or
|
|
68
67
|
modified residues with multi-char labels). Each logo consists of stacks of symbols, one for each position
|
|
@@ -81,13 +80,13 @@ You can customize the look of the viewer with properties. Properties ```startPos
|
|
|
81
80
|
allow to display multiple alignment partially. If property ```startPosition``` (```endPosition```)
|
|
82
81
|
is not specified, then the Logo will be plotted from the first (till the last) position of sequences.
|
|
83
82
|
|
|
84
|
-
|
|
83
|
+
### General
|
|
85
84
|
|
|
86
85
|
| | |
|
|
87
86
|
|-------------|--------------|
|
|
88
87
|
| Right click | Context menu |
|
|
89
88
|
|
|
90
|
-
|
|
89
|
+
### Properties
|
|
91
90
|
|
|
92
91
|
| Property name | Default | Description |
|
|
93
92
|
|----------------------|----------|-------------------------------------------------------------------------------------------------------------------------|
|
|
@@ -116,7 +115,7 @@ See also:
|
|
|
116
115
|
* [Viewers](../../help/visualize/viewers.md)
|
|
117
116
|
* [Table view](../../help/datagrok/table-view.md)
|
|
118
117
|
|
|
119
|
-
|
|
118
|
+
## Sequence space
|
|
120
119
|
|
|
121
120
|
Datagrok allows visualizing multidimensional sequence space using a dimensionality reduction approach.
|
|
122
121
|
Several distance-based dimensionality reduction algorithms are available, such as UMAP or t-SNE.
|
|
@@ -132,7 +131,7 @@ See:
|
|
|
132
131
|
|
|
133
132
|
* [sequenceSpace()](src/utils/sequence-space.ts)
|
|
134
133
|
|
|
135
|
-
|
|
134
|
+
## Sequence activity cliffs
|
|
136
135
|
|
|
137
136
|
Activity cliffs tool finds pairs of sequences where small changes in the sequence yield significant
|
|
138
137
|
changes in activity or any other numerical property. Open the tool from the top menu.
|
|
@@ -145,4 +144,4 @@ To launch the analysis from the top menu, select Bio | Sequence Activity Cliffs.
|
|
|
145
144
|
|
|
146
145
|
See:
|
|
147
146
|
|
|
148
|
-
* [getActivityCliffs()](../../libraries/ml/src/viewers/activity-cliffs.ts)
|
|
147
|
+
* [getActivityCliffs()](../../libraries/ml/src/viewers/activity-cliffs.ts)
|
package/css/helm.css
CHANGED
|
@@ -1,3 +1,13 @@
|
|
|
1
1
|
.d4-g-cell[semType="Macromolecule"] * {
|
|
2
2
|
pointer-events: none !important;
|
|
3
3
|
}
|
|
4
|
+
|
|
5
|
+
.helm-substructure-filter {
|
|
6
|
+
border: 1px solid var(--grey-2);
|
|
7
|
+
height: 25px;
|
|
8
|
+
display: flex;
|
|
9
|
+
justify-content: center;
|
|
10
|
+
align-items: center;
|
|
11
|
+
padding-left: 0px;
|
|
12
|
+
margin-left: 0px;
|
|
13
|
+
}
|
package/detectors.js
CHANGED
|
@@ -45,23 +45,28 @@ const isUrlRe = /[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9
|
|
|
45
45
|
|
|
46
46
|
class BioPackageDetectors extends DG.Package {
|
|
47
47
|
|
|
48
|
-
|
|
48
|
+
/** Parts of the column name required in the column's name under the detector. It must be in lowercase. */
|
|
49
|
+
requiredColNamePartList = ['seq', 'msa', 'dna', 'rna', 'fasta', 'helm', 'sense', 'protein'];
|
|
50
|
+
|
|
51
|
+
peptideFastaAlphabet = new Set([
|
|
49
52
|
'G', 'L', 'Y', 'S', 'E', 'Q', 'D', 'N', 'F', 'A',
|
|
50
53
|
'K', 'R', 'H', 'C', 'V', 'P', 'W', 'I', 'M', 'T',
|
|
51
54
|
'MeNle', 'MeA', 'MeG', 'MeF',
|
|
52
55
|
]);
|
|
53
56
|
|
|
54
|
-
|
|
57
|
+
dnaFastaAlphabet = new Set(['A', 'C', 'G', 'T']);
|
|
58
|
+
|
|
59
|
+
rnaFastaAlphabet = new Set(['A', 'C', 'G', 'U']);
|
|
55
60
|
|
|
56
|
-
|
|
61
|
+
numbersRawAlphabet = new Set(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']);
|
|
57
62
|
|
|
58
|
-
|
|
63
|
+
smilesRawAlphabet = new Set([
|
|
59
64
|
'A', 'B', 'C', 'E', 'F', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'Z',
|
|
60
65
|
'a', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'r', 's', 't', 'u',
|
|
61
66
|
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
|
|
62
67
|
'+', '-', '.', , '/', '\\', '@', '[', ']', '(', ')', '#', '%', '=']);
|
|
63
68
|
|
|
64
|
-
|
|
69
|
+
smartsRawAlphabet = new Set([
|
|
65
70
|
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
|
|
66
71
|
'!', '#', '$', '&', '(', ')', '*', '+', ',', '-', '.', ':', ';', '=', '@', '~', '[', ']',
|
|
67
72
|
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
|
|
@@ -83,6 +88,11 @@ class BioPackageDetectors extends DG.Package {
|
|
|
83
88
|
detectMacromolecule(col) {
|
|
84
89
|
const t1 = Date.now();
|
|
85
90
|
try {
|
|
91
|
+
const colName = col.name;
|
|
92
|
+
if (!this.requiredColNamePartList.some(
|
|
93
|
+
(requiredColNamePart) => colName.toLowerCase().includes(requiredColNamePart),
|
|
94
|
+
)) return null;
|
|
95
|
+
|
|
86
96
|
// Fail early
|
|
87
97
|
if (col.type !== DG.TYPE.STRING) return null;
|
|
88
98
|
|
|
@@ -92,7 +102,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
92
102
|
// To collect alphabet freq three strategies can be used:
|
|
93
103
|
// as chars, as fasta (single or within square brackets), as with the separator.
|
|
94
104
|
if (
|
|
95
|
-
!(col.categories.length
|
|
105
|
+
!(col.categories.length === 1 && !col.categories[0]) && // TODO: Remove with tests for single empty category
|
|
96
106
|
DG.Detector.sampleCategories(col, (s) => this.isHelm(s), 1, SEQ_SAMPLE_LIMIT)
|
|
97
107
|
) {
|
|
98
108
|
const statsAsHelm = this.getStats(categoriesSample, 2,
|
|
@@ -109,14 +119,15 @@ class BioPackageDetectors extends DG.Package {
|
|
|
109
119
|
}
|
|
110
120
|
|
|
111
121
|
const decoyAlphabets = [
|
|
112
|
-
['
|
|
113
|
-
['
|
|
122
|
+
['NUMBERS', this.numbersRawAlphabet, 0.25],
|
|
123
|
+
['SMILES', this.smilesRawAlphabet, 0.25],
|
|
124
|
+
['SMARTS', this.smartsRawAlphabet, 0.43],
|
|
114
125
|
];
|
|
115
126
|
|
|
116
127
|
const candidateAlphabets = [
|
|
117
|
-
[ALPHABET.PT, this.
|
|
118
|
-
[ALPHABET.DNA, this.
|
|
119
|
-
[ALPHABET.RNA, this.
|
|
128
|
+
[ALPHABET.PT, this.peptideFastaAlphabet, 0.50],
|
|
129
|
+
[ALPHABET.DNA, this.dnaFastaAlphabet, 0.55],
|
|
130
|
+
[ALPHABET.RNA, this.rnaFastaAlphabet, 0.55],
|
|
120
131
|
];
|
|
121
132
|
|
|
122
133
|
// Check for url column, maybe it is too heavy check
|
|
@@ -142,9 +153,11 @@ class BioPackageDetectors extends DG.Package {
|
|
|
142
153
|
if (Object.keys(statsAsChars.freq).length === 0) return null;
|
|
143
154
|
|
|
144
155
|
const decoy = this.detectAlphabet(statsAsChars.freq, decoyAlphabets, null);
|
|
145
|
-
if (decoy
|
|
156
|
+
if (decoy !== ALPHABET.UN) return null;
|
|
146
157
|
|
|
147
158
|
const separator = this.detectSeparator(statsAsChars.freq);
|
|
159
|
+
if (this.checkForbiddenSeparator(separator)) return null;
|
|
160
|
+
|
|
148
161
|
const units = separator ? NOTATION.SEPARATOR : NOTATION.FASTA;
|
|
149
162
|
const gapSymbol = separator ? '' : '-';
|
|
150
163
|
const splitter = separator ? this.getSplitterWithSeparator(separator, SEQ_SAMPLE_LENGTH_LIMIT) :
|
|
@@ -162,31 +175,36 @@ class BioPackageDetectors extends DG.Package {
|
|
|
162
175
|
return DG.SEMTYPE.MACROMOLECULE;
|
|
163
176
|
} else {
|
|
164
177
|
const stats = this.getStats(categoriesSample, 5, splitter);
|
|
178
|
+
const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
|
|
165
179
|
// Empty monomer alphabet is not allowed
|
|
166
180
|
if (Object.keys(stats.freq).length === 0) return null;
|
|
167
181
|
// Long monomer names for sequences with separators have constraints
|
|
168
|
-
if (
|
|
182
|
+
if (
|
|
183
|
+
((units === NOTATION.SEPARATOR || (units === NOTATION.FASTA && alphabetIsMultichar)) &&
|
|
184
|
+
this.checkForbiddenMultichar(stats.freq)) ||
|
|
185
|
+
((units === NOTATION.FASTA && !alphabetIsMultichar) &&
|
|
186
|
+
this.checkForbiddenSinglechar(stats.freq))
|
|
187
|
+
) return null;
|
|
169
188
|
|
|
170
189
|
const aligned = stats.sameLength ? ALIGNMENT.SEQ_MSA : ALIGNMENT.SEQ;
|
|
171
190
|
|
|
172
191
|
// TODO: If separator detected, then extra efforts to detect alphabet are allowed.
|
|
173
192
|
const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol);
|
|
193
|
+
if (units === NOTATION.FASTA && alphabet === ALPHABET.UN && !alphabetIsMultichar) return null;
|
|
174
194
|
|
|
175
195
|
// const forbidden = this.checkForbiddenWoSeparator(stats.freq);
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
|
|
187
|
-
}
|
|
188
|
-
return DG.SEMTYPE.MACROMOLECULE;
|
|
196
|
+
col.setTag(DG.TAGS.UNITS, units);
|
|
197
|
+
if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
|
|
198
|
+
col.setTag(UnitsHandler.TAGS.aligned, aligned);
|
|
199
|
+
col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
|
|
200
|
+
if (alphabet === ALPHABET.UN) {
|
|
201
|
+
// alphabetSize calculated on (sub)sample of data is incorrect
|
|
202
|
+
// const alphabetSize = Object.keys(stats.freq).length;
|
|
203
|
+
const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
|
|
204
|
+
// col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
|
|
205
|
+
col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
|
|
189
206
|
}
|
|
207
|
+
return DG.SEMTYPE.MACROMOLECULE;
|
|
190
208
|
}
|
|
191
209
|
} finally {
|
|
192
210
|
const t2 = Date.now();
|
|
@@ -207,15 +225,15 @@ class BioPackageDetectors extends DG.Package {
|
|
|
207
225
|
// !!! What is the difference between the gap symbol and separator symbol in stats terms?
|
|
208
226
|
// const noSeparatorRe = /[a-z\d]+$/i;
|
|
209
227
|
const noSeparatorChemRe = /[HBCNOFPSKVYI]/i; // Mendeleev's periodic table single char elements
|
|
210
|
-
const noSeparatorAlphaDigitRe = /[\dA-Z
|
|
228
|
+
const noSeparatorAlphaDigitRe = /[\dA-Z]/i;
|
|
211
229
|
const noSeparatorBracketsRe = /[\[\]()<>{}]/i;
|
|
212
230
|
const cleanFreq = Object.assign({}, ...Object.entries(freq)
|
|
213
231
|
.filter(([m, f]) =>
|
|
214
232
|
!noSeparatorChemRe.test(m) && !noSeparatorAlphaDigitRe.test(m) && !noSeparatorBracketsRe.test(m) &&
|
|
215
|
-
!this.
|
|
216
|
-
!this.
|
|
233
|
+
!this.peptideFastaAlphabet.has(m) &&
|
|
234
|
+
!this.dnaFastaAlphabet.has(m))
|
|
217
235
|
.map(([m, f]) => ({[m]: f})));
|
|
218
|
-
if (Object.keys(cleanFreq).length
|
|
236
|
+
if (Object.keys(cleanFreq).length === 0) return null;
|
|
219
237
|
|
|
220
238
|
const maxFreq = Math.max(...Object.values(cleanFreq));
|
|
221
239
|
|
|
@@ -227,12 +245,24 @@ class BioPackageDetectors extends DG.Package {
|
|
|
227
245
|
return sepFreq / otherSumFreq > freqThreshold ? sep : null;
|
|
228
246
|
}
|
|
229
247
|
|
|
230
|
-
|
|
248
|
+
checkForbiddenSeparator(separator) {
|
|
249
|
+
// dot, comma, ampersand, space, underscore, CR, LF
|
|
250
|
+
const forbiddenSepRe = / |\.|,|&|_|\r\n|\n/i;
|
|
251
|
+
return forbiddenSepRe.test(separator);
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
/** Spaces, dots and colons are not allowed in multichar monomer names.
|
|
231
255
|
* The monomer name/label cannot contain digits only.
|
|
232
256
|
*/
|
|
233
|
-
|
|
234
|
-
const forbiddenRe = /[ ]|^\d+$/i;
|
|
235
|
-
return Object.keys(freq).
|
|
257
|
+
checkForbiddenMultichar(freq) {
|
|
258
|
+
const forbiddenRe = /[ .:]|^\d+$/i;
|
|
259
|
+
return Object.keys(freq).some((m) => forbiddenRe.test(m));
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
/** Space, dot, colon, semicolon, digit, underscore are not allowed as single char monomer names. */
|
|
263
|
+
checkForbiddenSinglechar(freq) {
|
|
264
|
+
const forbiddenRe = /[ .:;\d_]/i;
|
|
265
|
+
return Object.keys(freq).some((m) => forbiddenRe.test(m));
|
|
236
266
|
}
|
|
237
267
|
|
|
238
268
|
// /** Without a separator, special symbols or digits are not allowed as monomers. */
|
|
@@ -250,7 +280,8 @@ class BioPackageDetectors extends DG.Package {
|
|
|
250
280
|
for (const seq of values) {
|
|
251
281
|
const mSeq = splitter(seq);
|
|
252
282
|
|
|
253
|
-
if (firstLength
|
|
283
|
+
if (firstLength === null) {
|
|
284
|
+
//
|
|
254
285
|
firstLength = mSeq.length;
|
|
255
286
|
} else if (mSeq.length !== firstLength) {
|
|
256
287
|
sameLength = false;
|
|
@@ -258,9 +289,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
258
289
|
|
|
259
290
|
if (mSeq.length > minLength) {
|
|
260
291
|
for (const m of mSeq) {
|
|
261
|
-
if (!(m in freq))
|
|
262
|
-
freq[m] = 0;
|
|
263
|
-
}
|
|
292
|
+
if (!(m in freq)) freq[m] = 0;
|
|
264
293
|
freq[m] += 1;
|
|
265
294
|
}
|
|
266
295
|
}
|
|
@@ -281,7 +310,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
281
310
|
let alphabetName;
|
|
282
311
|
const maxSim = Math.max(...candidatesSims.map((cs) => cs[4] > cs[2] ? cs[4] : -1));
|
|
283
312
|
if (maxSim > 0) {
|
|
284
|
-
const sim = candidatesSims.find((cs) => cs[4]
|
|
313
|
+
const sim = candidatesSims.find((cs) => cs[4] === maxSim);
|
|
285
314
|
alphabetName = sim[0];
|
|
286
315
|
} else {
|
|
287
316
|
alphabetName = ALPHABET.UN;
|
|
@@ -306,20 +335,19 @@ class BioPackageDetectors extends DG.Package {
|
|
|
306
335
|
|
|
307
336
|
vectorLength(v) {
|
|
308
337
|
let sqrSum = 0;
|
|
309
|
-
for (let i = 0; i < v.length; i++)
|
|
338
|
+
for (let i = 0; i < v.length; i++)
|
|
310
339
|
sqrSum += v[i] * v[i];
|
|
311
|
-
}
|
|
312
340
|
return Math.sqrt(sqrSum);
|
|
313
341
|
}
|
|
314
342
|
|
|
315
343
|
vectorDotProduct(v1, v2) {
|
|
316
|
-
if (v1.length
|
|
344
|
+
if (v1.length !== v2.length)
|
|
317
345
|
throw Error('The dimensionality of the vectors must match');
|
|
318
|
-
|
|
346
|
+
|
|
319
347
|
let prod = 0;
|
|
320
|
-
for (let i = 0; i < v1.length; i++)
|
|
348
|
+
for (let i = 0; i < v1.length; i++)
|
|
321
349
|
prod += v1[i] * v2[i];
|
|
322
|
-
|
|
350
|
+
|
|
323
351
|
return prod;
|
|
324
352
|
}
|
|
325
353
|
|
|
@@ -327,7 +355,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
327
355
|
getSplitterAsChars(lengthLimit) {
|
|
328
356
|
return function(seq) {
|
|
329
357
|
return seq.split('', lengthLimit);
|
|
330
|
-
}
|
|
358
|
+
};
|
|
331
359
|
}
|
|
332
360
|
|
|
333
361
|
getSplitterWithSeparator(separator, lengthLimit) {
|
|
@@ -346,11 +374,11 @@ class BioPackageDetectors extends DG.Package {
|
|
|
346
374
|
// } else {
|
|
347
375
|
return seq.split(separator, lengthLimit);
|
|
348
376
|
// }
|
|
349
|
-
}
|
|
377
|
+
};
|
|
350
378
|
}
|
|
351
379
|
|
|
352
380
|
// Multichar monomer names in square brackets, single char monomers or gap symbol
|
|
353
|
-
monomerRe = /\[(\w+)\]|(
|
|
381
|
+
monomerRe = /\[(\w+)\]|(.)/g;
|
|
354
382
|
|
|
355
383
|
/** Split sequence for single character monomers, square brackets multichar monomer names or gap symbol. */
|
|
356
384
|
getSplitterAsFasta(lengthLimit) {
|
|
@@ -360,11 +388,11 @@ class BioPackageDetectors extends DG.Package {
|
|
|
360
388
|
.map((ma) => {
|
|
361
389
|
let mRes;
|
|
362
390
|
const m = ma[0];
|
|
363
|
-
if (m.length > 1)
|
|
391
|
+
if (m.length > 1)
|
|
364
392
|
mRes = ma[1];
|
|
365
|
-
|
|
393
|
+
else
|
|
366
394
|
mRes = m;
|
|
367
|
-
|
|
395
|
+
|
|
368
396
|
return mRes;
|
|
369
397
|
}).toArray();
|
|
370
398
|
|
|
@@ -391,11 +419,10 @@ class BioPackageDetectors extends DG.Package {
|
|
|
391
419
|
const mmPostProcess = (mm) => {
|
|
392
420
|
this.helmPp1Re.lastIndex = 0;
|
|
393
421
|
const pp1M = this.helmPp1Re.exec(mm);
|
|
394
|
-
if (pp1M && pp1M.length >= 2)
|
|
422
|
+
if (pp1M && pp1M.length >= 2)
|
|
395
423
|
return pp1M[1];
|
|
396
|
-
|
|
424
|
+
else
|
|
397
425
|
return mm;
|
|
398
|
-
}
|
|
399
426
|
};
|
|
400
427
|
|
|
401
428
|
const mmList = inSeq ? inSeq.split('.') : [];
|
|
@@ -405,16 +432,13 @@ class BioPackageDetectors extends DG.Package {
|
|
|
405
432
|
}
|
|
406
433
|
|
|
407
434
|
sample(src, n) {
|
|
408
|
-
if (src.length < n)
|
|
435
|
+
if (src.length < n)
|
|
409
436
|
throw new Error('Sample source is less than n requested.');
|
|
410
|
-
}
|
|
411
437
|
|
|
412
438
|
const idxSet = new Set();
|
|
413
439
|
while (idxSet.size < n) {
|
|
414
440
|
const idx = Math.floor(Math.random() * src.length);
|
|
415
|
-
if (!idxSet.has(idx))
|
|
416
|
-
idxSet.add(idx);
|
|
417
|
-
}
|
|
441
|
+
if (!idxSet.has(idx)) idxSet.add(idx);
|
|
418
442
|
}
|
|
419
443
|
|
|
420
444
|
return [...idxSet].map((idx) => src[idx]);
|