@datagrok/bio 2.1.12 → 2.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.json +1 -1
- package/README.md +11 -12
- package/css/helm.css +10 -0
- package/detectors.js +97 -69
- package/dist/package-test.js +2 -13168
- package/dist/package-test.js.map +1 -0
- package/dist/package.js +2 -10560
- package/dist/package.js.map +1 -0
- package/dockerfiles/Dockerfile +86 -0
- package/files/icons/composition-analysis.svg +17 -0
- package/files/icons/sequence-diversity-viewer.svg +4 -0
- package/files/icons/sequence-similarity-viewer.svg +4 -0
- package/files/icons/vdregions-viewer.svg +22 -0
- package/files/icons/weblogo-viewer.svg +7 -0
- package/files/tests/testUrl.csv +11 -0
- package/files/tests/toAtomicLevelTest.csv +4 -0
- package/package.json +24 -25
- package/src/analysis/sequence-activity-cliffs.ts +11 -9
- package/src/analysis/sequence-search-base-viewer.ts +2 -1
- package/src/analysis/sequence-similarity-viewer.ts +3 -3
- package/src/analysis/sequence-space.ts +2 -1
- package/src/calculations/monomerLevelMols.ts +4 -4
- package/src/package-test.ts +10 -2
- package/src/package.ts +215 -131
- package/src/substructure-search/substructure-search.ts +19 -16
- package/src/tests/Palettes-test.ts +1 -1
- package/src/tests/WebLogo-positions-test.ts +113 -57
- package/src/tests/_first-tests.ts +9 -0
- package/src/tests/activity-cliffs-tests.ts +8 -7
- package/src/tests/activity-cliffs-utils.ts +17 -9
- package/src/tests/bio-tests.ts +4 -5
- package/src/tests/checkInputColumn-tests.ts +1 -1
- package/src/tests/converters-test.ts +52 -17
- package/src/tests/detectors-benchmark-tests.ts +3 -2
- package/src/tests/detectors-tests.ts +177 -172
- package/src/tests/detectors-weak-and-likely-tests.ts +129 -0
- package/src/tests/fasta-export-tests.ts +1 -1
- package/src/tests/monomer-libraries-tests.ts +34 -0
- package/src/tests/pepsea-tests.ts +21 -0
- package/src/tests/renderers-test.ts +21 -19
- package/src/tests/sequence-space-test.ts +6 -4
- package/src/tests/similarity-diversity-tests.ts +4 -4
- package/src/tests/splitters-test.ts +4 -5
- package/src/tests/substructure-filters-tests.ts +23 -1
- package/src/tests/utils/sequences-generators.ts +1 -1
- package/src/tests/utils.ts +2 -1
- package/src/tests/viewers.ts +16 -0
- package/src/utils/cell-renderer.ts +88 -35
- package/src/utils/constants.ts +7 -6
- package/src/utils/convert.ts +8 -2
- package/src/utils/monomer-lib.ts +174 -0
- package/src/utils/multiple-sequence-alignment.ts +44 -20
- package/src/utils/pepsea.ts +78 -0
- package/src/utils/save-as-fasta.ts +2 -1
- package/src/utils/ui-utils.ts +15 -3
- package/src/viewers/vd-regions-viewer.ts +113 -72
- package/src/viewers/web-logo-viewer.ts +1031 -0
- package/src/widgets/bio-substructure-filter.ts +38 -24
- package/tsconfig.json +71 -72
- package/webpack.config.js +4 -11
- package/dist/vendors-node_modules_datagrok-libraries_ml_src_workers_dimensionality-reducer_js.js +0 -9039
package/.eslintrc.json
CHANGED
package/README.md
CHANGED
|
@@ -4,7 +4,7 @@ Bio is a bioinformatics support [package](https://datagrok.ai/help/develop/devel
|
|
|
4
4
|
[Datagrok](https://datagrok.ai) platform with an extensive toolset supporting SAR analysis for small molecules
|
|
5
5
|
and antibodies.
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
## Notations
|
|
8
8
|
|
|
9
9
|
[@datagrok/bio](https://github.com/datagrok-ai/public/tree/master/packages/Bio) can ingest data in multiple file
|
|
10
10
|
formats (such as fasta or csv) and multiple notations for natural and modified residues, aligned and non-aligned forms,
|
|
@@ -18,7 +18,7 @@ See:
|
|
|
18
18
|
* [detectMacromolecule()](../Bio/detectors.js)
|
|
19
19
|
* [class NotationConverter](../../libraries/bio/src/utils/notation-converter.ts)
|
|
20
20
|
|
|
21
|
-
|
|
21
|
+
## Atomic-Level structures from sequences
|
|
22
22
|
|
|
23
23
|
For linear sequences, the linear form (see the illustration below) of molecules is reproduced. This is useful
|
|
24
24
|
for better visual inspection of sequence and duplex comparison. Structure at atomic level could be saved in available
|
|
@@ -34,11 +34,10 @@ See:
|
|
|
34
34
|
|
|
35
35
|
* [getMolfilesFromSeq()](./src/utils/atomic-works.ts)
|
|
36
36
|
|
|
37
|
-
|
|
37
|
+
## MSA
|
|
38
38
|
|
|
39
39
|
For multiple-sequence alignment, Datagrok uses the “kalign” that relies on Wu-Manber string-matching algorithm
|
|
40
|
-
[Lassmann, Timo. _Kalign 3: multiple sequence alignment of large data sets._ **Bioinformatics** (2019).
|
|
41
|
-
https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btz795/30314127/btz795.pdf)].
|
|
40
|
+
[Lassmann, Timo. _Kalign 3: multiple sequence alignment of large data sets._ **Bioinformatics** (2019).pdf](https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btz795/30314127/btz795.pdf).
|
|
42
41
|
“kalign” is suited for sequences containing only natural monomers. Sequences of a particular column can be analyzed
|
|
43
42
|
using MSA algorithm available at the top menu. Aligned sequences can be inspected for base composition
|
|
44
43
|
at the position of MSA result.
|
|
@@ -52,7 +51,7 @@ See:
|
|
|
52
51
|
|
|
53
52
|
TODO: MSA with PepSeA
|
|
54
53
|
|
|
55
|
-
|
|
54
|
+
## Splitting to monomers
|
|
56
55
|
|
|
57
56
|
Splitting to monomers allows splitting aligned sequences in separate monomers.
|
|
58
57
|
|
|
@@ -62,7 +61,7 @@ See:
|
|
|
62
61
|
|
|
63
62
|
* [splitAlignedSequences()](../../libraries/bio/src/utils/splitter.ts)
|
|
64
63
|
|
|
65
|
-
|
|
64
|
+
## Web Logo
|
|
66
65
|
|
|
67
66
|
Web Logo visualizes a graphical representation of multiple sequence alignment (amino acids or nucleotides or
|
|
68
67
|
modified residues with multi-char labels). Each logo consists of stacks of symbols, one for each position
|
|
@@ -81,13 +80,13 @@ You can customize the look of the viewer with properties. Properties ```startPos
|
|
|
81
80
|
allow to display multiple alignment partially. If property ```startPosition``` (```endPosition```)
|
|
82
81
|
is not specified, then the Logo will be plotted from the first (till the last) position of sequences.
|
|
83
82
|
|
|
84
|
-
|
|
83
|
+
### General
|
|
85
84
|
|
|
86
85
|
| | |
|
|
87
86
|
|-------------|--------------|
|
|
88
87
|
| Right click | Context menu |
|
|
89
88
|
|
|
90
|
-
|
|
89
|
+
### Properties
|
|
91
90
|
|
|
92
91
|
| Property name | Default | Description |
|
|
93
92
|
|----------------------|----------|-------------------------------------------------------------------------------------------------------------------------|
|
|
@@ -116,7 +115,7 @@ See also:
|
|
|
116
115
|
* [Viewers](../../help/visualize/viewers.md)
|
|
117
116
|
* [Table view](../../help/datagrok/table-view.md)
|
|
118
117
|
|
|
119
|
-
|
|
118
|
+
## Sequence space
|
|
120
119
|
|
|
121
120
|
Datagrok allows visualizing multidimensional sequence space using a dimensionality reduction approach.
|
|
122
121
|
Several distance-based dimensionality reduction algorithms are available, such as UMAP or t-SNE.
|
|
@@ -132,7 +131,7 @@ See:
|
|
|
132
131
|
|
|
133
132
|
* [sequenceSpace()](src/utils/sequence-space.ts)
|
|
134
133
|
|
|
135
|
-
|
|
134
|
+
## Sequence activity cliffs
|
|
136
135
|
|
|
137
136
|
Activity cliffs tool finds pairs of sequences where small changes in the sequence yield significant
|
|
138
137
|
changes in activity or any other numerical property. Open the tool from the top menu.
|
|
@@ -145,4 +144,4 @@ To launch the analysis from the top menu, select Bio | Sequence Activity Cliffs.
|
|
|
145
144
|
|
|
146
145
|
See:
|
|
147
146
|
|
|
148
|
-
* [getActivityCliffs()](../../libraries/ml/src/viewers/activity-cliffs.ts)
|
|
147
|
+
* [getActivityCliffs()](../../libraries/ml/src/viewers/activity-cliffs.ts)
|
package/css/helm.css
CHANGED
|
@@ -1,3 +1,13 @@
|
|
|
1
1
|
.d4-g-cell[semType="Macromolecule"] * {
|
|
2
2
|
pointer-events: none !important;
|
|
3
3
|
}
|
|
4
|
+
|
|
5
|
+
.helm-substructure-filter {
|
|
6
|
+
border: 1px solid var(--grey-2);
|
|
7
|
+
height: 25px;
|
|
8
|
+
display: flex;
|
|
9
|
+
justify-content: center;
|
|
10
|
+
align-items: center;
|
|
11
|
+
padding-left: 0px;
|
|
12
|
+
margin-left: 0px;
|
|
13
|
+
}
|
package/detectors.js
CHANGED
|
@@ -45,23 +45,28 @@ const isUrlRe = /[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9
|
|
|
45
45
|
|
|
46
46
|
class BioPackageDetectors extends DG.Package {
|
|
47
47
|
|
|
48
|
-
|
|
48
|
+
/** Parts of the column name required in the column's name under the detector. It must be in lowercase. */
|
|
49
|
+
likelyColNamePartList = ['seq', 'msa', 'dna', 'rna', 'fasta', 'helm', 'sense', 'protein'];
|
|
50
|
+
|
|
51
|
+
peptideFastaAlphabet = new Set([
|
|
49
52
|
'G', 'L', 'Y', 'S', 'E', 'Q', 'D', 'N', 'F', 'A',
|
|
50
53
|
'K', 'R', 'H', 'C', 'V', 'P', 'W', 'I', 'M', 'T',
|
|
51
54
|
'MeNle', 'MeA', 'MeG', 'MeF',
|
|
52
55
|
]);
|
|
53
56
|
|
|
54
|
-
|
|
57
|
+
dnaFastaAlphabet = new Set(['A', 'C', 'G', 'T']);
|
|
58
|
+
|
|
59
|
+
rnaFastaAlphabet = new Set(['A', 'C', 'G', 'U']);
|
|
55
60
|
|
|
56
|
-
|
|
61
|
+
numbersRawAlphabet = new Set(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']);
|
|
57
62
|
|
|
58
|
-
|
|
63
|
+
smilesRawAlphabet = new Set([
|
|
59
64
|
'A', 'B', 'C', 'E', 'F', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'Z',
|
|
60
65
|
'a', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'r', 's', 't', 'u',
|
|
61
66
|
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
|
|
62
67
|
'+', '-', '.', , '/', '\\', '@', '[', ']', '(', ')', '#', '%', '=']);
|
|
63
68
|
|
|
64
|
-
|
|
69
|
+
smartsRawAlphabet = new Set([
|
|
65
70
|
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
|
|
66
71
|
'!', '#', '$', '&', '(', ')', '*', '+', ',', '-', '.', ':', ';', '=', '@', '~', '[', ']',
|
|
67
72
|
'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
|
|
@@ -83,6 +88,11 @@ class BioPackageDetectors extends DG.Package {
|
|
|
83
88
|
detectMacromolecule(col) {
|
|
84
89
|
const t1 = Date.now();
|
|
85
90
|
try {
|
|
91
|
+
const colName = col.name;
|
|
92
|
+
const colNameLikely = this.likelyColNamePartList.some(
|
|
93
|
+
(requiredColNamePart) => colName.toLowerCase().includes(requiredColNamePart));
|
|
94
|
+
const seqMinLength = colNameLikely ? 3 : 5;
|
|
95
|
+
|
|
86
96
|
// Fail early
|
|
87
97
|
if (col.type !== DG.TYPE.STRING) return null;
|
|
88
98
|
|
|
@@ -92,7 +102,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
92
102
|
// To collect alphabet freq three strategies can be used:
|
|
93
103
|
// as chars, as fasta (single or within square brackets), as with the separator.
|
|
94
104
|
if (
|
|
95
|
-
!(col.categories.length
|
|
105
|
+
!(col.categories.length === 1 && !col.categories[0]) && // TODO: Remove with tests for single empty category
|
|
96
106
|
DG.Detector.sampleCategories(col, (s) => this.isHelm(s), 1, SEQ_SAMPLE_LIMIT)
|
|
97
107
|
) {
|
|
98
108
|
const statsAsHelm = this.getStats(categoriesSample, 2,
|
|
@@ -109,14 +119,15 @@ class BioPackageDetectors extends DG.Package {
|
|
|
109
119
|
}
|
|
110
120
|
|
|
111
121
|
const decoyAlphabets = [
|
|
112
|
-
['
|
|
113
|
-
['
|
|
122
|
+
['NUMBERS', this.numbersRawAlphabet, 0.25],
|
|
123
|
+
['SMILES', this.smilesRawAlphabet, 0.25],
|
|
124
|
+
['SMARTS', this.smartsRawAlphabet, 0.43],
|
|
114
125
|
];
|
|
115
126
|
|
|
116
127
|
const candidateAlphabets = [
|
|
117
|
-
[ALPHABET.PT, this.
|
|
118
|
-
[ALPHABET.DNA, this.
|
|
119
|
-
[ALPHABET.RNA, this.
|
|
128
|
+
[ALPHABET.PT, this.peptideFastaAlphabet, 0.50],
|
|
129
|
+
[ALPHABET.DNA, this.dnaFastaAlphabet, 0.55],
|
|
130
|
+
[ALPHABET.RNA, this.rnaFastaAlphabet, 0.55],
|
|
120
131
|
];
|
|
121
132
|
|
|
122
133
|
// Check for url column, maybe it is too heavy check
|
|
@@ -136,57 +147,67 @@ class BioPackageDetectors extends DG.Package {
|
|
|
136
147
|
|
|
137
148
|
// TODO: Detect HELM sequence
|
|
138
149
|
// TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
|
|
139
|
-
const statsAsChars = this.getStats(categoriesSample,
|
|
150
|
+
const statsAsChars = this.getStats(categoriesSample, seqMinLength,
|
|
140
151
|
this.getSplitterAsChars(SEQ_SAMPLE_LENGTH_LIMIT));
|
|
141
152
|
// Empty statsAsChars.freq alphabet means no strings of sufficient length are present in the data
|
|
142
153
|
if (Object.keys(statsAsChars.freq).length === 0) return null;
|
|
143
154
|
|
|
144
155
|
const decoy = this.detectAlphabet(statsAsChars.freq, decoyAlphabets, null);
|
|
145
|
-
if (decoy
|
|
156
|
+
if (decoy !== ALPHABET.UN) return null;
|
|
146
157
|
|
|
147
158
|
const separator = this.detectSeparator(statsAsChars.freq);
|
|
159
|
+
if (this.checkForbiddenSeparator(separator)) return null;
|
|
160
|
+
|
|
148
161
|
const units = separator ? NOTATION.SEPARATOR : NOTATION.FASTA;
|
|
149
162
|
const gapSymbol = separator ? '' : '-';
|
|
150
163
|
const splitter = separator ? this.getSplitterWithSeparator(separator, SEQ_SAMPLE_LENGTH_LIMIT) :
|
|
151
164
|
this.getSplitterAsFasta(SEQ_SAMPLE_LENGTH_LIMIT);
|
|
152
165
|
|
|
153
166
|
if (statsAsChars.sameLength) {
|
|
154
|
-
const stats = this.getStats(categoriesSample,
|
|
155
|
-
const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, '-');
|
|
156
|
-
if (alphabet === ALPHABET.UN) return null;
|
|
167
|
+
const stats = this.getStats(categoriesSample, seqMinLength, splitter);
|
|
168
|
+
const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, '-', colNameLikely);
|
|
169
|
+
if (alphabet === ALPHABET.UN && !colNameLikely) return null;
|
|
157
170
|
|
|
158
171
|
col.setTag(DG.TAGS.UNITS, units);
|
|
159
172
|
if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
|
|
160
173
|
col.setTag(UnitsHandler.TAGS.aligned, ALIGNMENT.SEQ_MSA);
|
|
161
174
|
col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
|
|
175
|
+
if (alphabet === ALPHABET.UN) {
|
|
176
|
+
const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
|
|
177
|
+
col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
|
|
178
|
+
}
|
|
162
179
|
return DG.SEMTYPE.MACROMOLECULE;
|
|
163
180
|
} else {
|
|
164
|
-
const stats = this.getStats(categoriesSample,
|
|
181
|
+
const stats = this.getStats(categoriesSample, seqMinLength, splitter);
|
|
182
|
+
const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
|
|
165
183
|
// Empty monomer alphabet is not allowed
|
|
166
184
|
if (Object.keys(stats.freq).length === 0) return null;
|
|
167
185
|
// Long monomer names for sequences with separators have constraints
|
|
168
|
-
if (
|
|
186
|
+
if (
|
|
187
|
+
((units === NOTATION.SEPARATOR || (units === NOTATION.FASTA && alphabetIsMultichar)) &&
|
|
188
|
+
this.checkForbiddenMultichar(stats.freq)) ||
|
|
189
|
+
((units === NOTATION.FASTA && !alphabetIsMultichar) &&
|
|
190
|
+
this.checkForbiddenSinglechar(stats.freq))
|
|
191
|
+
) return null;
|
|
169
192
|
|
|
170
193
|
const aligned = stats.sameLength ? ALIGNMENT.SEQ_MSA : ALIGNMENT.SEQ;
|
|
171
194
|
|
|
172
195
|
// TODO: If separator detected, then extra efforts to detect alphabet are allowed.
|
|
173
|
-
const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol);
|
|
196
|
+
const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol, colNameLikely);
|
|
197
|
+
/* Likely column name allows detecting 'fasta' notation with 'UN' alphabet, 2023-04-13, atanas, askalkin */
|
|
198
|
+
if (units === NOTATION.FASTA && alphabet === ALPHABET.UN && !alphabetIsMultichar && !colNameLikely) return null;
|
|
174
199
|
|
|
175
200
|
// const forbidden = this.checkForbiddenWoSeparator(stats.freq);
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
|
|
185
|
-
// col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
|
|
186
|
-
col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
|
|
187
|
-
}
|
|
188
|
-
return DG.SEMTYPE.MACROMOLECULE;
|
|
201
|
+
col.setTag(DG.TAGS.UNITS, units);
|
|
202
|
+
if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
|
|
203
|
+
col.setTag(UnitsHandler.TAGS.aligned, aligned);
|
|
204
|
+
col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
|
|
205
|
+
if (alphabet === ALPHABET.UN) {
|
|
206
|
+
// alphabetSize calculated on (sub)sample of data is incorrect
|
|
207
|
+
const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
|
|
208
|
+
col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
|
|
189
209
|
}
|
|
210
|
+
return DG.SEMTYPE.MACROMOLECULE;
|
|
190
211
|
}
|
|
191
212
|
} finally {
|
|
192
213
|
const t2 = Date.now();
|
|
@@ -207,15 +228,15 @@ class BioPackageDetectors extends DG.Package {
|
|
|
207
228
|
// !!! What is the difference between the gap symbol and separator symbol in stats terms?
|
|
208
229
|
// const noSeparatorRe = /[a-z\d]+$/i;
|
|
209
230
|
const noSeparatorChemRe = /[HBCNOFPSKVYI]/i; // Mendeleev's periodic table single char elements
|
|
210
|
-
const noSeparatorAlphaDigitRe = /[\dA-Z
|
|
231
|
+
const noSeparatorAlphaDigitRe = /[\dA-Z]/i;
|
|
211
232
|
const noSeparatorBracketsRe = /[\[\]()<>{}]/i;
|
|
212
233
|
const cleanFreq = Object.assign({}, ...Object.entries(freq)
|
|
213
234
|
.filter(([m, f]) =>
|
|
214
235
|
!noSeparatorChemRe.test(m) && !noSeparatorAlphaDigitRe.test(m) && !noSeparatorBracketsRe.test(m) &&
|
|
215
|
-
!this.
|
|
216
|
-
!this.
|
|
236
|
+
!this.peptideFastaAlphabet.has(m) &&
|
|
237
|
+
!this.dnaFastaAlphabet.has(m))
|
|
217
238
|
.map(([m, f]) => ({[m]: f})));
|
|
218
|
-
if (Object.keys(cleanFreq).length
|
|
239
|
+
if (Object.keys(cleanFreq).length === 0) return null;
|
|
219
240
|
|
|
220
241
|
const maxFreq = Math.max(...Object.values(cleanFreq));
|
|
221
242
|
|
|
@@ -227,12 +248,24 @@ class BioPackageDetectors extends DG.Package {
|
|
|
227
248
|
return sepFreq / otherSumFreq > freqThreshold ? sep : null;
|
|
228
249
|
}
|
|
229
250
|
|
|
230
|
-
|
|
251
|
+
checkForbiddenSeparator(separator) {
|
|
252
|
+
// dot, comma, ampersand, space, underscore, CR, LF
|
|
253
|
+
const forbiddenSepRe = / |\.|,|&|_|\r\n|\n/i;
|
|
254
|
+
return forbiddenSepRe.test(separator);
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
/** Spaces, dots and colons are not allowed in multichar monomer names.
|
|
231
258
|
* The monomer name/label cannot contain digits only.
|
|
232
259
|
*/
|
|
233
|
-
|
|
234
|
-
const forbiddenRe = /[ ]|^\d+$/i;
|
|
235
|
-
return Object.keys(freq).
|
|
260
|
+
checkForbiddenMultichar(freq) {
|
|
261
|
+
const forbiddenRe = /[ .:]|^\d+$/i;
|
|
262
|
+
return Object.keys(freq).some((m) => forbiddenRe.test(m));
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
/** Space, dot, colon, semicolon, digit, underscore are not allowed as single char monomer names.*/
|
|
266
|
+
checkForbiddenSinglechar(freq) {
|
|
267
|
+
const forbiddenRe = /[ .:;\d_]/i;
|
|
268
|
+
return Object.keys(freq).some((m) => forbiddenRe.test(m));
|
|
236
269
|
}
|
|
237
270
|
|
|
238
271
|
// /** Without a separator, special symbols or digits are not allowed as monomers. */
|
|
@@ -250,17 +283,16 @@ class BioPackageDetectors extends DG.Package {
|
|
|
250
283
|
for (const seq of values) {
|
|
251
284
|
const mSeq = splitter(seq);
|
|
252
285
|
|
|
253
|
-
if (firstLength
|
|
286
|
+
if (firstLength === null) {
|
|
287
|
+
//
|
|
254
288
|
firstLength = mSeq.length;
|
|
255
289
|
} else if (mSeq.length !== firstLength) {
|
|
256
290
|
sameLength = false;
|
|
257
291
|
}
|
|
258
292
|
|
|
259
|
-
if (mSeq.length
|
|
293
|
+
if (mSeq.length >= minLength) {
|
|
260
294
|
for (const m of mSeq) {
|
|
261
|
-
if (!(m in freq))
|
|
262
|
-
freq[m] = 0;
|
|
263
|
-
}
|
|
295
|
+
if (!(m in freq)) freq[m] = 0;
|
|
264
296
|
freq[m] += 1;
|
|
265
297
|
}
|
|
266
298
|
}
|
|
@@ -271,17 +303,18 @@ class BioPackageDetectors extends DG.Package {
|
|
|
271
303
|
/** Detects alphabet for freq by freq similarity to alphabet monomer set.
|
|
272
304
|
* @param freq frequencies of monomers in sequence set
|
|
273
305
|
* @param candidates an array of pairs [name, monomer set]
|
|
274
|
-
*
|
|
275
|
-
|
|
306
|
+
* @param {boolean} colNameLikely The column name suggests the column is Macromolecule more likely
|
|
307
|
+
*/
|
|
308
|
+
detectAlphabet(freq, candidates, gapSymbol, colNameLikely = false) {
|
|
276
309
|
const candidatesSims = candidates.map((c) => {
|
|
277
|
-
const sim = this.getAlphabetSimilarity(freq, c[1], gapSymbol);
|
|
310
|
+
const sim = this.getAlphabetSimilarity(freq, c[1], gapSymbol) + (colNameLikely ? 0.15 : 0);
|
|
278
311
|
return [c[0], c[1], c[2], freq, sim];
|
|
279
312
|
});
|
|
280
313
|
|
|
281
314
|
let alphabetName;
|
|
282
315
|
const maxSim = Math.max(...candidatesSims.map((cs) => cs[4] > cs[2] ? cs[4] : -1));
|
|
283
316
|
if (maxSim > 0) {
|
|
284
|
-
const sim = candidatesSims.find((cs) => cs[4]
|
|
317
|
+
const sim = candidatesSims.find((cs) => cs[4] === maxSim);
|
|
285
318
|
alphabetName = sim[0];
|
|
286
319
|
} else {
|
|
287
320
|
alphabetName = ALPHABET.UN;
|
|
@@ -306,20 +339,19 @@ class BioPackageDetectors extends DG.Package {
|
|
|
306
339
|
|
|
307
340
|
vectorLength(v) {
|
|
308
341
|
let sqrSum = 0;
|
|
309
|
-
for (let i = 0; i < v.length; i++)
|
|
342
|
+
for (let i = 0; i < v.length; i++)
|
|
310
343
|
sqrSum += v[i] * v[i];
|
|
311
|
-
}
|
|
312
344
|
return Math.sqrt(sqrSum);
|
|
313
345
|
}
|
|
314
346
|
|
|
315
347
|
vectorDotProduct(v1, v2) {
|
|
316
|
-
if (v1.length
|
|
348
|
+
if (v1.length !== v2.length)
|
|
317
349
|
throw Error('The dimensionality of the vectors must match');
|
|
318
|
-
|
|
350
|
+
|
|
319
351
|
let prod = 0;
|
|
320
|
-
for (let i = 0; i < v1.length; i++)
|
|
352
|
+
for (let i = 0; i < v1.length; i++)
|
|
321
353
|
prod += v1[i] * v2[i];
|
|
322
|
-
|
|
354
|
+
|
|
323
355
|
return prod;
|
|
324
356
|
}
|
|
325
357
|
|
|
@@ -327,7 +359,7 @@ class BioPackageDetectors extends DG.Package {
|
|
|
327
359
|
getSplitterAsChars(lengthLimit) {
|
|
328
360
|
return function(seq) {
|
|
329
361
|
return seq.split('', lengthLimit);
|
|
330
|
-
}
|
|
362
|
+
};
|
|
331
363
|
}
|
|
332
364
|
|
|
333
365
|
getSplitterWithSeparator(separator, lengthLimit) {
|
|
@@ -346,11 +378,11 @@ class BioPackageDetectors extends DG.Package {
|
|
|
346
378
|
// } else {
|
|
347
379
|
return seq.split(separator, lengthLimit);
|
|
348
380
|
// }
|
|
349
|
-
}
|
|
381
|
+
};
|
|
350
382
|
}
|
|
351
383
|
|
|
352
384
|
// Multichar monomer names in square brackets, single char monomers or gap symbol
|
|
353
|
-
monomerRe = /\[(\w+)\]|(
|
|
385
|
+
monomerRe = /\[(\w+)\]|(.)/g;
|
|
354
386
|
|
|
355
387
|
/** Split sequence for single character monomers, square brackets multichar monomer names or gap symbol. */
|
|
356
388
|
getSplitterAsFasta(lengthLimit) {
|
|
@@ -360,11 +392,11 @@ class BioPackageDetectors extends DG.Package {
|
|
|
360
392
|
.map((ma) => {
|
|
361
393
|
let mRes;
|
|
362
394
|
const m = ma[0];
|
|
363
|
-
if (m.length > 1)
|
|
395
|
+
if (m.length > 1)
|
|
364
396
|
mRes = ma[1];
|
|
365
|
-
|
|
397
|
+
else
|
|
366
398
|
mRes = m;
|
|
367
|
-
|
|
399
|
+
|
|
368
400
|
return mRes;
|
|
369
401
|
}).toArray();
|
|
370
402
|
|
|
@@ -391,11 +423,10 @@ class BioPackageDetectors extends DG.Package {
|
|
|
391
423
|
const mmPostProcess = (mm) => {
|
|
392
424
|
this.helmPp1Re.lastIndex = 0;
|
|
393
425
|
const pp1M = this.helmPp1Re.exec(mm);
|
|
394
|
-
if (pp1M && pp1M.length >= 2)
|
|
426
|
+
if (pp1M && pp1M.length >= 2)
|
|
395
427
|
return pp1M[1];
|
|
396
|
-
|
|
428
|
+
else
|
|
397
429
|
return mm;
|
|
398
|
-
}
|
|
399
430
|
};
|
|
400
431
|
|
|
401
432
|
const mmList = inSeq ? inSeq.split('.') : [];
|
|
@@ -405,16 +436,13 @@ class BioPackageDetectors extends DG.Package {
|
|
|
405
436
|
}
|
|
406
437
|
|
|
407
438
|
sample(src, n) {
|
|
408
|
-
if (src.length < n)
|
|
439
|
+
if (src.length < n)
|
|
409
440
|
throw new Error('Sample source is less than n requested.');
|
|
410
|
-
}
|
|
411
441
|
|
|
412
442
|
const idxSet = new Set();
|
|
413
443
|
while (idxSet.size < n) {
|
|
414
444
|
const idx = Math.floor(Math.random() * src.length);
|
|
415
|
-
if (!idxSet.has(idx))
|
|
416
|
-
idxSet.add(idx);
|
|
417
|
-
}
|
|
445
|
+
if (!idxSet.has(idx)) idxSet.add(idx);
|
|
418
446
|
}
|
|
419
447
|
|
|
420
448
|
return [...idxSet].map((idx) => src[idx]);
|