@datagrok/bio 2.1.12 → 2.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/.eslintrc.json +1 -1
  2. package/README.md +11 -12
  3. package/css/helm.css +10 -0
  4. package/detectors.js +97 -69
  5. package/dist/package-test.js +2 -13168
  6. package/dist/package-test.js.map +1 -0
  7. package/dist/package.js +2 -10560
  8. package/dist/package.js.map +1 -0
  9. package/dockerfiles/Dockerfile +86 -0
  10. package/files/icons/composition-analysis.svg +17 -0
  11. package/files/icons/sequence-diversity-viewer.svg +4 -0
  12. package/files/icons/sequence-similarity-viewer.svg +4 -0
  13. package/files/icons/vdregions-viewer.svg +22 -0
  14. package/files/icons/weblogo-viewer.svg +7 -0
  15. package/files/tests/testUrl.csv +11 -0
  16. package/files/tests/toAtomicLevelTest.csv +4 -0
  17. package/package.json +24 -25
  18. package/src/analysis/sequence-activity-cliffs.ts +11 -9
  19. package/src/analysis/sequence-search-base-viewer.ts +2 -1
  20. package/src/analysis/sequence-similarity-viewer.ts +3 -3
  21. package/src/analysis/sequence-space.ts +2 -1
  22. package/src/calculations/monomerLevelMols.ts +4 -4
  23. package/src/package-test.ts +10 -2
  24. package/src/package.ts +215 -131
  25. package/src/substructure-search/substructure-search.ts +19 -16
  26. package/src/tests/Palettes-test.ts +1 -1
  27. package/src/tests/WebLogo-positions-test.ts +113 -57
  28. package/src/tests/_first-tests.ts +9 -0
  29. package/src/tests/activity-cliffs-tests.ts +8 -7
  30. package/src/tests/activity-cliffs-utils.ts +17 -9
  31. package/src/tests/bio-tests.ts +4 -5
  32. package/src/tests/checkInputColumn-tests.ts +1 -1
  33. package/src/tests/converters-test.ts +52 -17
  34. package/src/tests/detectors-benchmark-tests.ts +3 -2
  35. package/src/tests/detectors-tests.ts +177 -172
  36. package/src/tests/detectors-weak-and-likely-tests.ts +129 -0
  37. package/src/tests/fasta-export-tests.ts +1 -1
  38. package/src/tests/monomer-libraries-tests.ts +34 -0
  39. package/src/tests/pepsea-tests.ts +21 -0
  40. package/src/tests/renderers-test.ts +21 -19
  41. package/src/tests/sequence-space-test.ts +6 -4
  42. package/src/tests/similarity-diversity-tests.ts +4 -4
  43. package/src/tests/splitters-test.ts +4 -5
  44. package/src/tests/substructure-filters-tests.ts +23 -1
  45. package/src/tests/utils/sequences-generators.ts +1 -1
  46. package/src/tests/utils.ts +2 -1
  47. package/src/tests/viewers.ts +16 -0
  48. package/src/utils/cell-renderer.ts +88 -35
  49. package/src/utils/constants.ts +7 -6
  50. package/src/utils/convert.ts +8 -2
  51. package/src/utils/monomer-lib.ts +174 -0
  52. package/src/utils/multiple-sequence-alignment.ts +44 -20
  53. package/src/utils/pepsea.ts +78 -0
  54. package/src/utils/save-as-fasta.ts +2 -1
  55. package/src/utils/ui-utils.ts +15 -3
  56. package/src/viewers/vd-regions-viewer.ts +113 -72
  57. package/src/viewers/web-logo-viewer.ts +1031 -0
  58. package/src/widgets/bio-substructure-filter.ts +38 -24
  59. package/tsconfig.json +71 -72
  60. package/webpack.config.js +4 -11
  61. package/dist/vendors-node_modules_datagrok-libraries_ml_src_workers_dimensionality-reducer_js.js +0 -9039
package/.eslintrc.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "env": {
3
3
  "browser": true,
4
- "es2021": true
4
+ "es2022": true
5
5
  },
6
6
  "extends": [
7
7
  "google"
package/README.md CHANGED
@@ -4,7 +4,7 @@ Bio is a bioinformatics support [package](https://datagrok.ai/help/develop/devel
4
4
  [Datagrok](https://datagrok.ai) platform with an extensive toolset supporting SAR analysis for small molecules
5
5
  and antibodies.
6
6
 
7
- # Notations
7
+ ## Notations
8
8
 
9
9
  [@datagrok/bio](https://github.com/datagrok-ai/public/tree/master/packages/Bio) can ingest data in multiple file
10
10
  formats (such as fasta or csv) and multiple notations for natural and modified residues, aligned and non-aligned forms,
@@ -18,7 +18,7 @@ See:
18
18
  * [detectMacromolecule()](../Bio/detectors.js)
19
19
  * [class NotationConverter](../../libraries/bio/src/utils/notation-converter.ts)
20
20
 
21
- # Atomic-Level structures from sequences
21
+ ## Atomic-Level structures from sequences
22
22
 
23
23
  For linear sequences, the linear form (see the illustration below) of molecules is reproduced. This is useful
24
24
  for better visual inspection of sequence and duplex comparison. Structure at atomic level could be saved in available
@@ -34,11 +34,10 @@ See:
34
34
 
35
35
  * [getMolfilesFromSeq()](./src/utils/atomic-works.ts)
36
36
 
37
- # MSA
37
+ ## MSA
38
38
 
39
39
  For multiple-sequence alignment, Datagrok uses the “kalign” that relies on Wu-Manber string-matching algorithm
40
- [Lassmann, Timo. _Kalign 3: multiple sequence alignment of large data sets._ **Bioinformatics** (2019).[pdf](
41
- https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btz795/30314127/btz795.pdf)].
40
+ [Lassmann, Timo. _Kalign 3: multiple sequence alignment of large data sets._ **Bioinformatics** (2019).pdf](https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btz795/30314127/btz795.pdf).
42
41
  “kalign” is suited for sequences containing only natural monomers. Sequences of a particular column can be analyzed
43
42
  using MSA algorithm available at the top menu. Aligned sequences can be inspected for base composition
44
43
  at the position of MSA result.
@@ -52,7 +51,7 @@ See:
52
51
 
53
52
  TODO: MSA with PepSeA
54
53
 
55
- # Splitting to monomers
54
+ ## Splitting to monomers
56
55
 
57
56
  Splitting to monomers allows splitting aligned sequences in separate monomers.
58
57
 
@@ -62,7 +61,7 @@ See:
62
61
 
63
62
  * [splitAlignedSequences()](../../libraries/bio/src/utils/splitter.ts)
64
63
 
65
- # Web Logo
64
+ ## Web Logo
66
65
 
67
66
  Web Logo visualizes a graphical representation of multiple sequence alignment (amino acids or nucleotides or
68
67
  modified residues with multi-char labels). Each logo consists of stacks of symbols, one for each position
@@ -81,13 +80,13 @@ You can customize the look of the viewer with properties. Properties ```startPos
81
80
  allow to display multiple alignment partially. If property ```startPosition``` (```endPosition```)
82
81
  is not specified, then the Logo will be plotted from the first (till the last) position of sequences.
83
82
 
84
- ## General
83
+ ### General
85
84
 
86
85
  | | |
87
86
  |-------------|--------------|
88
87
  | Right click | Context menu |
89
88
 
90
- ## Properties
89
+ ### Properties
91
90
 
92
91
  | Property name | Default | Description |
93
92
  |----------------------|----------|-------------------------------------------------------------------------------------------------------------------------|
@@ -116,7 +115,7 @@ See also:
116
115
  * [Viewers](../../help/visualize/viewers.md)
117
116
  * [Table view](../../help/datagrok/table-view.md)
118
117
 
119
- # Sequence space
118
+ ## Sequence space
120
119
 
121
120
  Datagrok allows visualizing multidimensional sequence space using a dimensionality reduction approach.
122
121
  Several distance-based dimensionality reduction algorithms are available, such as UMAP or t-SNE.
@@ -132,7 +131,7 @@ See:
132
131
 
133
132
  * [sequenceSpace()](src/utils/sequence-space.ts)
134
133
 
135
- # Sequence activity cliffs
134
+ ## Sequence activity cliffs
136
135
 
137
136
  Activity cliffs tool finds pairs of sequences where small changes in the sequence yield significant
138
137
  changes in activity or any other numerical property. Open the tool from the top menu by selecting Bio | Sequence Activity Cliffs.
@@ -145,4 +144,4 @@ To launch the analysis from the top menu, select Bio | Sequence Activity Cliffs.
145
144
 
146
145
  See:
147
146
 
148
- * [getActivityCliffs()](../../libraries/ml/src/viewers/activity-cliffs.ts)
147
+ * [getActivityCliffs()](../../libraries/ml/src/viewers/activity-cliffs.ts)
package/css/helm.css CHANGED
@@ -1,3 +1,13 @@
1
1
  .d4-g-cell[semType="Macromolecule"] * {
2
2
  pointer-events: none !important;
3
3
  }
4
+
5
+ .helm-substructure-filter {
6
+ border: 1px solid var(--grey-2);
7
+ height: 25px;
8
+ display: flex;
9
+ justify-content: center;
10
+ align-items: center;
11
+ padding-left: 0px;
12
+ margin-left: 0px;
13
+ }
package/detectors.js CHANGED
@@ -45,23 +45,28 @@ const isUrlRe = /[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9
45
45
 
46
46
  class BioPackageDetectors extends DG.Package {
47
47
 
48
- PeptideFastaAlphabet = new Set([
48
+ /** Parts of the column name required in the column's name under the detector. It must be in lowercase. */
49
+ likelyColNamePartList = ['seq', 'msa', 'dna', 'rna', 'fasta', 'helm', 'sense', 'protein'];
50
+
51
+ peptideFastaAlphabet = new Set([
49
52
  'G', 'L', 'Y', 'S', 'E', 'Q', 'D', 'N', 'F', 'A',
50
53
  'K', 'R', 'H', 'C', 'V', 'P', 'W', 'I', 'M', 'T',
51
54
  'MeNle', 'MeA', 'MeG', 'MeF',
52
55
  ]);
53
56
 
54
- DnaFastaAlphabet = new Set(['A', 'C', 'G', 'T']);
57
+ dnaFastaAlphabet = new Set(['A', 'C', 'G', 'T']);
58
+
59
+ rnaFastaAlphabet = new Set(['A', 'C', 'G', 'U']);
55
60
 
56
- RnaFastaAlphabet = new Set(['A', 'C', 'G', 'U']);
61
+ numbersRawAlphabet = new Set(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']);
57
62
 
58
- SmilesRawAlphabet = new Set([
63
+ smilesRawAlphabet = new Set([
59
64
  'A', 'B', 'C', 'E', 'F', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'Z',
60
65
  'a', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'r', 's', 't', 'u',
61
66
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
62
67
  '+', '-', '.', , '/', '\\', '@', '[', ']', '(', ')', '#', '%', '=']);
63
68
 
64
- SmartsRawAlphabet = new Set([
69
+ smartsRawAlphabet = new Set([
65
70
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
66
71
  '!', '#', '$', '&', '(', ')', '*', '+', ',', '-', '.', ':', ';', '=', '@', '~', '[', ']',
67
72
  'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M',
@@ -83,6 +88,11 @@ class BioPackageDetectors extends DG.Package {
83
88
  detectMacromolecule(col) {
84
89
  const t1 = Date.now();
85
90
  try {
91
+ const colName = col.name;
92
+ const colNameLikely = this.likelyColNamePartList.some(
93
+ (requiredColNamePart) => colName.toLowerCase().includes(requiredColNamePart));
94
+ const seqMinLength = colNameLikely ? 3 : 5;
95
+
86
96
  // Fail early
87
97
  if (col.type !== DG.TYPE.STRING) return null;
88
98
 
@@ -92,7 +102,7 @@ class BioPackageDetectors extends DG.Package {
92
102
  // To collect alphabet freq three strategies can be used:
93
103
  // as chars, as fasta (single or within square brackets), as with the separator.
94
104
  if (
95
- !(col.categories.length == 1 && !col.categories[0]) && // TODO: Remove with tests for single empty category value
105
+ !(col.categories.length === 1 && !col.categories[0]) && // TODO: Remove with tests for single empty category
96
106
  DG.Detector.sampleCategories(col, (s) => this.isHelm(s), 1, SEQ_SAMPLE_LIMIT)
97
107
  ) {
98
108
  const statsAsHelm = this.getStats(categoriesSample, 2,
@@ -109,14 +119,15 @@ class BioPackageDetectors extends DG.Package {
109
119
  }
110
120
 
111
121
  const decoyAlphabets = [
112
- ['SMILES', this.SmilesRawAlphabet, 0.30],
113
- ['SMARTS', this.SmartsRawAlphabet, 0.43],
122
+ ['NUMBERS', this.numbersRawAlphabet, 0.25],
123
+ ['SMILES', this.smilesRawAlphabet, 0.25],
124
+ ['SMARTS', this.smartsRawAlphabet, 0.43],
114
125
  ];
115
126
 
116
127
  const candidateAlphabets = [
117
- [ALPHABET.PT, this.PeptideFastaAlphabet, 0.50],
118
- [ALPHABET.DNA, this.DnaFastaAlphabet, 0.55],
119
- [ALPHABET.RNA, this.RnaFastaAlphabet, 0.55],
128
+ [ALPHABET.PT, this.peptideFastaAlphabet, 0.50],
129
+ [ALPHABET.DNA, this.dnaFastaAlphabet, 0.55],
130
+ [ALPHABET.RNA, this.rnaFastaAlphabet, 0.55],
120
131
  ];
121
132
 
122
133
  // Check for url column, maybe it is too heavy check
@@ -136,57 +147,67 @@ class BioPackageDetectors extends DG.Package {
136
147
 
137
148
  // TODO: Detect HELM sequence
138
149
  // TODO: Lazy calculations could be helpful for performance and convenient for expressing classification logic.
139
- const statsAsChars = this.getStats(categoriesSample, 5,
150
+ const statsAsChars = this.getStats(categoriesSample, seqMinLength,
140
151
  this.getSplitterAsChars(SEQ_SAMPLE_LENGTH_LIMIT));
141
152
  // Empty statsAsChars.freq alphabet means no strings of sufficient length are present in the data
142
153
  if (Object.keys(statsAsChars.freq).length === 0) return null;
143
154
 
144
155
  const decoy = this.detectAlphabet(statsAsChars.freq, decoyAlphabets, null);
145
- if (decoy != ALPHABET.UN) return null;
156
+ if (decoy !== ALPHABET.UN) return null;
146
157
 
147
158
  const separator = this.detectSeparator(statsAsChars.freq);
159
+ if (this.checkForbiddenSeparator(separator)) return null;
160
+
148
161
  const units = separator ? NOTATION.SEPARATOR : NOTATION.FASTA;
149
162
  const gapSymbol = separator ? '' : '-';
150
163
  const splitter = separator ? this.getSplitterWithSeparator(separator, SEQ_SAMPLE_LENGTH_LIMIT) :
151
164
  this.getSplitterAsFasta(SEQ_SAMPLE_LENGTH_LIMIT);
152
165
 
153
166
  if (statsAsChars.sameLength) {
154
- const stats = this.getStats(categoriesSample, 5, splitter);
155
- const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, '-');
156
- if (alphabet === ALPHABET.UN) return null;
167
+ const stats = this.getStats(categoriesSample, seqMinLength, splitter);
168
+ const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, '-', colNameLikely);
169
+ if (alphabet === ALPHABET.UN && !colNameLikely) return null;
157
170
 
158
171
  col.setTag(DG.TAGS.UNITS, units);
159
172
  if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
160
173
  col.setTag(UnitsHandler.TAGS.aligned, ALIGNMENT.SEQ_MSA);
161
174
  col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
175
+ if (alphabet === ALPHABET.UN) {
176
+ const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
177
+ col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
178
+ }
162
179
  return DG.SEMTYPE.MACROMOLECULE;
163
180
  } else {
164
- const stats = this.getStats(categoriesSample, 5, splitter);
181
+ const stats = this.getStats(categoriesSample, seqMinLength, splitter);
182
+ const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
165
183
  // Empty monomer alphabet is not allowed
166
184
  if (Object.keys(stats.freq).length === 0) return null;
167
185
  // Long monomer names for sequences with separators have constraints
168
- if (separator && this.checkForbiddenWithSeparators(stats.freq)) return null;
186
+ if (
187
+ ((units === NOTATION.SEPARATOR || (units === NOTATION.FASTA && alphabetIsMultichar)) &&
188
+ this.checkForbiddenMultichar(stats.freq)) ||
189
+ ((units === NOTATION.FASTA && !alphabetIsMultichar) &&
190
+ this.checkForbiddenSinglechar(stats.freq))
191
+ ) return null;
169
192
 
170
193
  const aligned = stats.sameLength ? ALIGNMENT.SEQ_MSA : ALIGNMENT.SEQ;
171
194
 
172
195
  // TODO: If separator detected, then extra efforts to detect alphabet are allowed.
173
- const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol);
196
+ const alphabet = this.detectAlphabet(stats.freq, candidateAlphabets, gapSymbol, colNameLikely);
197
+ /* Likely column name allows detecting 'fasta' notation with 'UN' alphabet, 2023-04-13, atanas, askalkin */
198
+ if (units === NOTATION.FASTA && alphabet === ALPHABET.UN && !alphabetIsMultichar && !colNameLikely) return null;
174
199
 
175
200
  // const forbidden = this.checkForbiddenWoSeparator(stats.freq);
176
- if (separator || alphabet != 'UN') {
177
- col.setTag(DG.TAGS.UNITS, units);
178
- if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
179
- col.setTag(UnitsHandler.TAGS.aligned, aligned);
180
- col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
181
- if (alphabet === ALPHABET.UN) {
182
- // alphabetSize calculated on (sub)sample of data is incorrect
183
- // const alphabetSize = Object.keys(stats.freq).length;
184
- const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
185
- // col.setTag(UnitsHandler.TAGS.alphabetSize, alphabetSize.toString());
186
- col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
187
- }
188
- return DG.SEMTYPE.MACROMOLECULE;
201
+ col.setTag(DG.TAGS.UNITS, units);
202
+ if (separator) col.setTag(UnitsHandler.TAGS.separator, separator);
203
+ col.setTag(UnitsHandler.TAGS.aligned, aligned);
204
+ col.setTag(UnitsHandler.TAGS.alphabet, alphabet);
205
+ if (alphabet === ALPHABET.UN) {
206
+ // alphabetSize calculated on (sub)sample of data is incorrect
207
+ const alphabetIsMultichar = Object.keys(stats.freq).some((m) => m.length > 1);
208
+ col.setTag(UnitsHandler.TAGS.alphabetIsMultichar, alphabetIsMultichar ? 'true' : 'false');
189
209
  }
210
+ return DG.SEMTYPE.MACROMOLECULE;
190
211
  }
191
212
  } finally {
192
213
  const t2 = Date.now();
@@ -207,15 +228,15 @@ class BioPackageDetectors extends DG.Package {
207
228
  // !!! What is the difference between the gap symbol and separator symbol in stats terms?
208
229
  // const noSeparatorRe = /[a-z\d]+$/i;
209
230
  const noSeparatorChemRe = /[HBCNOFPSKVYI]/i; // Mendeleev's periodic table single char elements
210
- const noSeparatorAlphaDigitRe = /[\dA-Z,& _\r\n]/i; // ..., comma, ampersand, space, underscore, CR, LF
231
+ const noSeparatorAlphaDigitRe = /[\dA-Z]/i;
211
232
  const noSeparatorBracketsRe = /[\[\]()<>{}]/i;
212
233
  const cleanFreq = Object.assign({}, ...Object.entries(freq)
213
234
  .filter(([m, f]) =>
214
235
  !noSeparatorChemRe.test(m) && !noSeparatorAlphaDigitRe.test(m) && !noSeparatorBracketsRe.test(m) &&
215
- !this.PeptideFastaAlphabet.has(m) &&
216
- !this.DnaFastaAlphabet.has(m))
236
+ !this.peptideFastaAlphabet.has(m) &&
237
+ !this.dnaFastaAlphabet.has(m))
217
238
  .map(([m, f]) => ({[m]: f})));
218
- if (Object.keys(cleanFreq).length == 0) return null;
239
+ if (Object.keys(cleanFreq).length === 0) return null;
219
240
 
220
241
  const maxFreq = Math.max(...Object.values(cleanFreq));
221
242
 
@@ -227,12 +248,24 @@ class BioPackageDetectors extends DG.Package {
227
248
  return sepFreq / otherSumFreq > freqThreshold ? sep : null;
228
249
  }
229
250
 
230
- /** With a separator, spaces are nor allowed in monomer names.
251
+ checkForbiddenSeparator(separator) {
252
+ // dot, comma, ampersand, space, underscore, CR, LF
253
+ const forbiddenSepRe = / |\.|,|&|_|\r\n|\n/i;
254
+ return forbiddenSepRe.test(separator);
255
+ }
256
+
257
+ /** Spaces, dots and colons are not allowed in multichar monomer names.
231
258
  * The monomer name/label cannot contain digits only.
232
259
  */
233
- checkForbiddenWithSeparators(freq) {
234
- const forbiddenRe = /[ ]|^\d+$/i;
235
- return Object.keys(freq).filter((m) => forbiddenRe.test(m)).length > 0;
260
+ checkForbiddenMultichar(freq) {
261
+ const forbiddenRe = /[ .:]|^\d+$/i;
262
+ return Object.keys(freq).some((m) => forbiddenRe.test(m));
263
+ }
264
+
265
+ /** Space, dot, colon, semicolon, digit, underscore are not allowed as single char monomer names.*/
266
+ checkForbiddenSinglechar(freq) {
267
+ const forbiddenRe = /[ .:;\d_]/i;
268
+ return Object.keys(freq).some((m) => forbiddenRe.test(m));
236
269
  }
237
270
 
238
271
  // /** Without a separator, special symbols or digits are not allowed as monomers. */
@@ -250,17 +283,16 @@ class BioPackageDetectors extends DG.Package {
250
283
  for (const seq of values) {
251
284
  const mSeq = splitter(seq);
252
285
 
253
- if (firstLength == null) {
286
+ if (firstLength === null) {
287
+ //
254
288
  firstLength = mSeq.length;
255
289
  } else if (mSeq.length !== firstLength) {
256
290
  sameLength = false;
257
291
  }
258
292
 
259
- if (mSeq.length > minLength) {
293
+ if (mSeq.length >= minLength) {
260
294
  for (const m of mSeq) {
261
- if (!(m in freq)) {
262
- freq[m] = 0;
263
- }
295
+ if (!(m in freq)) freq[m] = 0;
264
296
  freq[m] += 1;
265
297
  }
266
298
  }
@@ -271,17 +303,18 @@ class BioPackageDetectors extends DG.Package {
271
303
  /** Detects alphabet for freq by freq similarity to alphabet monomer set.
272
304
  * @param freq frequencies of monomers in sequence set
273
305
  * @param candidates an array of pairs [name, monomer set]
274
- * */
275
- detectAlphabet(freq, candidates, gapSymbol) {
306
+ * @param {boolean} colNameLikely The column name suggests the column is Macromolecule more likely
307
+ */
308
+ detectAlphabet(freq, candidates, gapSymbol, colNameLikely = false) {
276
309
  const candidatesSims = candidates.map((c) => {
277
- const sim = this.getAlphabetSimilarity(freq, c[1], gapSymbol);
310
+ const sim = this.getAlphabetSimilarity(freq, c[1], gapSymbol) + (colNameLikely ? 0.15 : 0);
278
311
  return [c[0], c[1], c[2], freq, sim];
279
312
  });
280
313
 
281
314
  let alphabetName;
282
315
  const maxSim = Math.max(...candidatesSims.map((cs) => cs[4] > cs[2] ? cs[4] : -1));
283
316
  if (maxSim > 0) {
284
- const sim = candidatesSims.find((cs) => cs[4] == maxSim);
317
+ const sim = candidatesSims.find((cs) => cs[4] === maxSim);
285
318
  alphabetName = sim[0];
286
319
  } else {
287
320
  alphabetName = ALPHABET.UN;
@@ -306,20 +339,19 @@ class BioPackageDetectors extends DG.Package {
306
339
 
307
340
  vectorLength(v) {
308
341
  let sqrSum = 0;
309
- for (let i = 0; i < v.length; i++) {
342
+ for (let i = 0; i < v.length; i++)
310
343
  sqrSum += v[i] * v[i];
311
- }
312
344
  return Math.sqrt(sqrSum);
313
345
  }
314
346
 
315
347
  vectorDotProduct(v1, v2) {
316
- if (v1.length != v2.length) {
348
+ if (v1.length !== v2.length)
317
349
  throw Error('The dimensionality of the vectors must match');
318
- }
350
+
319
351
  let prod = 0;
320
- for (let i = 0; i < v1.length; i++) {
352
+ for (let i = 0; i < v1.length; i++)
321
353
  prod += v1[i] * v2[i];
322
- }
354
+
323
355
  return prod;
324
356
  }
325
357
 
@@ -327,7 +359,7 @@ class BioPackageDetectors extends DG.Package {
327
359
  getSplitterAsChars(lengthLimit) {
328
360
  return function(seq) {
329
361
  return seq.split('', lengthLimit);
330
- }.bind(this);
362
+ };
331
363
  }
332
364
 
333
365
  getSplitterWithSeparator(separator, lengthLimit) {
@@ -346,11 +378,11 @@ class BioPackageDetectors extends DG.Package {
346
378
  // } else {
347
379
  return seq.split(separator, lengthLimit);
348
380
  // }
349
- }.bind(this);
381
+ };
350
382
  }
351
383
 
352
384
  // Multichar monomer names in square brackets, single char monomers or gap symbol
353
- monomerRe = /\[(\w+)\]|(\w)|(-)/g;
385
+ monomerRe = /\[(\w+)\]|(.)/g;
354
386
 
355
387
  /** Split sequence for single character monomers, square brackets multichar monomer names or gap symbol. */
356
388
  getSplitterAsFasta(lengthLimit) {
@@ -360,11 +392,11 @@ class BioPackageDetectors extends DG.Package {
360
392
  .map((ma) => {
361
393
  let mRes;
362
394
  const m = ma[0];
363
- if (m.length > 1) {
395
+ if (m.length > 1)
364
396
  mRes = ma[1];
365
- } else {
397
+ else
366
398
  mRes = m;
367
- }
399
+
368
400
  return mRes;
369
401
  }).toArray();
370
402
 
@@ -391,11 +423,10 @@ class BioPackageDetectors extends DG.Package {
391
423
  const mmPostProcess = (mm) => {
392
424
  this.helmPp1Re.lastIndex = 0;
393
425
  const pp1M = this.helmPp1Re.exec(mm);
394
- if (pp1M && pp1M.length >= 2) {
426
+ if (pp1M && pp1M.length >= 2)
395
427
  return pp1M[1];
396
- } else {
428
+ else
397
429
  return mm;
398
- }
399
430
  };
400
431
 
401
432
  const mmList = inSeq ? inSeq.split('.') : [];
@@ -405,16 +436,13 @@ class BioPackageDetectors extends DG.Package {
405
436
  }
406
437
 
407
438
  sample(src, n) {
408
- if (src.length < n) {
439
+ if (src.length < n)
409
440
  throw new Error('Sample source is less than n requested.');
410
- }
411
441
 
412
442
  const idxSet = new Set();
413
443
  while (idxSet.size < n) {
414
444
  const idx = Math.floor(Math.random() * src.length);
415
- if (!idxSet.has(idx)) {
416
- idxSet.add(idx);
417
- }
445
+ if (!idxSet.has(idx)) idxSet.add(idx);
418
446
  }
419
447
 
420
448
  return [...idxSet].map((idx) => src[idx]);