@datagrok-libraries/bio 5.39.28 → 5.40.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.json +1 -1
- package/CHANGELOG.md +21 -0
- package/package.json +2 -2
- package/src/monomer-works/monomer-utils.d.ts.map +1 -1
- package/src/monomer-works/monomer-utils.js +34 -31
- package/src/monomer-works/monomer-utils.js.map +1 -1
- package/src/monomer-works/to-atomic-level.d.ts +4 -4
- package/src/monomer-works/to-atomic-level.d.ts.map +1 -1
- package/src/monomer-works/to-atomic-level.js +37 -38
- package/src/monomer-works/to-atomic-level.js.map +1 -1
- package/src/trees/consts.d.ts +1 -0
- package/src/trees/consts.d.ts.map +1 -1
- package/src/trees/consts.js +1 -0
- package/src/trees/consts.js.map +1 -1
- package/src/utils/cell-renderer-monomer-placer.d.ts +2 -3
- package/src/utils/cell-renderer-monomer-placer.d.ts.map +1 -1
- package/src/utils/cell-renderer-monomer-placer.js +13 -11
- package/src/utils/cell-renderer-monomer-placer.js.map +1 -1
- package/src/utils/cell-renderer.d.ts +2 -3
- package/src/utils/cell-renderer.d.ts.map +1 -1
- package/src/utils/cell-renderer.js +9 -8
- package/src/utils/cell-renderer.js.map +1 -1
- package/src/utils/fasta-handler.js +2 -2
- package/src/utils/fasta-handler.js.map +1 -1
- package/src/utils/macromolecule/alignment.d.ts +4 -3
- package/src/utils/macromolecule/alignment.d.ts.map +1 -1
- package/src/utils/macromolecule/alignment.js +25 -18
- package/src/utils/macromolecule/alignment.js.map +1 -1
- package/src/utils/macromolecule/consts.d.ts +2 -0
- package/src/utils/macromolecule/consts.d.ts.map +1 -1
- package/src/utils/macromolecule/consts.js +2 -0
- package/src/utils/macromolecule/consts.js.map +1 -1
- package/src/utils/macromolecule/index.d.ts +1 -1
- package/src/utils/macromolecule/index.d.ts.map +1 -1
- package/src/utils/macromolecule/index.js +1 -1
- package/src/utils/macromolecule/index.js.map +1 -1
- package/src/utils/macromolecule/scoring.d.ts +1 -1
- package/src/utils/macromolecule/scoring.d.ts.map +1 -1
- package/src/utils/macromolecule/scoring.js +7 -5
- package/src/utils/macromolecule/scoring.js.map +1 -1
- package/src/utils/macromolecule/types.d.ts +10 -2
- package/src/utils/macromolecule/types.d.ts.map +1 -1
- package/src/utils/macromolecule/types.js +2 -0
- package/src/utils/macromolecule/types.js.map +1 -1
- package/src/utils/macromolecule/utils.d.ts +30 -12
- package/src/utils/macromolecule/utils.d.ts.map +1 -1
- package/src/utils/macromolecule/utils.js +67 -40
- package/src/utils/macromolecule/utils.js.map +1 -1
- package/src/utils/{units-handler.d.ts → seq-handler.d.ts} +39 -20
- package/src/utils/seq-handler.d.ts.map +1 -0
- package/src/utils/{units-handler.js → seq-handler.js} +283 -207
- package/src/utils/seq-handler.js.map +1 -0
- package/src/utils/splitter.d.ts.map +1 -1
- package/src/utils/splitter.js +8 -11
- package/src/utils/splitter.js.map +1 -1
- package/src/utils/units-handler.d.ts.map +0 -1
- package/src/utils/units-handler.js.map +0 -1
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import * as DG from 'datagrok-api/dg';
|
|
2
2
|
import wu from 'wu';
|
|
3
|
-
import { NOTATION, candidateAlphabets, positionSeparator } from './macromolecule';
|
|
4
|
-
import {
|
|
3
|
+
import { NOTATION, candidateAlphabets, positionSeparator, splitterAsFasta, getSplitterWithSeparator, splitterAsHelm, } from './macromolecule';
|
|
4
|
+
import { GAP_SYMBOL, } from './macromolecule/types';
|
|
5
|
+
import { detectAlphabet, splitterAsFastaSimple, StringListSeqSplitted } from './macromolecule/utils';
|
|
5
6
|
import { mmDistanceFunctions, MmDistanceFunctionsNames } from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
|
|
6
7
|
import { getMonomerLibHelper } from '../monomer-works/monomer-utils';
|
|
7
8
|
import { HELM_WRAPPERS_REGEXP, PHOSPHATE_SYMBOL } from './const';
|
|
@@ -11,7 +12,7 @@ export const Temps = new class {
|
|
|
11
12
|
this.uh = `units-handler.${DG.SEMTYPE.MACROMOLECULE}`;
|
|
12
13
|
}
|
|
13
14
|
}();
|
|
14
|
-
export const
|
|
15
|
+
export const GapOriginals = {
|
|
15
16
|
[NOTATION.FASTA]: '-',
|
|
16
17
|
[NOTATION.SEPARATOR]: '',
|
|
17
18
|
[NOTATION.HELM]: '*',
|
|
@@ -19,12 +20,66 @@ export const GapSymbols = {
|
|
|
19
20
|
/** Class for handling notation units in Macromolecule columns and
|
|
20
21
|
* conversion of notation systems in Macromolecule columns
|
|
21
22
|
*/
|
|
22
|
-
export class
|
|
23
|
+
export class SeqHandler {
|
|
24
|
+
constructor(col) {
|
|
25
|
+
this._splitter = null;
|
|
26
|
+
this.cached = true;
|
|
27
|
+
this._splitted = null;
|
|
28
|
+
this.columnVersion = null;
|
|
29
|
+
this._stats = null;
|
|
30
|
+
this._maxLength = null;
|
|
31
|
+
this._posList = null;
|
|
32
|
+
this._joiner = undefined;
|
|
33
|
+
if (col.type !== DG.TYPE.STRING)
|
|
34
|
+
throw new Error(`Unexpected column type '${col.type}', must be '${DG.TYPE.STRING}'.`);
|
|
35
|
+
this._column = col;
|
|
36
|
+
this._columnVersion = col.version;
|
|
37
|
+
const units = this._column.getTag(DG.TAGS.UNITS);
|
|
38
|
+
if (units !== null && units !== undefined)
|
|
39
|
+
this._units = units;
|
|
40
|
+
else
|
|
41
|
+
throw new Error('Units are not specified in column');
|
|
42
|
+
this._notation = this.getNotation();
|
|
43
|
+
this._defaultGapOriginal = (this.isFasta()) ? GapOriginals[NOTATION.FASTA] :
|
|
44
|
+
(this.isHelm()) ? GapOriginals[NOTATION.HELM] :
|
|
45
|
+
GapOriginals[NOTATION.SEPARATOR];
|
|
46
|
+
if (!this.column.tags.has("aligned" /* TAGS.aligned */) || !this.column.tags.has("alphabet" /* TAGS.alphabet */) ||
|
|
47
|
+
(!this.column.tags.has(".alphabetIsMultichar" /* TAGS.alphabetIsMultichar */) && !this.isHelm() && this.alphabet === "UN" /* ALPHABET.UN */)) {
|
|
48
|
+
// The following detectors and setters are to be called because the column is likely
|
|
49
|
+
// as the UnitsHandler constructor was called on the column.
|
|
50
|
+
if (this.isFasta())
|
|
51
|
+
SeqHandler.setUnitsToFastaColumn(this);
|
|
52
|
+
else if (this.isSeparator()) {
|
|
53
|
+
const separator = col.getTag("separator" /* TAGS.separator */);
|
|
54
|
+
SeqHandler.setUnitsToSeparatorColumn(this, separator);
|
|
55
|
+
}
|
|
56
|
+
else if (this.isHelm())
|
|
57
|
+
SeqHandler.setUnitsToHelmColumn(this);
|
|
58
|
+
else
|
|
59
|
+
throw new Error(`Unexpected units '${this.column.getTag(DG.TAGS.UNITS)}'.`);
|
|
60
|
+
}
|
|
61
|
+
// if (!this.column.tags.has(TAGS.alphabetSize)) {
|
|
62
|
+
// if (this.isHelm())
|
|
63
|
+
// throw new Error(`For column '${this.column.name}' of notation '${this.notation}' ` +
|
|
64
|
+
// `tag '${TAGS.alphabetSize}' is mandatory.`);
|
|
65
|
+
// else if (['UN'].includes(this.alphabet))
|
|
66
|
+
// throw new Error(`For column '${this.column.name}' of alphabet '${this.alphabet}' ` +
|
|
67
|
+
// `tag '${TAGS.alphabetSize}' is mandatory.`);
|
|
68
|
+
// }
|
|
69
|
+
if (!this.column.tags.has(".alphabetIsMultichar" /* TAGS.alphabetIsMultichar */)) {
|
|
70
|
+
if (this.isHelm())
|
|
71
|
+
this.column.setTag(".alphabetIsMultichar" /* TAGS.alphabetIsMultichar */, 'true');
|
|
72
|
+
else if (['UN'].includes(this.alphabet)) {
|
|
73
|
+
throw new Error(`For column '${this.column.name}' of alphabet '${this.alphabet}' ` +
|
|
74
|
+
`tag '${".alphabetIsMultichar" /* TAGS.alphabetIsMultichar */}' is mandatory.`);
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
}
|
|
23
78
|
static setUnitsToFastaColumn(uh) {
|
|
24
79
|
if (uh.column.semType !== DG.SEMTYPE.MACROMOLECULE || uh.column.getTag(DG.TAGS.UNITS) !== NOTATION.FASTA)
|
|
25
80
|
throw new Error(`The column of notation '${NOTATION.FASTA}' must be '${DG.SEMTYPE.MACROMOLECULE}'.`);
|
|
26
81
|
uh.column.setTag(DG.TAGS.UNITS, NOTATION.FASTA);
|
|
27
|
-
|
|
82
|
+
SeqHandler.setTags(uh);
|
|
28
83
|
}
|
|
29
84
|
static setUnitsToSeparatorColumn(uh, separator) {
|
|
30
85
|
if (uh.column.semType !== DG.SEMTYPE.MACROMOLECULE || uh.column.getTag(DG.TAGS.UNITS) !== NOTATION.SEPARATOR)
|
|
@@ -33,13 +88,13 @@ export class UnitsHandler {
|
|
|
33
88
|
throw new Error(`The column of notation '${NOTATION.SEPARATOR}' must have the separator tag.`);
|
|
34
89
|
uh.column.setTag(DG.TAGS.UNITS, NOTATION.SEPARATOR);
|
|
35
90
|
uh.column.setTag("separator" /* TAGS.separator */, separator);
|
|
36
|
-
|
|
91
|
+
SeqHandler.setTags(uh);
|
|
37
92
|
}
|
|
38
93
|
static setUnitsToHelmColumn(uh) {
|
|
39
94
|
if (uh.column.semType !== DG.SEMTYPE.MACROMOLECULE)
|
|
40
95
|
throw new Error(`The column of notation '${NOTATION.HELM}' must be '${DG.SEMTYPE.MACROMOLECULE}'`);
|
|
41
96
|
uh.column.setTag(DG.TAGS.UNITS, NOTATION.HELM);
|
|
42
|
-
|
|
97
|
+
SeqHandler.setTags(uh);
|
|
43
98
|
}
|
|
44
99
|
/** From detectMacromolecule */
|
|
45
100
|
static setTags(uh) {
|
|
@@ -69,9 +124,10 @@ export class UnitsHandler {
|
|
|
69
124
|
}
|
|
70
125
|
}
|
|
71
126
|
get column() { return this._column; }
|
|
127
|
+
get length() { return this._column.length; }
|
|
72
128
|
get units() { return this._units; }
|
|
73
129
|
get notation() { return this._notation; }
|
|
74
|
-
get
|
|
130
|
+
get defaultGapOriginal() { return this._defaultGapOriginal; }
|
|
75
131
|
get separator() {
|
|
76
132
|
const separator = this.column.getTag("separator" /* TAGS.separator */) ?? undefined;
|
|
77
133
|
if (this.notation === NOTATION.SEPARATOR && separator === undefined)
|
|
@@ -132,35 +188,57 @@ export class UnitsHandler {
|
|
|
132
188
|
else
|
|
133
189
|
return this.column.getTag(".alphabetIsMultichar" /* TAGS.alphabetIsMultichar */) === 'true';
|
|
134
190
|
}
|
|
135
|
-
/** */
|
|
136
|
-
get splitted() {
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
191
|
+
// /** */
|
|
192
|
+
// public get splitted(): ISeqSplitted[] {
|
|
193
|
+
// // TODO: Disable cache or invalidate on changing data
|
|
194
|
+
// if (this._splitted === null) {
|
|
195
|
+
// const splitter = this.splitter;
|
|
196
|
+
// const colLength: number = this._column.length;
|
|
197
|
+
// this._splitted = new Array(colLength);
|
|
198
|
+
// const catIdxList = this._column.getRawData();
|
|
199
|
+
// const catList: string[] = this._column.categories;
|
|
200
|
+
// for (let rowIdx: number = 0; rowIdx < colLength; rowIdx++) {
|
|
201
|
+
// const seq: string = catList[catIdxList[rowIdx]];
|
|
202
|
+
// this._splitted[rowIdx] = splitter(seq);
|
|
203
|
+
// }
|
|
204
|
+
// }
|
|
205
|
+
// return this._splitted;
|
|
206
|
+
// }
|
|
207
|
+
getSplitted(rowIdx) {
|
|
208
|
+
if (!this.cached) {
|
|
209
|
+
const seq = this.column.get(rowIdx);
|
|
210
|
+
return this.splitter(seq);
|
|
211
|
+
}
|
|
212
|
+
else {
|
|
213
|
+
if (this.column.version !== this.columnVersion || this._splitted === null) {
|
|
214
|
+
this.columnVersion = this.column.version;
|
|
215
|
+
this._splitted = new Array(this.column.length);
|
|
146
216
|
}
|
|
217
|
+
let resSS = this._splitted[rowIdx] ? this._splitted[rowIdx].deref() : undefined;
|
|
218
|
+
if (!resSS) {
|
|
219
|
+
const seq = this.column.get(rowIdx);
|
|
220
|
+
resSS = this.splitter(seq);
|
|
221
|
+
this._splitted[rowIdx] = new WeakRef(resSS);
|
|
222
|
+
}
|
|
223
|
+
return resSS;
|
|
147
224
|
}
|
|
148
|
-
return this._splitted;
|
|
149
225
|
}
|
|
150
226
|
get stats() {
|
|
151
227
|
if (this._stats === null) {
|
|
152
228
|
const freq = {};
|
|
153
229
|
let sameLength = true;
|
|
154
230
|
let firstLength = null;
|
|
155
|
-
|
|
231
|
+
const colLen = this.column.length;
|
|
232
|
+
for (let rowIdx = 0; rowIdx < colLen; ++rowIdx) {
|
|
233
|
+
const mSeq = this.getSplitted(rowIdx);
|
|
156
234
|
if (firstLength == null)
|
|
157
235
|
firstLength = mSeq.length;
|
|
158
236
|
else if (mSeq.length !== firstLength)
|
|
159
237
|
sameLength = false;
|
|
160
|
-
for (const
|
|
161
|
-
if (!(
|
|
162
|
-
freq[
|
|
163
|
-
freq[
|
|
238
|
+
for (const cm of mSeq.canonicals) {
|
|
239
|
+
if (!(cm in freq))
|
|
240
|
+
freq[cm] = 0;
|
|
241
|
+
freq[cm] += 1;
|
|
164
242
|
}
|
|
165
243
|
}
|
|
166
244
|
this._stats = { freq: freq, sameLength: sameLength };
|
|
@@ -169,8 +247,8 @@ export class UnitsHandler {
|
|
|
169
247
|
}
|
|
170
248
|
get maxLength() {
|
|
171
249
|
if (this._maxLength === null) {
|
|
172
|
-
this._maxLength = this.
|
|
173
|
-
Math.max(...this.
|
|
250
|
+
this._maxLength = this.column.length === 0 ? 0 :
|
|
251
|
+
Math.max(...wu.count(0).take(this.column.length).map((rowIdx) => this.getSplitted(rowIdx).length));
|
|
174
252
|
}
|
|
175
253
|
return this._maxLength;
|
|
176
254
|
}
|
|
@@ -190,9 +268,12 @@ export class UnitsHandler {
|
|
|
190
268
|
isPeptide() { return this.alphabet === "PT" /* ALPHABET.PT */; }
|
|
191
269
|
isMsa() { return this.aligned ? this.aligned.toUpperCase().includes('MSA') : false; }
|
|
192
270
|
isHelmCompatible() { return this.helmCompatible === 'true'; }
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
271
|
+
/** Checks {@link om} for being a gap
|
|
272
|
+
* @param {string} om Original monomer of sequence symbol
|
|
273
|
+
* @return {boolean}
|
|
274
|
+
*/
|
|
275
|
+
isGap(om) {
|
|
276
|
+
return !om || om === this._defaultGapOriginal;
|
|
196
277
|
}
|
|
197
278
|
/** Associate notation types with the corresponding units */
|
|
198
279
|
/**
|
|
@@ -246,7 +327,9 @@ export class UnitsHandler {
|
|
|
246
327
|
const srcAligned = col.getTag("aligned" /* TAGS.aligned */);
|
|
247
328
|
if (srcAligned)
|
|
248
329
|
newColumn.setTag("aligned" /* TAGS.aligned */, srcAligned);
|
|
249
|
-
|
|
330
|
+
let srcAlphabet = col.getTag("alphabet" /* TAGS.alphabet */);
|
|
331
|
+
if (!srcAlphabet && this.notation === NOTATION.HELM && tgtNotation !== NOTATION.HELM)
|
|
332
|
+
srcAlphabet = "UN" /* ALPHABET.UN */;
|
|
250
333
|
if (srcAlphabet != null)
|
|
251
334
|
newColumn.setTag("alphabet" /* TAGS.alphabet */, srcAlphabet);
|
|
252
335
|
let srcAlphabetSize = col.getTag(".alphabetSize" /* TAGS.alphabetSize */);
|
|
@@ -261,8 +344,9 @@ export class UnitsHandler {
|
|
|
261
344
|
}
|
|
262
345
|
return newColumn;
|
|
263
346
|
}
|
|
264
|
-
|
|
265
|
-
|
|
347
|
+
/** Creates a new column on data of {@link seqList} with the same tags */
|
|
348
|
+
getNewColumnFromList(name, seqList) {
|
|
349
|
+
return this.getNewColumn(this.notation, this.separator, name, seqList);
|
|
266
350
|
}
|
|
267
351
|
/**
|
|
268
352
|
* Create a new empty column using templateCol as a template
|
|
@@ -272,7 +356,7 @@ export class UnitsHandler {
|
|
|
272
356
|
* @return {DG.Column}
|
|
273
357
|
*/
|
|
274
358
|
static getNewColumn(templateCol) {
|
|
275
|
-
const col =
|
|
359
|
+
const col = SeqHandler.forColumn(templateCol);
|
|
276
360
|
const targetNotation = col.notation;
|
|
277
361
|
return col.getNewColumn(targetNotation);
|
|
278
362
|
}
|
|
@@ -302,7 +386,7 @@ export class UnitsHandler {
|
|
|
302
386
|
// WARNING: in this implementation it is impossible to verify the uniqueness
|
|
303
387
|
// of the new column's name
|
|
304
388
|
// TODO: verify the validity of units parameter
|
|
305
|
-
if (!
|
|
389
|
+
if (!SeqHandler.unitsStringIsValid(units))
|
|
306
390
|
throw new Error('Invalid format of \'units\' parameter');
|
|
307
391
|
const newColumn = DG.Column.fromList('string', name, new Array(len).fill(''));
|
|
308
392
|
newColumn.semType = DG.SEMTYPE.MACROMOLECULE;
|
|
@@ -326,6 +410,9 @@ export class UnitsHandler {
|
|
|
326
410
|
throw new Error(`Unexpected units ${this.units} .`);
|
|
327
411
|
// TODO: Splitter for HELM
|
|
328
412
|
}
|
|
413
|
+
split(seq) {
|
|
414
|
+
return this.splitter(seq);
|
|
415
|
+
}
|
|
329
416
|
getDistanceFunctionName() {
|
|
330
417
|
// TODO add support for helm and separator notation
|
|
331
418
|
if (!this.isFasta())
|
|
@@ -333,10 +420,10 @@ export class UnitsHandler {
|
|
|
333
420
|
if (this.isMsa())
|
|
334
421
|
return MmDistanceFunctionsNames.HAMMING;
|
|
335
422
|
switch (this.alphabet) {
|
|
336
|
-
// As DNA and RNA scoring matrices are same as identity matrices(mostly),
|
|
337
|
-
// we can use very fast and optimized Levenshtein distance library
|
|
338
423
|
case "DNA" /* ALPHABET.DNA */:
|
|
339
424
|
case "RNA" /* ALPHABET.RNA */:
|
|
425
|
+
// As DNA and RNA scoring matrices are same as identity matrices(mostly),
|
|
426
|
+
// we can use very fast and optimized Levenshtein distance library
|
|
340
427
|
return MmDistanceFunctionsNames.LEVENSHTEIN;
|
|
341
428
|
case "PT" /* ALPHABET.PT */:
|
|
342
429
|
return MmDistanceFunctionsNames.LEVENSHTEIN;
|
|
@@ -353,7 +440,7 @@ export class UnitsHandler {
|
|
|
353
440
|
// check first for the column tag to avoid extra processing
|
|
354
441
|
if (this.column.tags.has(".isHelmCompatible" /* TAGS.isHelmCompatible */))
|
|
355
442
|
return this.column.getTag(".isHelmCompatible" /* TAGS.isHelmCompatible */) === 'true';
|
|
356
|
-
// get the
|
|
443
|
+
// get the monomer lib and check against the column
|
|
357
444
|
const monomerLibHelper = await getMonomerLibHelper();
|
|
358
445
|
const bioLib = monomerLibHelper.getBioLib();
|
|
359
446
|
// retrieve peptides
|
|
@@ -363,14 +450,21 @@ export class UnitsHandler {
|
|
|
363
450
|
// get splitter for given separator and check if all monomers are in the lib
|
|
364
451
|
const splitterFunc = getSplitterWithSeparator(this.separator);
|
|
365
452
|
// iterate over the columns, split them and check if all monomers are in the lib
|
|
366
|
-
//TODO maybe add missing
|
|
453
|
+
//TODO maybe add missing threshold so that if there are not too many missing monomers
|
|
367
454
|
// the column is still considered helm compatible
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
455
|
+
const catIdxSet = new Set();
|
|
456
|
+
const rowCount = this.column.length;
|
|
457
|
+
const colRawData = this.column.getRawData();
|
|
458
|
+
for (let rowIdx = 0; rowIdx < rowCount; ++rowIdx) {
|
|
459
|
+
const catI = colRawData[rowIdx];
|
|
460
|
+
if (!(catI in catIdxSet)) {
|
|
461
|
+
catIdxSet.add(catI);
|
|
462
|
+
const monomers = this.getSplitted(rowIdx);
|
|
463
|
+
for (const cm of monomers.canonicals) {
|
|
464
|
+
if (!peptidesSet.has(cm)) {
|
|
465
|
+
this.column.setTag(".isHelmCompatible" /* TAGS.isHelmCompatible */, 'false');
|
|
466
|
+
return false;
|
|
467
|
+
}
|
|
374
468
|
}
|
|
375
469
|
}
|
|
376
470
|
}
|
|
@@ -380,7 +474,7 @@ export class UnitsHandler {
|
|
|
380
474
|
// -- Notation Converter --
|
|
381
475
|
get splitter() {
|
|
382
476
|
if (this._splitter === null)
|
|
383
|
-
this._splitter =
|
|
477
|
+
this._splitter = this.getSplitter();
|
|
384
478
|
return this._splitter;
|
|
385
479
|
}
|
|
386
480
|
toFasta(targetNotation) { return targetNotation === NOTATION.FASTA; }
|
|
@@ -389,38 +483,38 @@ export class UnitsHandler {
|
|
|
389
483
|
/**
|
|
390
484
|
* Convert HELM string to FASTA/SEPARATOR
|
|
391
485
|
*
|
|
392
|
-
* @param {string}
|
|
486
|
+
* @param {string} srcSeq A string to be converted
|
|
393
487
|
* @param {string} tgtNotation Target notation: FASTA or SEPARATOR
|
|
394
488
|
* @param {string} tgtSeparator Optional target separator (for HELM ->
|
|
395
|
-
* @param {string | null}
|
|
489
|
+
* @param {string | null} tgtGapOriginal Optional target gap symbol
|
|
396
490
|
* SEPARATOR)
|
|
397
491
|
* @return {string} Converted string
|
|
398
492
|
*/
|
|
399
|
-
convertHelmToFastaSeparator(
|
|
400
|
-
if (!
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
493
|
+
convertHelmToFastaSeparator(srcSeq, tgtNotation, tgtSeparator, tgtGapOriginal) {
|
|
494
|
+
if (!tgtGapOriginal) {
|
|
495
|
+
tgtGapOriginal = (this.toFasta(tgtNotation)) ?
|
|
496
|
+
GapOriginals[NOTATION.FASTA] :
|
|
497
|
+
GapOriginals[NOTATION.SEPARATOR];
|
|
404
498
|
}
|
|
405
499
|
if (!tgtSeparator)
|
|
406
500
|
tgtSeparator = (this.toFasta(tgtNotation)) ? '' : this.separator;
|
|
407
|
-
const isNucleotide =
|
|
501
|
+
const isNucleotide = srcSeq.startsWith('RNA');
|
|
408
502
|
// items can be monomers or helms
|
|
409
|
-
const helmItemsArray = this.splitter(
|
|
503
|
+
const helmItemsArray = this.splitter(srcSeq);
|
|
410
504
|
const tgtMonomersArray = [];
|
|
411
|
-
for (let
|
|
412
|
-
let
|
|
505
|
+
for (let posIdx = 0; posIdx < helmItemsArray.length; ++posIdx) {
|
|
506
|
+
let om = helmItemsArray.getOriginal(posIdx);
|
|
413
507
|
if (isNucleotide)
|
|
414
|
-
|
|
415
|
-
if (
|
|
416
|
-
tgtMonomersArray.push(
|
|
417
|
-
else if (this.toFasta(tgtNotation) &&
|
|
508
|
+
om = om.replace(HELM_WRAPPERS_REGEXP, '');
|
|
509
|
+
if (om === GapOriginals[NOTATION.HELM])
|
|
510
|
+
tgtMonomersArray.push(tgtGapOriginal);
|
|
511
|
+
else if (this.toFasta(tgtNotation) && om.length > 1) {
|
|
418
512
|
// the case of a multi-character monomer converted to FASTA
|
|
419
|
-
const monomer = '[' +
|
|
513
|
+
const monomer = '[' + om + ']';
|
|
420
514
|
tgtMonomersArray.push(monomer);
|
|
421
515
|
}
|
|
422
516
|
else
|
|
423
|
-
tgtMonomersArray.push(
|
|
517
|
+
tgtMonomersArray.push(om);
|
|
424
518
|
}
|
|
425
519
|
return tgtMonomersArray.join(tgtSeparator);
|
|
426
520
|
}
|
|
@@ -431,14 +525,15 @@ export class UnitsHandler {
|
|
|
431
525
|
* @return {DG.Column} Converted column
|
|
432
526
|
*/
|
|
433
527
|
convert(tgtNotation, tgtSeparator) {
|
|
434
|
-
|
|
528
|
+
// Get joiner from the source column units handler (this) knowing about the source sequence.
|
|
529
|
+
// For example, converting DNA Helm to fasta requires removing the r(X)p decoration.
|
|
530
|
+
const joiner = this.getJoiner({ notation: tgtNotation, separator: tgtSeparator });
|
|
435
531
|
const newColumn = this.getNewColumn(tgtNotation, tgtSeparator);
|
|
436
532
|
// assign the values to the newly created empty column
|
|
437
|
-
newColumn.init((
|
|
438
|
-
const
|
|
439
|
-
return
|
|
533
|
+
newColumn.init((rowIdx) => {
|
|
534
|
+
const srcSS = this.getSplitted(rowIdx);
|
|
535
|
+
return joiner(srcSS);
|
|
440
536
|
});
|
|
441
|
-
// newColumn.setTag(DG.TAGS.UNITS, NOTATION.SEPARATOR);
|
|
442
537
|
return newColumn;
|
|
443
538
|
}
|
|
444
539
|
/**
|
|
@@ -449,20 +544,20 @@ export class UnitsHandler {
|
|
|
449
544
|
getRegion(startIdx, endIdx, name) {
|
|
450
545
|
const regCol = this.getNewColumn(this.notation, this.separator);
|
|
451
546
|
regCol.name = name;
|
|
452
|
-
const maxLength = Math.max(...this.splitted.map((seqS) => seqS.length));
|
|
453
547
|
const startIdxVal = startIdx ?? 0;
|
|
454
548
|
const endIdxVal = endIdx ?? this.maxLength - 1;
|
|
455
549
|
const join = this.getJoiner();
|
|
456
550
|
const regLength = endIdxVal - startIdxVal + 1;
|
|
457
551
|
regCol.init((rowI) => {
|
|
458
|
-
const seqS = this.
|
|
552
|
+
const seqS = this.getSplitted(rowI);
|
|
459
553
|
// Custom slicing instead of array method to maintain gaps
|
|
460
|
-
const
|
|
554
|
+
const regOMList = new Array(regLength);
|
|
461
555
|
for (let regJPos = 0; regJPos < regLength; ++regJPos) {
|
|
462
556
|
const seqJPos = startIdxVal + regJPos;
|
|
463
|
-
|
|
557
|
+
const seqOM = seqS.getOriginal(seqJPos);
|
|
558
|
+
regOMList[regJPos] = seqJPos < seqS.length ? seqOM : GapOriginals[this.notation];
|
|
464
559
|
}
|
|
465
|
-
return join(
|
|
560
|
+
return join(new StringListSeqSplitted(regOMList, GapOriginals[this.notation]));
|
|
466
561
|
});
|
|
467
562
|
const getRegionOfPositionNames = (str) => {
|
|
468
563
|
const srcPosList = str.split(',').map((p) => p.trim());
|
|
@@ -481,159 +576,140 @@ export class UnitsHandler {
|
|
|
481
576
|
regCol.setTag(".positionLabels" /* TAGS.positionLabels */, getRegionOfPositionNames(srcPositionLabelsStr));
|
|
482
577
|
return regCol;
|
|
483
578
|
}
|
|
484
|
-
|
|
485
|
-
if (this._joiner
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
579
|
+
get joiner() {
|
|
580
|
+
if (!this._joiner)
|
|
581
|
+
this._joiner = this.getJoiner();
|
|
582
|
+
return this._joiner;
|
|
583
|
+
}
|
|
584
|
+
getJoiner(opts) {
|
|
585
|
+
const notation = opts ? opts.notation : this.notation;
|
|
586
|
+
const separator = opts ? opts.separator : this.separator;
|
|
587
|
+
let res;
|
|
588
|
+
const srcSh = this;
|
|
589
|
+
switch (notation) {
|
|
590
|
+
case NOTATION.FASTA: {
|
|
591
|
+
res = function (srcSS) { return srcSh.joinToFasta(srcSS, srcSh.isHelm()); };
|
|
592
|
+
break;
|
|
494
593
|
}
|
|
495
|
-
|
|
496
|
-
|
|
594
|
+
case NOTATION.SEPARATOR: {
|
|
595
|
+
if (!separator)
|
|
596
|
+
throw new Error(`Separator is mandatory for notation '${notation}'.`);
|
|
597
|
+
res = function (srcSS) { return joinToSeparator(srcSS, separator, srcSh.isHelm()); };
|
|
598
|
+
break;
|
|
599
|
+
}
|
|
600
|
+
case NOTATION.HELM: {
|
|
601
|
+
const isDnaOrRna = srcSh.alphabet === "DNA" /* ALPHABET.DNA */ || srcSh.alphabet === "RNA" /* ALPHABET.RNA */;
|
|
602
|
+
const wrappers = srcSh.getHelmWrappers();
|
|
603
|
+
res = function (srcSS) { return joinToHelm(srcSS, wrappers, isDnaOrRna); };
|
|
604
|
+
break;
|
|
605
|
+
}
|
|
606
|
+
default:
|
|
607
|
+
throw new Error(`Unexpected notation '${notation}'.`);
|
|
497
608
|
}
|
|
498
|
-
return
|
|
609
|
+
return res;
|
|
499
610
|
}
|
|
500
611
|
getConverter(tgtUnits, tgtSeparator = undefined) {
|
|
501
612
|
if (tgtUnits === NOTATION.SEPARATOR && !tgtSeparator)
|
|
502
613
|
throw new Error(`Target separator is not specified for target units '${NOTATION.SEPARATOR}'.`);
|
|
503
|
-
const
|
|
614
|
+
const srcSh = this;
|
|
504
615
|
if (tgtUnits === NOTATION.FASTA)
|
|
505
|
-
return function (
|
|
616
|
+
return function (srcSeq) { return srcSh.convertToFasta(srcSeq); };
|
|
506
617
|
if (tgtUnits === NOTATION.HELM)
|
|
507
|
-
return function (
|
|
618
|
+
return function (srcSeq) { return srcSh.convertToHelm(srcSeq); };
|
|
508
619
|
else if (tgtUnits === NOTATION.SEPARATOR)
|
|
509
|
-
return function (
|
|
620
|
+
return function (srcSeq) { return srcSh.convertToSeparator(srcSeq, tgtSeparator); };
|
|
510
621
|
else
|
|
511
622
|
throw new Error();
|
|
512
623
|
}
|
|
513
|
-
constructor(col) {
|
|
514
|
-
this._splitter = null;
|
|
515
|
-
this._splitted = null;
|
|
516
|
-
this._stats = null;
|
|
517
|
-
this._maxLength = null;
|
|
518
|
-
this._posList = null;
|
|
519
|
-
this._joiner = undefined;
|
|
520
|
-
if (col.type !== DG.TYPE.STRING)
|
|
521
|
-
throw new Error(`Unexpected column type '${col.type}', must be '${DG.TYPE.STRING}'.`);
|
|
522
|
-
this._column = col;
|
|
523
|
-
const units = this._column.getTag(DG.TAGS.UNITS);
|
|
524
|
-
if (units !== null && units !== undefined)
|
|
525
|
-
this._units = units;
|
|
526
|
-
else
|
|
527
|
-
throw new Error('Units are not specified in column');
|
|
528
|
-
this._notation = this.getNotation();
|
|
529
|
-
this._defaultGapSymbol = (this.isFasta()) ? GapSymbols[NOTATION.FASTA] :
|
|
530
|
-
(this.isHelm()) ? GapSymbols[NOTATION.HELM] :
|
|
531
|
-
GapSymbols[NOTATION.SEPARATOR];
|
|
532
|
-
if (!this.column.tags.has("aligned" /* TAGS.aligned */) || !this.column.tags.has("alphabet" /* TAGS.alphabet */) ||
|
|
533
|
-
(!this.column.tags.has(".alphabetIsMultichar" /* TAGS.alphabetIsMultichar */) && !this.isHelm() && this.alphabet === "UN" /* ALPHABET.UN */)) {
|
|
534
|
-
// The following detectors and setters are to be called because the column is likely
|
|
535
|
-
// as the UnitsHandler constructor was called on the column.
|
|
536
|
-
if (this.isFasta())
|
|
537
|
-
UnitsHandler.setUnitsToFastaColumn(this);
|
|
538
|
-
else if (this.isSeparator()) {
|
|
539
|
-
const separator = col.getTag("separator" /* TAGS.separator */);
|
|
540
|
-
UnitsHandler.setUnitsToSeparatorColumn(this, separator);
|
|
541
|
-
}
|
|
542
|
-
else if (this.isHelm())
|
|
543
|
-
UnitsHandler.setUnitsToHelmColumn(this);
|
|
544
|
-
else
|
|
545
|
-
throw new Error(`Unexpected units '${this.column.getTag(DG.TAGS.UNITS)}'.`);
|
|
546
|
-
}
|
|
547
|
-
// if (!this.column.tags.has(TAGS.alphabetSize)) {
|
|
548
|
-
// if (this.isHelm())
|
|
549
|
-
// throw new Error(`For column '${this.column.name}' of notation '${this.notation}' ` +
|
|
550
|
-
// `tag '${TAGS.alphabetSize}' is mandatory.`);
|
|
551
|
-
// else if (['UN'].includes(this.alphabet))
|
|
552
|
-
// throw new Error(`For column '${this.column.name}' of alphabet '${this.alphabet}' ` +
|
|
553
|
-
// `tag '${TAGS.alphabetSize}' is mandatory.`);
|
|
554
|
-
// }
|
|
555
|
-
if (!this.column.tags.has(".alphabetIsMultichar" /* TAGS.alphabetIsMultichar */)) {
|
|
556
|
-
if (this.isHelm())
|
|
557
|
-
this.column.setTag(".alphabetIsMultichar" /* TAGS.alphabetIsMultichar */, 'true');
|
|
558
|
-
else if (['UN'].includes(this.alphabet)) {
|
|
559
|
-
throw new Error(`For column '${this.column.name}' of alphabet '${this.alphabet}' ` +
|
|
560
|
-
`tag '${".alphabetIsMultichar" /* TAGS.alphabetIsMultichar */}' is mandatory.`);
|
|
561
|
-
}
|
|
562
|
-
}
|
|
563
|
-
}
|
|
564
624
|
/** Gets a column's SeqHandler object from the temp slot, or creates a new one and stores it in the temp slot. */
|
|
565
|
-
static
|
|
625
|
+
static forColumn(col) {
|
|
626
|
+
// TODO: Invalidate col.temp[Temps.uh] checking column's metadata
|
|
566
627
|
let res = col.temp[Temps.uh];
|
|
567
|
-
if (!res)
|
|
568
|
-
res = col.temp[Temps.uh] = new
|
|
628
|
+
if (!res || res.columnVersion !== col.version)
|
|
629
|
+
res = col.temp[Temps.uh] = new SeqHandler(col);
|
|
569
630
|
return res;
|
|
570
631
|
}
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
632
|
+
// -- joiners & converters --
|
|
633
|
+
joinToFasta(seqS, isHelm) {
|
|
634
|
+
const resMList = new Array(seqS.length);
|
|
635
|
+
for (let posIdx = 0; posIdx < seqS.length; ++posIdx) {
|
|
636
|
+
const cm = seqS.getOriginal(posIdx);
|
|
637
|
+
let om = seqS.getOriginal(posIdx);
|
|
638
|
+
if (isHelm)
|
|
639
|
+
om = om.replace(HELM_WRAPPERS_REGEXP, '$1');
|
|
640
|
+
if (cm === GAP_SYMBOL)
|
|
641
|
+
om = GapOriginals[NOTATION.FASTA];
|
|
642
|
+
else if (cm === PHOSPHATE_SYMBOL)
|
|
643
|
+
om = '';
|
|
644
|
+
else if (om.length > 1)
|
|
645
|
+
om = '[' + om + ']';
|
|
646
|
+
resMList[posIdx] = om;
|
|
647
|
+
}
|
|
648
|
+
return resMList.join('');
|
|
649
|
+
}
|
|
650
|
+
convertToFasta(src) {
|
|
651
|
+
const srcUhSplitter = this.splitter;
|
|
652
|
+
const srcSS = this.isHelm() ? this.splitterAsHelmNucl(src) : srcUhSplitter(src);
|
|
653
|
+
return this.joinToFasta(srcSS, this.isHelm());
|
|
654
|
+
}
|
|
655
|
+
convertToSeparator(src, tgtSeparator) {
|
|
656
|
+
const srcSS = this.isHelm() ? this.splitterAsHelmNucl(src) : this.splitter(src);
|
|
657
|
+
return joinToSeparator(srcSS, tgtSeparator, this.isHelm());
|
|
658
|
+
}
|
|
659
|
+
convertToHelm(src) {
|
|
660
|
+
const wrappers = this.getHelmWrappers();
|
|
661
|
+
const isDnaOrRna = src.startsWith('DNA') || src.startsWith('RNA');
|
|
662
|
+
const srcSS = this.splitter(src);
|
|
663
|
+
return joinToHelm(srcSS, wrappers, isDnaOrRna);
|
|
664
|
+
}
|
|
665
|
+
/** Splits Helm sequence adjusting nucleotides to single char symbols. (!) Removes lone phosphorus. */
|
|
666
|
+
splitterAsHelmNucl(src) {
|
|
667
|
+
const srcMList = this.splitter(src);
|
|
668
|
+
const tgtMList = new Array(srcMList.length);
|
|
669
|
+
const isDna = src.startsWith('DNA');
|
|
670
|
+
const isRna = src.startsWith('RNA');
|
|
671
|
+
for (let posIdx = 0; posIdx < srcMList.length; ++posIdx) {
|
|
672
|
+
let om = srcMList.getOriginal(posIdx);
|
|
673
|
+
if (isDna || isRna) {
|
|
674
|
+
om = om.replace(HELM_WRAPPERS_REGEXP, '$1');
|
|
675
|
+
om = om === PHOSPHATE_SYMBOL ? null : om;
|
|
676
|
+
}
|
|
677
|
+
tgtMList[posIdx] = om ? om : null;
|
|
678
|
+
}
|
|
679
|
+
return new StringListSeqSplitted(tgtMList.filter((om) => !!om), GapOriginals[NOTATION.HELM]);
|
|
597
680
|
}
|
|
598
|
-
return resMList.map((m) => m ?? '').join(tgtSeparator);
|
|
599
681
|
}
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
if (
|
|
609
|
-
|
|
610
|
-
else if (
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
}).toArray();
|
|
616
|
-
return `${prefix}${resMList.join('.')}${postfix}`;
|
|
617
|
-
}
|
|
618
|
-
function convertToHelm(srcUh, src) {
|
|
619
|
-
const isDnaOrRna = src.startsWith('DNA') || src.startsWith('RNA');
|
|
620
|
-
const srcS = srcUh.getSplitter()(src);
|
|
621
|
-
return joinToHelm(srcUh, srcS, isDnaOrRna);
|
|
682
|
+
// -- joiners --
|
|
683
|
+
/**
 * Joins a splitted sequence into separator notation.
 *
 * Gaps become the separator-notation gap symbol and phosphates become empty strings.
 *
 * @param {*} seqS Splitted sequence exposing `length`, `getCanonical(i)` and `getOriginal(i)`
 * @param {string} tgtSeparator Separator placed between monomers
 * @param {boolean} isHelm When true, HELM wrappers are stripped from each monomer first
 * @return {string} Separator-notation text
 */
function joinToSeparator(seqS, tgtSeparator, isHelm) {
    const toTarget = (posIdx) => {
        const canonical = seqS.getCanonical(posIdx);
        const original = seqS.getOriginal(posIdx);
        const unwrapped = isHelm ? original.replace(HELM_WRAPPERS_REGEXP, '$1') : original;
        if (canonical === GAP_SYMBOL)
            return GapOriginals[NOTATION.SEPARATOR];
        if (canonical === PHOSPHATE_SYMBOL)
            return '';
        return unwrapped;
    };
    const monomers = [];
    for (let posIdx = 0; posIdx < seqS.length; ++posIdx)
        monomers.push(toTarget(posIdx));
    return monomers.join(tgtSeparator);
}
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
const
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
698
|
+
function joinToHelm(srcSS, wrappers, isDnaOrRna) {
|
|
699
|
+
const [prefix, leftWrapper, rightWrapper, postfix] = wrappers;
|
|
700
|
+
const resOMList = new Array(srcSS.length);
|
|
701
|
+
for (let posIdx = 0; posIdx < srcSS.length; ++posIdx) {
|
|
702
|
+
const cm = srcSS.getCanonical(posIdx);
|
|
703
|
+
let om = srcSS.getOriginal(posIdx);
|
|
704
|
+
if (cm === GAP_SYMBOL)
|
|
705
|
+
om = GapOriginals[NOTATION.HELM];
|
|
706
|
+
else {
|
|
707
|
+
if (isDnaOrRna)
|
|
708
|
+
om = om.replace(HELM_WRAPPERS_REGEXP, '$1');
|
|
709
|
+
om = om.length === 1 ? `${leftWrapper}${om}${rightWrapper}` : `${leftWrapper}[${om}]${rightWrapper}`;
|
|
634
710
|
}
|
|
635
|
-
|
|
711
|
+
resOMList[posIdx] = om;
|
|
636
712
|
}
|
|
637
|
-
return
|
|
713
|
+
return `${prefix}${resOMList.join('.')}${postfix}`;
|
|
638
714
|
}
|
|
639
|
-
//# sourceMappingURL=
|
|
715
|
+
//# sourceMappingURL=seq-handler.js.map
|