@datagrok/bio 2.15.13 → 2.16.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/CHANGELOG.md +25 -0
  2. package/detectors.js +16 -11
  3. package/dist/455.js.map +1 -1
  4. package/dist/980.js +1 -1
  5. package/dist/980.js.map +1 -1
  6. package/dist/package-test.js +6 -6
  7. package/dist/package-test.js.map +1 -1
  8. package/dist/package.js +3 -3
  9. package/dist/package.js.map +1 -1
  10. package/package.json +14 -14
  11. package/src/analysis/sequence-activity-cliffs.ts +9 -8
  12. package/src/analysis/sequence-diversity-viewer.ts +6 -4
  13. package/src/analysis/sequence-similarity-viewer.ts +9 -6
  14. package/src/analysis/sequence-space.ts +3 -2
  15. package/src/calculations/monomerLevelMols.ts +4 -5
  16. package/src/demo/bio01-similarity-diversity.ts +4 -1
  17. package/src/package-test.ts +1 -1
  18. package/src/package-types.ts +34 -2
  19. package/src/package.ts +60 -76
  20. package/src/substructure-search/substructure-search.ts +15 -9
  21. package/src/tests/WebLogo-layout-tests.ts +1 -1
  22. package/src/tests/WebLogo-positions-test.ts +11 -5
  23. package/src/tests/WebLogo-project-tests.ts +1 -1
  24. package/src/tests/activity-cliffs-utils.ts +11 -14
  25. package/src/tests/bio-tests.ts +85 -79
  26. package/src/tests/checkInputColumn-tests.ts +15 -10
  27. package/src/tests/converters-test.ts +12 -5
  28. package/src/tests/detectors-benchmark-tests.ts +5 -2
  29. package/src/tests/detectors-tests.ts +51 -44
  30. package/src/tests/detectors-weak-and-likely-tests.ts +12 -5
  31. package/src/tests/fasta-export-tests.ts +13 -5
  32. package/src/tests/helm-tests.ts +85 -0
  33. package/src/tests/mm-distance-tests.ts +14 -7
  34. package/src/tests/monomer-libraries-tests.ts +1 -1
  35. package/src/tests/msa-tests.ts +33 -24
  36. package/src/tests/renderers-monomer-placer-tests.ts +2 -5
  37. package/src/tests/renderers-test.ts +15 -9
  38. package/src/tests/scoring.ts +9 -6
  39. package/src/tests/seq-handler-get-helm-tests.ts +7 -5
  40. package/src/tests/seq-handler-get-region-tests.ts +9 -3
  41. package/src/tests/seq-handler-splitted-tests.ts +11 -5
  42. package/src/tests/seq-handler-tests.ts +17 -10
  43. package/src/tests/sequence-space-utils.ts +9 -4
  44. package/src/tests/splitters-test.ts +5 -4
  45. package/src/tests/substructure-filters-tests.ts +22 -23
  46. package/src/tests/to-atomic-level-tests.ts +5 -3
  47. package/src/tests/to-atomic-level-ui-tests.ts +4 -1
  48. package/src/tests/utils/detectors-utils.ts +4 -4
  49. package/src/utils/calculate-scores.ts +11 -9
  50. package/src/utils/cell-renderer-custom.ts +27 -17
  51. package/src/utils/cell-renderer.ts +14 -8
  52. package/src/utils/check-input-column.ts +13 -9
  53. package/src/utils/context-menu.ts +4 -4
  54. package/src/utils/convert.ts +21 -14
  55. package/src/utils/get-region-func-editor.ts +8 -5
  56. package/src/utils/get-region.ts +4 -5
  57. package/src/utils/helm-to-molfile/converter/helm.ts +4 -4
  58. package/src/utils/helm-to-molfile/utils.ts +5 -6
  59. package/src/utils/macromolecule-column-widget.ts +6 -7
  60. package/src/utils/monomer-cell-renderer-base.ts +8 -1
  61. package/src/utils/monomer-lib/lib-manager.ts +3 -2
  62. package/src/utils/monomer-lib/monomer-colors.ts +10 -10
  63. package/src/utils/monomer-lib/monomer-lib-base.ts +6 -1
  64. package/src/utils/monomer-lib/monomer-lib.ts +15 -9
  65. package/src/utils/multiple-sequence-alignment-ui.ts +30 -30
  66. package/src/utils/save-as-fasta.ts +19 -12
  67. package/src/utils/seq-helper/seq-handler.ts +836 -0
  68. package/src/utils/seq-helper/seq-helper.ts +43 -19
  69. package/src/utils/sequence-to-mol.ts +7 -8
  70. package/src/utils/split-to-monomers.ts +7 -2
  71. package/src/utils/types.ts +8 -7
  72. package/src/utils/ui-utils.ts +2 -2
  73. package/src/viewers/web-logo-viewer.ts +18 -16
  74. package/src/widgets/bio-substructure-filter-helm.ts +5 -2
  75. package/src/widgets/bio-substructure-filter.ts +14 -24
  76. package/src/widgets/composition-analysis-widget.ts +6 -6
  77. package/src/widgets/representations.ts +7 -4
  78. package/src/tests/detectors-custom-notation-tests.ts +0 -37
  79. package/src/utils/cyclized.ts +0 -89
  80. package/src/utils/dimerized.ts +0 -10
@@ -0,0 +1,836 @@
1
+ import * as DG from 'datagrok-api/dg';
2
+
3
+ import wu from 'wu';
4
+
5
+ /* eslint-disable max-len */
6
+ import {ALIGNMENT, ALPHABET, candidateAlphabets, getSplitterWithSeparator, NOTATION, positionSeparator, splitterAsFasta, splitterAsHelm, TAGS} from '@datagrok-libraries/bio/src/utils/macromolecule/index';
7
+ import {INotationProvider, ISeqSplitted, SeqColStats, SplitterFunc,} from '@datagrok-libraries/bio/src/utils/macromolecule/types';
8
+ import {detectAlphabet, splitterAsFastaSimple, StringListSeqSplitted} from '@datagrok-libraries/bio/src/utils/macromolecule/utils';
9
+ import {mmDistanceFunctions, MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
10
+ import {mmDistanceFunctionType} from '@datagrok-libraries/ml/src/macromolecule-distance-functions/types';
11
+ import {getMonomerLibHelper, IMonomerLibHelper} from '@datagrok-libraries/bio/src/monomer-works/monomer-utils';
12
+ import {HELM_POLYMER_TYPE, HELM_WRAPPERS_REGEXP, PHOSPHATE_SYMBOL} from '@datagrok-libraries/bio/src/utils/const';
13
+ import {GAP_SYMBOL, GapOriginals} from '@datagrok-libraries/bio/src/utils/macromolecule/consts';
14
+ import {CellRendererBackBase, GridCellRendererTemp} from '@datagrok-libraries/bio/src/utils/cell-renderer-back-base';
15
+ import {HelmTypes} from '@datagrok-libraries/bio/src/helm/consts';
16
+ import {HelmType} from '@datagrok-libraries/bio/src/helm/types';
17
+ import {ISeqHandler, ConvertFunc, JoinerFunc, SeqTemps} from '@datagrok-libraries/bio/src/utils/macromolecule/seq-handler';
18
+
19
+ import {SeqHelper} from './seq-helper';
20
+
21
+ /* eslint-enable max-len */
22
+
23
+ /** Class for handling notation units in Macromolecule columns and
24
+ * conversion of notation systems in Macromolecule columns
25
+ */
26
+ export class SeqHandler implements ISeqHandler {
27
+ protected readonly _column: DG.Column; // the column to be converted
28
+ protected readonly _units: string; // units, of the form fasta, separator
29
+ protected readonly _notation: NOTATION; // current notation (without :SEQ:NT, etc.)
30
+ protected readonly _defaultGapOriginal: string;
31
+ protected readonly notationProvider: INotationProvider | null = null;
32
+
33
+ private _splitter: SplitterFunc | null = null;
34
+
35
/**
 * Wraps a string Macromolecule column.
 *
 * Validates the column type and units, derives the notation and the default gap symbol,
 * asks the seq helper to annotate the column when mandatory tags are missing, and picks up
 * a notation provider for custom notation.
 *
 * @param {DG.Column<string>} col Column to handle; must be of string type with units set
 * @param {SeqHelper} seqHelper Helper used to set notation-specific tags on the column
 * @throws {Error} On a non-string column, missing or unexpected units, or a missing
 *   alphabetIsMultichar tag on a column of alphabet 'UN'
 */
protected constructor(col: DG.Column<string>,
  private readonly seqHelper: SeqHelper,
) {
  if (col.type !== DG.TYPE.STRING)
    throw new Error(`Unexpected column type '${col.type}', must be '${DG.TYPE.STRING}'.`);
  this._column = col;

  const units = this._column.meta.units;
  if (units === null || units === undefined)
    throw new Error('Units are not specified in column');
  this._units = units;
  this._notation = this.getNotation();

  // Anything that is neither FASTA nor HELM gets the separator gap symbol.
  if (this.isFasta())
    this._defaultGapOriginal = GapOriginals[NOTATION.FASTA];
  else if (this.isHelm())
    this._defaultGapOriginal = GapOriginals[NOTATION.HELM];
  else
    this._defaultGapOriginal = GapOriginals[NOTATION.SEPARATOR];

  // Kept as one short-circuit expression: the alphabet getter may throw and must only be
  // read once both the aligned and alphabet tags are known to be present.
  const tagsIncomplete = !this.column.tags.has(TAGS.aligned) || !this.column.tags.has(TAGS.alphabet) ||
    (!this.column.tags.has(TAGS.alphabetIsMultichar) && !this.isHelm() && this.alphabet === ALPHABET.UN);
  if (tagsIncomplete) {
    // Mandatory tags are missing, so let the seq helper run the notation-specific setter.
    // NOTE(review): presumably the column was only detected as a "likely" Macromolecule and
    // never fully annotated - confirm against the detector.
    if (this.isFasta())
      this.seqHelper.setUnitsToFastaColumn(this);
    else if (this.isSeparator()) {
      const separator = col.getTag(TAGS.separator);
      this.seqHelper.setUnitsToSeparatorColumn(this, separator);
    } else if (this.isHelm())
      this.seqHelper.setUnitsToHelmColumn(this);
    else
      throw new Error(`Unexpected units '${this.column.meta.units}'.`);
  }

  // A check making the alphabetSize tag mandatory used to live here; it is disabled.

  if (!this.column.tags.has(TAGS.alphabetIsMultichar)) {
    if (this.isHelm())
      this.column.setTag(TAGS.alphabetIsMultichar, 'true');
    else if (this.alphabet === 'UN') {
      throw new Error(`For column '${this.column.name}' of alphabet '${this.alphabet}' ` +
        `tag '${TAGS.alphabetIsMultichar}' is mandatory.`);
    }
  }

  // For custom notation the provider is expected in column.temp, set at the detector stage.
  if (this.column.meta.units === NOTATION.CUSTOM)
    this.notationProvider = this.column.temp[SeqTemps.notationProvider] ?? null;

  this.columnVersion = this.column.version;
}
91
+
92
/**
 * From detectMacromolecule.
 * Fills in missing aligned / alphabet tags (and, for alphabet 'UN', the alphabetSize and
 * alphabetIsMultichar tags) on a FASTA or SEPARATOR column from its monomer statistics.
 * Columns with other units are left untouched.
 *
 * @param {SeqHandler} uh Handler of the column to annotate
 * @throws {Error} When the alphabet is neither annotated nor derivable (no monomers)
 */
public static setTags(uh: SeqHandler): void {
  const units = uh.column.meta.units as NOTATION;
  if (units !== NOTATION.FASTA && units !== NOTATION.SEPARATOR)
    return;

  const annotatedAlphabet = uh.column.getTag(TAGS.alphabet);
  // Empty monomer alphabet is allowed, only if alphabet tag is annotated
  if (!annotatedAlphabet && Object.keys(uh.stats.freq).length === 0)
    throw new Error('Alphabet is empty and not annotated.');

  if (uh.column.getTag(TAGS.aligned) === null) {
    const detectedAligned = uh.stats.sameLength ? ALIGNMENT.SEQ_MSA : ALIGNMENT.SEQ;
    uh.column.setTag(TAGS.aligned, detectedAligned);
  }

  let alphabet = annotatedAlphabet;
  if (alphabet === null) {
    alphabet = detectAlphabet(uh.stats.freq, candidateAlphabets);
    uh.column.setTag(TAGS.alphabet, alphabet);
  }

  if (alphabet === ALPHABET.UN) {
    // An unknown alphabet needs its size and multichar flag stored explicitly.
    const monomers = Object.keys(uh.stats.freq);
    const hasMulticharMonomer = monomers.some((m) => m.length > 1);
    uh.column.setTag(TAGS.alphabetSize, monomers.length.toString());
    uh.column.setTag(TAGS.alphabetIsMultichar, hasMulticharMonomer ? 'true' : 'false');
  }
}
120
+
121
/** The wrapped column. */
get column(): DG.Column {
  return this._column;
}

/** Number of rows in the wrapped column. */
public get length(): number {
  return this._column.length;
}

/** Units string read from the column when the handler was constructed. */
public get units(): string {
  return this._units;
}

/** Notation derived from the units when the handler was constructed. */
public get notation(): NOTATION {
  return this._notation;
}

/** Gap symbol chosen for the column's notation when the handler was constructed. */
public get defaultGapOriginal(): string {
  return this._defaultGapOriginal;
}
130
+
131
/**
 * Separator tag of the column, or undefined when the tag is not set.
 * @throws {Error} When the notation is SEPARATOR and the tag is missing
 */
public get separator(): string | undefined {
  const tagValue = this.column.getTag(TAGS.separator);
  if (tagValue !== null && tagValue !== undefined)
    return tagValue;

  if (this.notation === NOTATION.SEPARATOR)
    throw new Error(`Separator is mandatory for column '${this.column.name}' of notation '${this.notation}'.`);
  return undefined;
}
137
+
138
/**
 * Value of the aligned tag.
 * @throws {Error} When the tag is empty on a FASTA or SEPARATOR column, where it is mandatory
 */
public get aligned(): string {
  const tagValue = this.column.getTag(TAGS.aligned);
  if (tagValue)
    return tagValue;

  // TAGS.aligned is mandatory for columns of NOTATION.FASTA and NOTATION.SEPARATOR
  if (this.isFasta() || this.isSeparator())
    throw new Error('Tag aligned not set');
  return tagValue;
}
147
+
148
/**
 * Alphabet name (upper case), read from the alphabet tag.
 * @throws {Error} When the tag is empty on a FASTA or SEPARATOR column, where it is mandatory
 */
public get alphabet(): string {
  const tagValue = this.column.getTag(TAGS.alphabet);
  if (tagValue)
    return tagValue;

  // TAGS.alphabet is mandatory for columns of NOTATION.FASTA and NOTATION.SEPARATOR
  if (this.isFasta() || this.isSeparator())
    throw new Error('Tag alphabet not set');
  return tagValue;
}
158
+
159
/** HELM type used by default: NUCLEOTIDE for the RNA and DNA alphabets, AA for anything else. */
public get defaultBiotype(): HelmType {
  const alphabet = this.alphabet;
  const isNucleotide = alphabet === ALPHABET.RNA || alphabet === ALPHABET.DNA;
  return isNucleotide ? HelmTypes.NUCLEOTIDE : HelmTypes.AA;
}
162
+
163
/** Raw value of the isHelmCompatible tag, exactly as stored on the column. */
protected get helmCompatible(): string | undefined {
  const tagValue = this.column.getTag(TAGS.isHelmCompatible);
  return tagValue;
}
166
+
167
/**
 * Number of distinct monomers in the column's alphabet.
 *
 * For HELM notation and for alphabet 'UN' the size comes from the alphabetSize tag; when the
 * tag is missing or is not a valid decimal number it is computed from the column statistics.
 * For known alphabets a fixed size is returned (PT: 20, DNA/RNA: 4).
 *
 * @return {number} Alphabet size
 * @throws {Error} When the alphabet is not one of the known ones
 */
public getAlphabetSize(): number {
  if (this.notation === NOTATION.HELM || this.alphabet === ALPHABET.UN) {
    const alphabetSizeStr = this.column.getTag(TAGS.alphabetSize);
    if (alphabetSizeStr) {
      // Explicit radix; a malformed tag must not leak NaN to callers.
      const taggedSize = parseInt(alphabetSizeStr, 10);
      if (!Number.isNaN(taggedSize))
        return taggedSize;
    }
    // calculate alphabetSize on demand
    return Object.keys(this.stats.freq).length;
  }

  switch (this.alphabet) {
  case ALPHABET.PT:
    return 20;
  case ALPHABET.DNA:
  case ALPHABET.RNA:
    return 4;
  case 'NT':
    console.warn(`Unexpected alphabet 'NT'.`);
    return 4;
  default:
    throw new Error(`Unexpected alphabet '${this.alphabet}'.`);
  }
}
194
+
195
/**
 * Tells whether monomers of the column can be longer than one character.
 * HELM is always multichar, known alphabets never are, and for alphabet 'UN' the
 * alphabetIsMultichar tag decides.
 */
public getAlphabetIsMultichar(): boolean {
  if (this.notation === NOTATION.HELM)
    return true;
  if (this.alphabet !== ALPHABET.UN)
    return false;
  const multicharTag = this.column.getTag(TAGS.alphabetIsMultichar);
  return multicharTag === 'true';
}
203
+
204
+ private cached: boolean = true;
205
+ private _splitted: WeakRef<ISeqSplitted>[] | null = null;
206
+ private columnVersion: number | null = null;
207
+ // /** */
208
+ // public get splitted(): ISeqSplitted[] {
209
+ // // TODO: Disable cache or invalidate on changing data
210
+ // if (this._splitted === null) {
211
+ // const splitter = this.splitter;
212
+ // const colLength: number = this._column.length;
213
+ // this._splitted = new Array(colLength);
214
+ // const catIdxList = this._column.getRawData();
215
+ // const catList: string[] = this._column.categories;
216
+ // for (let rowIdx: number = 0; rowIdx < colLength; rowIdx++) {
217
+ // const seq: string = catList[catIdxList[rowIdx]];
218
+ // this._splitted[rowIdx] = splitter(seq);
219
+ // }
220
+ // }
221
+ // return this._splitted;
222
+ // }
223
/**
 * Splits the sequence of a row into monomers.
 *
 * Results are kept per row behind weak references and dropped as a whole when the column
 * version changes. The cache is bypassed when caching is off or a limit is given.
 *
 * @param {number} rowIdx Row index
 * @param {number} limit Optional limit passed on to the splitter factory
 * @return {ISeqSplitted} Splitted sequence
 */
public getSplitted(rowIdx: number, limit?: number): ISeqSplitted {
  if (!this.cached || limit !== undefined)
    return this.getSplitter(limit)(this.column.get(rowIdx));

  let rowCache = this._splitted;
  if (this.column.version !== this.columnVersion || rowCache === null) {
    // The column data changed (or nothing is cached yet): start with an empty cache.
    this.columnVersion = this.column.version;
    rowCache = new Array<WeakRef<ISeqSplitted>>(this.column.length);
    this._splitted = rowCache;
  }

  const cachedSS: ISeqSplitted | undefined = rowCache[rowIdx] ? rowCache[rowIdx].deref() : undefined;
  if (cachedSS)
    return cachedSS;

  // Never split before, or the weakly held value has been collected.
  const freshSS = this.splitter(this.column.get(rowIdx));
  rowCache[rowIdx] = new WeakRef(freshSS);
  return freshSS;
}
242
+
243
/**
 * Any Macromolecule can be represented on Helm format. The reverse is not always possible.
 *
 * Delegates to the notation provider when the column has one; otherwise converts the value
 * with convertToHelm and wraps it as a Macromolecule semantic value of HELM units.
 *
 * @param {number} rowIdx Row index
 * @param {any} options Passed through to the notation provider
 */
public async getHelm(rowIdx: number, options?: any): Promise<DG.SemanticValue<string>> {
  const seq: string = this.column.get(rowIdx);
  if (this.notationProvider)
    return await this.notationProvider.getHelm(seq, options);

  const helm = this.convertToHelm(seq);
  // TODO: set tags from column
  return DG.SemanticValue.fromValueType(helm, DG.SEMTYPE.MACROMOLECULE, NOTATION.HELM);
}
257
+
258
private _stats: SeqColStats | null = null;

/**
 * Monomer statistics of the column, computed on first access and then reused:
 * `freq` maps each canonical monomer to its number of occurrences, and `sameLength` tells
 * whether all sequences have the same number of monomers.
 */
public get stats(): SeqColStats {
  if (this._stats === null) {
    const freq: { [m: string]: number } = {};
    let sameLength = true;
    let firstLength: number | null = null;

    const colLen = this.column.length;
    for (let rowIdx = 0; rowIdx < colLen; ++rowIdx) {
      const mSeq: ISeqSplitted = this.getSplitted(rowIdx);
      if (firstLength === null)
        firstLength = mSeq.length;
      else if (mSeq.length !== firstLength)
        sameLength = false;

      for (let posIdx = 0; posIdx < mSeq.length; ++posIdx) {
        const cm = mSeq.getCanonical(posIdx);
        // Own-property check: the `in` operator also matches inherited keys such as
        // 'constructor' or 'toString', which would corrupt the counter of such a monomer.
        const seenBefore = Object.prototype.hasOwnProperty.call(freq, cm);
        freq[cm] = (seenBefore ? freq[cm] : 0) + 1;
      }
    }
    this._stats = {freq: freq, sameLength: sameLength};
  }
  return this._stats;
}
285
+
286
private _maxLength: number | null = null;

/**
 * Length, in monomers, of the longest sequence in the column (0 for an empty column).
 * Computed on first access and then reused.
 */
public get maxLength(): number {
  if (this._maxLength === null) {
    // Plain loop instead of Math.max(...lengths): spreading one argument per row can
    // exceed the engine's argument limit and throw a RangeError on large columns.
    let longest = 0;
    const rowCount = this.column.length;
    for (let rowIdx = 0; rowIdx < rowCount; ++rowIdx) {
      const len = this.getSplitted(rowIdx).length;
      if (len > longest)
        longest = len;
    }
    this._maxLength = longest;
  }
  return this._maxLength;
}
294
+
295
private _posList: string[] | null = null;

/**
 * Position names of the column: taken from the positionNames tag when it is set, otherwise
 * '1', '2', ... up to the maximum sequence length. Computed once and then reused.
 */
public get posList(): string[] {
  if (this._posList !== null)
    return this._posList;

  const posListTxt = this.column.getTag(TAGS.positionNames);
  let names: string[];
  if (posListTxt)
    names = posListTxt.split(positionSeparator).map((p) => p.trim());
  else
    names = wu.count(1).take(this.maxLength).map((pos) => pos.toString()).toArray();

  this._posList = names;
  return names;
}
304
+
305
/** True when the column notation is FASTA. */
public isFasta(): boolean {
  return this.notation === NOTATION.FASTA;
}

/** True when the column notation is SEPARATOR or the column carries a non-empty separator tag. */
public isSeparator(): boolean {
  return this.notation === NOTATION.SEPARATOR || !!this.separator;
}

/** True when the column notation is HELM. */
public isHelm(): boolean {
  return this.notation === NOTATION.HELM;
}

/** True when the column alphabet is RNA. */
public isRna(): boolean {
  return this.alphabet === ALPHABET.RNA;
}

/** True when the column alphabet is DNA. */
public isDna(): boolean {
  return this.alphabet === ALPHABET.DNA;
}

/** True when the column alphabet is PT (peptide). */
public isPeptide(): boolean {
  return this.alphabet === ALPHABET.PT;
}

/** True when the aligned tag is set and contains 'MSA' (case-insensitive). */
public isMsa(): boolean {
  if (!this.aligned)
    return false;
  return this.aligned.toUpperCase().includes('MSA');
}

/** True when the isHelmCompatible tag is exactly 'true'. */
public isHelmCompatible(): boolean {
  return this.helmCompatible === 'true';
}
320
+
321
/** Checks {@link om} for being a gap: an empty value or the column's default gap symbol.
 * @param {string} om Original monomer of sequence symbol
 * @return {boolean}
 */
public isGap(om: string): boolean {
  if (!om)
    return true;
  return om === this._defaultGapOriginal;
}
328
+
329
/**
 * Associates the units of the column with a notation: the first of FASTA, SEPARATOR, HELM,
 * CUSTOM that the lower-cased units string starts with.
 *
 * @return {NOTATION} Notation associated with the units type
 * @throws {Error} When the units match none of the notations
 */
protected getNotation(): NOTATION {
  const unitsLc = this.units.toLowerCase();
  const candidates: NOTATION[] = [NOTATION.FASTA, NOTATION.SEPARATOR, NOTATION.HELM, NOTATION.CUSTOM];
  for (const candidate of candidates) {
    if (unitsLc.startsWith(candidate))
      return candidate;
  }
  throw new Error(`Column '${this.column.name}' has unexpected notation '${this.units}'.`);
}
345
+
346
+
347
/**
 * Get the wrapper strings for HELM, depending on the type of the
 * macromolecule (peptide, DNA, RNA).
 *
 * The polymer prefix is 'RNA1{' for DNA, RNA and helm-compatible columns and 'PEPTIDE1{'
 * otherwise; monomers are wrapped as d(..)p for DNA and r(..)p for RNA.
 *
 * @return {string[]} Array of wrappers: [prefix, leftWrapper, rightWrapper, postfix]
 */
public getHelmWrappers(): string[] {
  const dna = this.isDna();
  const rna = this.isRna();

  const prefix = (dna || rna || this.isHelmCompatible()) ? 'RNA1{' : 'PEPTIDE1{';
  const postfix = '}$$$$';

  let leftWrapper = '';
  if (dna)
    leftWrapper = 'd(';
  else if (rna)
    leftWrapper = 'r(';
  const rightWrapper = (dna || rna) ? ')p' : '';

  return [prefix, leftWrapper, rightWrapper, postfix];
}
363
+
364
/**
 * Create a new Macromolecule column of the specified notation type and the same
 * length as the source column, copying the aligned / alphabet tags from it.
 *
 * @param {NOTATION} tgtNotation Notation (units) of the new column
 * @param {string} tgtSeparator Separator, mandatory when tgtNotation is SEPARATOR
 * @param {string} colName Name of the new column; by default '<notation>(<source name>)',
 *   made unique within the source data frame when there is one
 * @param {string[]} data Values of the new column; empty strings by default
 * @return {DG.Column}
 * @throws {Error} When tgtNotation is SEPARATOR and no separator is given
 */
protected getNewColumn(
  tgtNotation: NOTATION, tgtSeparator?: string, colName?: string, data?: string[]
): DG.Column<string> {
  const col = this.column;
  const name = tgtNotation.toLowerCase() + '(' + col.name + ')';
  const newColName = colName ?? col.dataFrame?.columns.getUnusedName(name) ?? name;
  const newColumn = DG.Column.fromList('string', newColName, data ?? new Array(this.column.length).fill(''));
  newColumn.semType = DG.SEMTYPE.MACROMOLECULE;
  newColumn.meta.units = tgtNotation;
  if (tgtNotation === NOTATION.SEPARATOR) {
    if (!tgtSeparator) throw new Error(`Notation '${NOTATION.SEPARATOR}' requires separator value.`);
    newColumn.setTag(TAGS.separator, tgtSeparator);
  }
  newColumn.setTag(DG.TAGS.CELL_RENDERER, tgtNotation === NOTATION.HELM ? 'helm' : 'sequence'); // cell.renderer

  const srcAligned = col.getTag(TAGS.aligned);
  if (srcAligned)
    newColumn.setTag(TAGS.aligned, srcAligned);

  let srcAlphabet = col.getTag(TAGS.alphabet);
  // A HELM source without an alphabet tag converts to a column of unknown alphabet.
  if (!srcAlphabet && this.notation === NOTATION.HELM && tgtNotation !== NOTATION.HELM)
    srcAlphabet = ALPHABET.UN;
  if (srcAlphabet != null)
    newColumn.setTag(TAGS.alphabet, srcAlphabet);

  let srcAlphabetSize: string = col.getTag(TAGS.alphabetSize);
  if (srcAlphabet != null && srcAlphabetSize)
    newColumn.setTag(TAGS.alphabetSize, srcAlphabetSize);

  const srcAlphabetIsMultichar: string = col.getTag(TAGS.alphabetIsMultichar);
  // `!= null` rather than `!== undefined`: elsewhere in this class a missing tag is compared
  // against null, and a null value must not be written to the new column.
  if (srcAlphabet != null && srcAlphabetIsMultichar != null)
    newColumn.setTag(TAGS.alphabetIsMultichar, srcAlphabetIsMultichar);

  if (tgtNotation === NOTATION.HELM) {
    // A HELM target always gets an explicit alphabet size.
    srcAlphabetSize = this.getAlphabetSize().toString();
    newColumn.setTag(TAGS.alphabetSize, srcAlphabetSize);
  }

  return newColumn;
}
411
+
412
/**
 * Creates a new column on data of {@link seqList} with the same tags:
 * same notation and separator as this column.
 *
 * @param {string} name Name of the new column
 * @param {string[]} seqList Values of the new column
 */
public getNewColumnFromList(name: string, seqList: string[]): DG.Column<string> {
  const notation = this.notation;
  const separator = this.separator;
  return this.getNewColumn(notation, separator, name, seqList);
}
416
+
417
/**
 * A helper function checking the validity of the 'units' string: it must start
 * (case-insensitively) with one of the FASTA, SEPARATOR or HELM notation names.
 *
 * @param {string} units the string to be validated
 * @return {boolean}
 */
public static unitsStringIsValid(units: string): boolean {
  const unitsLc = units.toLowerCase();
  for (const prefix of [NOTATION.FASTA, NOTATION.SEPARATOR, NOTATION.HELM]) {
    if (unitsLc.startsWith(prefix.toLowerCase()))
      return true;
  }
  return false;
}
431
+
432
/**
 * Construct a new column of semantic type MACROMOLECULE from the list of
 * specified parameters. The column is filled with empty strings.
 *
 * @param {number} len the length of the new column
 * @param {string} name the name of the new column
 * @param {string} units the units of the new column
 * @return {DG.Column}
 * @throws {Error} When units do not pass unitsStringIsValid
 */
public static getNewColumnFromParams(
  len: number,
  name: string,
  units: string
): DG.Column {
  // WARNING: in this implementation it is impossible to verify the uniqueness
  // of the new column's name
  const unitsAreValid = SeqHandler.unitsStringIsValid(units);
  if (!unitsAreValid)
    throw new Error('Invalid format of \'units\' parameter');

  const emptyValues = new Array(len).fill('');
  const newColumn = DG.Column.fromList('string', name, emptyValues);
  newColumn.semType = DG.SEMTYPE.MACROMOLECULE;
  newColumn.meta.units = units;
  return newColumn;
}
456
+
457
/**
 * Gets function to split seq value to monomers.
 *
 * A splitter supplied by the notation provider wins; otherwise the choice follows the units
 * prefix (fasta, separator, helm). The limit is only passed on to the separator splitter.
 *
 * @param {number} limit Optional limit for the separator splitter
 * @throws {Error} When the units match none of the supported notations
 */
protected getSplitter(limit?: number): SplitterFunc {
  const providerSplitter: SplitterFunc | null = this.notationProvider ? this.notationProvider.splitter : null;
  if (providerSplitter) return providerSplitter;

  const unitsLc = this.units.toLowerCase();
  if (unitsLc.startsWith(NOTATION.FASTA)) {
    const alphabet: string | null = this.column.getTag(TAGS.alphabet);
    // The simple splitter is only used for an annotated, single-character alphabet.
    const useSimple = alphabet !== null && !this.getAlphabetIsMultichar();
    return useSimple ? splitterAsFastaSimple : splitterAsFasta;
  }
  if (unitsLc.startsWith(NOTATION.SEPARATOR))
    return getSplitterWithSeparator(this.separator!, limit);
  if (unitsLc.startsWith(NOTATION.HELM))
    return splitterAsHelm;

  throw new Error(`Unexpected units ${this.units} .`);
}
478
+
479
/** Splits {@link seq} into monomers with the splitter of this column. */
public split(seq: string): ISeqSplitted {
  const splitted = this.splitter(seq);
  return splitted;
}
482
+
483
/**
 * Name of the distance function suited to the column: Hamming for MSA columns,
 * Levenshtein for everything else.
 *
 * @throws {Error} When the column is not of FASTA notation
 */
public getDistanceFunctionName(): MmDistanceFunctionsNames {
  // TODO add support for helm and separator notation
  if (!this.isFasta())
    throw new Error('Only FASTA notation is supported');
  if (this.isMsa())
    return MmDistanceFunctionsNames.HAMMING;

  // Every alphabet currently maps to Levenshtein. DNA and RNA scoring matrices are (mostly)
  // identity matrices, so the fast Levenshtein library applies; peptides and unknown
  // alphabets use it as the default. The alphabet is still read, as before.
  switch (this.alphabet) {
  case ALPHABET.DNA:
  case ALPHABET.RNA:
  case ALPHABET.PT:
  default:
    return MmDistanceFunctionsNames.LEVENSHTEIN;
  }
}
502
+
503
/** Creates the distance function registered under {@link getDistanceFunctionName}. */
public getDistanceFunction(): mmDistanceFunctionType {
  const name = this.getDistanceFunctionName();
  return mmDistanceFunctions[name]();
}
506
+
507
/**
 * Checks if the separator notation is compatible with the helm library: every canonical
 * monomer of the column must be a peptide symbol of the monomer library.
 * The result is stored in the isHelmCompatible tag and reused on later calls.
 *
 * @return {Promise<boolean>} true when all monomers are found among the peptide symbols
 */
public async checkHelmCompatibility(): Promise<boolean> {
  // check first for the column tag to avoid extra processing
  if (this.column.tags.has(TAGS.isHelmCompatible))
    return this.column.getTag(TAGS.isHelmCompatible) === 'true';

  // get the monomer lib and check against the column
  const monomerLibHelper: IMonomerLibHelper = await getMonomerLibHelper();
  const bioLib = monomerLibHelper.getMonomerLib();
  // a set of peptide symbols for faster lookup
  const peptidesSet = new Set(bioLib.getMonomerSymbolsByType(HELM_POLYMER_TYPE.PEPTIDE));

  //TODO maybe add missing threshold so that if there are not too many missing monomers
  // the column is still considered helm compatible
  const checkedCatIdxs: Set<number> = new Set();
  const rowCount = this.column.length;
  const colRawData = this.column.getRawData();
  for (let rowIdx = 0; rowIdx < rowCount; ++rowIdx) {
    const catI = colRawData[rowIdx];
    // Each distinct raw value is checked once. Set.has is required here: the `in` operator
    // tests object properties, not Set membership, so it never finds a seen value.
    if (checkedCatIdxs.has(catI))
      continue;
    checkedCatIdxs.add(catI);

    const seqSS = this.getSplitted(rowIdx);
    for (let posIdx = 0; posIdx < seqSS.length; ++posIdx) {
      const cm = seqSS.getCanonical(posIdx);
      if (!peptidesSet.has(cm)) {
        this.column.setTag(TAGS.isHelmCompatible, 'false');
        return false;
      }
    }
  }
  this.column.setTag(TAGS.isHelmCompatible, 'true');
  return true;
}
545
+
546
+ // -- Notation Converter --
547
+
548
/** Splitter of the column, created on first access (without a limit) and then reused. */
protected get splitter(): SplitterFunc {
  let splitter = this._splitter;
  if (splitter === null) {
    splitter = this.getSplitter();
    this._splitter = splitter;
  }
  return splitter;
}
553
+
554
/** True when {@link targetNotation} is FASTA. */
public toFasta(targetNotation: NOTATION): boolean {
  return targetNotation === NOTATION.FASTA;
}

/** True when {@link targetNotation} is SEPARATOR. */
public toSeparator(targetNotation: NOTATION): boolean {
  return targetNotation === NOTATION.SEPARATOR;
}

/** True when {@link targetNotation} is HELM. */
public toHelm(targetNotation: NOTATION): boolean {
  return targetNotation === NOTATION.HELM;
}
559
+
560
/**
 * Convert HELM string to FASTA/SEPARATOR
 *
 * @param {string} srcSeq A string to be converted
 * @param {string} tgtNotation Target notation: FASTA or SEPARATOR
 * @param {string} tgtSeparator Optional target separator (for HELM -> SEPARATOR);
 *   defaults to '' for FASTA and to the separator of this column otherwise
 * @param {string} tgtGapOriginal Optional target gap symbol; defaults to the gap symbol
 *   of the target notation
 * @return {string} Converted string
 */
public convertHelmToFastaSeparator(
  srcSeq: string, tgtNotation: string, tgtSeparator?: string, tgtGapOriginal?: string
): string {
  const intoFasta = this.toFasta(tgtNotation as NOTATION);

  const gapSymbol: string = tgtGapOriginal ? tgtGapOriginal :
    (intoFasta ? GapOriginals[NOTATION.FASTA] : GapOriginals[NOTATION.SEPARATOR]);
  const glue: string | undefined = tgtSeparator ? tgtSeparator :
    (intoFasta ? '' : this.separator);

  const isNucleotide = srcSeq.startsWith('RNA');
  // items can be monomers or helms
  const helmItems = this.splitter(srcSeq);
  const tgtMonomers: string[] = [];
  for (let posIdx = 0; posIdx < helmItems.length; ++posIdx) {
    const rawOm: string = helmItems.getOriginal(posIdx);
    // Nucleotide items lose whatever HELM_WRAPPERS_REGEXP matches.
    const om = isNucleotide ? rawOm.replace(HELM_WRAPPERS_REGEXP, '') : rawOm;

    let tgtMonomer = om;
    if (om === GapOriginals[NOTATION.HELM])
      tgtMonomer = gapSymbol;
    else if (intoFasta && om.length > 1)
      tgtMonomer = '[' + om + ']'; // the case of a multi-character monomer converted to FASTA
    tgtMonomers.push(tgtMonomer);
  }
  return tgtMonomers.join(glue);
}
601
+
602
/** Dispatcher method for notation conversion: builds a new column of the target notation
 * by splitting every source value and joining it back in the target notation.
 *
 * @param {NOTATION} tgtNotation Notation we want to convert to
 * @param {string} tgtSeparator Possible separator
 * @return {DG.Column} Converted column
 */
public convert(tgtNotation: NOTATION, tgtSeparator?: string): DG.Column<string> {
  // The joiner comes from the source handler (this), which knows about the source sequence.
  // For example, converting DNA Helm to fasta requires removing the r(X)p decoration.
  const joiner: JoinerFunc = this.getJoiner({notation: tgtNotation, separator: tgtSeparator});
  const tgtColumn = this.getNewColumn(tgtNotation, tgtSeparator);
  // fill the newly created empty column row by row
  tgtColumn.init((rowIdx: number) => joiner(this.getSplitted(rowIdx)));
  return tgtColumn;
}
620
+
621
/**
 * Builds a column holding the [startIdx, endIdx] region of every sequence. Sequences
 * shorter than the region are padded with gaps; the positionNames and positionLabels
 * tags are cut to the same region, with '?' for positions missing in the source.
 *
 * @param startIdx Start position index of the region (0-based); null means 0
 * @param endIdx End position index of the region (0-based, inclusive); null means the last
 *   position of the longest sequence
 * @param name Name of the resulting column
 */
public getRegion(startIdx: number | null, endIdx: number | null, name: string): DG.Column<string> {
  const regCol: DG.Column<string> = this.getNewColumn(this.notation, this.separator);
  regCol.name = name;

  const startIdxVal: number = startIdx ?? 0;
  const endIdxVal: number = endIdx ?? this.maxLength - 1;

  const joiner = this.getJoiner();

  const regLength = endIdxVal - startIdxVal + 1;
  const gapOM = GapOriginals[this.notation];

  // Builds a region-sized list, asking `pick` for the value at each source position.
  const buildRegion = (pick: (srcPos: number) => string): string[] => {
    const region = new Array<string>(regLength);
    for (let regPos = 0; regPos < regLength; ++regPos)
      region[regPos] = pick(startIdxVal + regPos);
    return region;
  };

  regCol.init((rowI): string => {
    const seqS = this.getSplitted(rowI);
    // Custom slicing instead of array method to maintain gaps
    const regOMList = buildRegion((srcPos) => srcPos < seqS.length ? seqS.getOriginal(srcPos) : gapOM);
    return joiner(new StringListSeqSplitted(regOMList, gapOM));
  });

  const cutPositionTag = (tagValue: string): string => {
    const srcPosList = tagValue.split(',').map((p) => p.trim());
    const regPosList = buildRegion((srcPos) => srcPos < srcPosList.length ? srcPosList[srcPos] : '?');
    return regPosList.join(positionSeparator);
  };

  const srcPositionNamesStr = this.column.getTag(TAGS.positionNames);
  if (srcPositionNamesStr)
    regCol.setTag(TAGS.positionNames, cutPositionTag(srcPositionNamesStr));

  const srcPositionLabelsStr = this.column.getTag(TAGS.positionLabels);
  if (srcPositionLabelsStr)
    regCol.setTag(TAGS.positionLabels, cutPositionTag(srcPositionLabelsStr));

  return regCol;
}
666
+
667
/** Cache for the joiner returned by the {@link joiner} getter. */
private _joiner?: JoinerFunc = undefined;

/** Joiner built by getJoiner() without options; created on first access and reused afterwards. */
public get joiner(): JoinerFunc {
  const cached = this._joiner;
  if (cached) return cached;

  const created = this.getJoiner();
  this._joiner = created;
  return created;
}
675
+
676
/**
 * Creates a function joining a splitted sequence into a string of the requested notation.
 * @param opts Target notation and separator; when omitted, the notation and separator of this handler are used
 * @returns Joiner for the FASTA, separator or HELM notation
 * @throws Error when the separator is missing for the separator notation, or the notation is unexpected
 */
public getJoiner(opts?: { notation: NOTATION, separator?: string }): JoinerFunc {
  const notation = opts ? opts.notation : this.notation;
  const separator = opts ? opts.separator : this.separator;

  switch (notation) {
    case NOTATION.FASTA:
      return (srcSS: ISeqSplitted): string => this.joinToFasta(srcSS, this.isHelm());

    case NOTATION.SEPARATOR: {
      if (!separator) throw new Error(`Separator is mandatory for notation '${notation}'.`);
      return (srcSS: ISeqSplitted): string => joinToSeparator(srcSS, separator, this.isHelm());
    }

    case NOTATION.HELM: {
      // Both values are computed once, when the joiner is created.
      const isDnaOrRna = this.alphabet === ALPHABET.DNA || this.alphabet === ALPHABET.RNA;
      const wrappers = this.getHelmWrappers();
      return (srcSS: ISeqSplitted): string => joinToHelm(srcSS, wrappers, isDnaOrRna);
    }

    default:
      throw new Error(`Unexpected notation '${notation}'.`);
  }
}
704
+
705
/**
 * Creates a function converting a sequence string of this handler to the target notation.
 * @param tgtUnits Target notation
 * @param tgtSeparator Separator of the target; mandatory for the separator notation
 * @returns Converter taking a source sequence string and returning it in the target notation
 * @throws Error when the target separator is missing for the separator notation,
 *   or the target notation is unexpected
 */
public getConverter(tgtUnits: NOTATION, tgtSeparator: string | undefined = undefined): ConvertFunc {
  switch (tgtUnits) {
    case NOTATION.FASTA:
      return (srcSeq: string) => this.convertToFasta(srcSeq);

    case NOTATION.HELM:
      return (srcSeq: string) => this.convertToHelm(srcSeq);

    case NOTATION.SEPARATOR: {
      if (!tgtSeparator)
        throw new Error(`Target separator is not specified for target units '${NOTATION.SEPARATOR}'.`);
      // Narrowed copy, so the closure needs no non-null assertion.
      const separator: string = tgtSeparator;
      return (srcSeq: string) => this.convertToSeparator(srcSeq, separator);
    }

    default:
      // Previously an Error without any message was thrown here.
      throw new Error(`Unexpected target units '${tgtUnits}'.`);
  }
}
719
+
720
/**
 * Gets the SeqHandler stored in the column's temp slot; creates a new one and stores it to the slot
 * when the slot is empty or the stored handler's columnVersion differs from the column's version.
 */
public static forColumn(col: DG.Column<string>, seqHelper: SeqHelper): SeqHandler {
  // TODO: Invalidate col.temp[SeqTemps.seqHandler] checking column's metadata
  const cached = col.temp[SeqTemps.seqHandler];
  if (cached && cached.columnVersion === col.version)
    return cached;

  const created = new SeqHandler(col, seqHelper);
  col.temp[SeqTemps.seqHandler] = created;
  return created;
}
728
+
729
+ // -- joiners & converters --
730
+
731
/**
 * Joins a splitted sequence into a FASTA string.
 * Gaps are written with the FASTA gap, phosphates are dropped,
 * and monomers longer than one character are enclosed in square brackets.
 * @param seqS Splitted sequence
 * @param isHelm When true, HELM_WRAPPERS_REGEXP matches in a monomer are replaced with their first group
 * @returns FASTA string
 */
private joinToFasta(seqS: ISeqSplitted, isHelm: boolean): string {
  const resMList: string[] = new Array<string>(seqS.length);
  for (let posIdx: number = 0; posIdx < seqS.length; ++posIdx) {
    // Gap and phosphate are detected on the canonical symbol, as joinToSeparator and joinToHelm do.
    // Comparing the original symbol misses gaps whose original differs from the canonical gap.
    const cm: string = seqS.getCanonical(posIdx);
    let om: string = seqS.getOriginal(posIdx);
    if (isHelm)
      om = om.replace(HELM_WRAPPERS_REGEXP, '$1');

    if (cm === GAP_SYMBOL)
      om = GapOriginals[NOTATION.FASTA];
    else if (cm === PHOSPHATE_SYMBOL)
      om = '';
    else if (om.length > 1)
      om = '[' + om + ']';

    resMList[posIdx] = om;
  }
  return resMList.join('');
}
750
+
751
/**
 * Converts a sequence string of this handler to FASTA.
 * A HELM source is split with splitterAsHelmNucl, any other source with the handler's splitter.
 */
private convertToFasta(src: string): string {
  const helm = this.isHelm();
  const srcSS: ISeqSplitted = helm ? this.splitterAsHelmNucl(src) : this.splitter(src);
  return this.joinToFasta(srcSS, helm);
}
756
+
757
/**
 * Converts a sequence string of this handler to the separator notation.
 * A HELM source is split with splitterAsHelmNucl, any other source with the handler's splitter.
 * @param src Source sequence string
 * @param tgtSeparator Separator placed between monomers of the result
 */
private convertToSeparator(src: string, tgtSeparator: string): string {
  const helm = this.isHelm();
  const srcSS: ISeqSplitted = helm ? this.splitterAsHelmNucl(src) : this.splitter(src);
  return joinToSeparator(srcSS, tgtSeparator, helm);
}
761
+
762
/**
 * Converts a sequence string of this handler to HELM.
 * A source that is already HELM is returned unchanged.
 * @param src Source sequence string
 * @returns HELM string built with the handler's HELM wrappers
 */
private convertToHelm(src: string): string {
  if (this.notation === NOTATION.HELM) return src;

  const wrappers = this.getHelmWrappers();

  // The source is not HELM here, so its text has no 'DNA'/'RNA' polymer prefix to inspect:
  // a FASTA peptide starting with D, N, A would be misclassified by a prefix test.
  // Use the alphabet, the same way getJoiner() does for the HELM notation.
  const isDnaOrRna = this.alphabet === ALPHABET.DNA || this.alphabet === ALPHABET.RNA;
  const srcSS = this.splitter(src);
  return joinToHelm(srcSS, wrappers, isDnaOrRna);
}
771
+
772
/**
 * Splits a Helm sequence; for sources starting with 'DNA' or 'RNA' the HELM_WRAPPERS_REGEXP matches
 * of each monomer are replaced with their first group. (!) Removes lone phosphates and empty monomers.
 */
private splitterAsHelmNucl(src: string): ISeqSplitted {
  const srcSS: ISeqSplitted = this.splitter(src);
  const isNucl = src.startsWith('DNA') || src.startsWith('RNA');

  const kept: string[] = [];
  for (let pos = 0; pos < srcSS.length; ++pos) {
    let om: string = srcSS.getOriginal(pos);
    if (isNucl) {
      om = om.replace(HELM_WRAPPERS_REGEXP, '$1');
      if (om === PHOSPHATE_SYMBOL) continue;
    }
    if (om) kept.push(om);
  }
  return new StringListSeqSplitted(kept, GapOriginals[NOTATION.HELM]);
}
788
+
789
+ // Custom notation provider
790
+
791
/**
 * Returns the cell renderer back stored in the temp of this handler's column,
 * creating it with the notation provider and storing it when it is not there yet.
 * NOTE(review): the non-null assertion assumes notationProvider is set whenever this is called — confirm.
 */
getRendererBack(gridCol: DG.GridColumn | null, tableCol: DG.Column<string>): CellRendererBackBase<string> {
  const temp = this.column.temp as GridCellRendererTemp<any>;
  const cached = temp.rendererBack;
  if (cached) return cached;

  const created = this.notationProvider!.createCellRendererBack(gridCol, tableCol);
  temp.rendererBack = created;
  return created;
}
798
+ }
799
+
800
+ // -- joiners --
801
+
802
/**
 * Joins a splitted sequence into a string of the separator notation.
 * Gaps are written with the separator-notation gap, phosphates become empty items.
 * @param seqS Splitted sequence
 * @param tgtSeparator Separator placed between monomers
 * @param isHelm When true, HELM_WRAPPERS_REGEXP matches in a monomer are replaced with their first group
 */
function joinToSeparator(seqS: ISeqSplitted, tgtSeparator: string, isHelm: boolean): string {
  const items = Array.from({length: seqS.length}, (_, pos): string => {
    const canonical = seqS.getCanonical(pos);
    const original = seqS.getOriginal(pos);
    const monomer = isHelm ? original.replace(HELM_WRAPPERS_REGEXP, '$1') : original;

    if (canonical === GAP_SYMBOL) return GapOriginals[NOTATION.SEPARATOR];
    if (canonical === PHOSPHATE_SYMBOL) return '';
    return monomer;
  });
  return items.join(tgtSeparator);
}
818
+
819
/**
 * Joins a splitted sequence into a HELM string.
 * Gaps are written with the HELM gap; any other monomer is put between the left and right wrappers,
 * with square brackets added when it is not exactly one character long.
 * @param srcSS Splitted sequence
 * @param wrappers Prefix, left wrapper, right wrapper and postfix, in this order
 * @param isDnaOrRna When true, HELM_WRAPPERS_REGEXP matches in a monomer are replaced with their first group
 */
function joinToHelm(srcSS: ISeqSplitted, wrappers: string[], isDnaOrRna: boolean): string {
  const [prefix, leftWrapper, rightWrapper, postfix] = wrappers;

  const items = Array.from({length: srcSS.length}, (_, pos): string => {
    const canonical = srcSS.getCanonical(pos);
    const original: string = srcSS.getOriginal(pos);
    if (canonical === GAP_SYMBOL) return GapOriginals[NOTATION.HELM];

    const monomer = isDnaOrRna ? original.replace(HELM_WRAPPERS_REGEXP, '$1') : original;
    const core = monomer.length === 1 ? monomer : `[${monomer}]`;
    return `${leftWrapper}${core}${rightWrapper}`;
  });

  return `${prefix}${items.join('.')}${postfix}`;
}
836
+