@datagrok/bio 2.22.11 → 2.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +10 -0
- package/detectors.js +14 -0
- package/dist/455.js +1 -1
- package/dist/455.js.map +1 -1
- package/dist/package-test.js +3 -3
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +2 -2
- package/dist/package.js.map +1 -1
- package/dockerfiles/container.json +2 -2
- package/files/samples/BILN.csv +625 -0
- package/files/samples/BILN_W_HELM.csv +5114 -0
- package/package.json +6 -6
- package/src/package-api.ts +11 -0
- package/src/package.g.ts +54 -24
- package/src/package.ts +65 -13
- package/src/tests/biln-tests.ts +167 -0
- package/src/tests/converters-test.ts +14 -0
- package/src/tests/detectors-tests.ts +7 -0
- package/src/tests/renderers-test.ts +1 -1
- package/src/tests/viewers.ts +11 -16
- package/src/utils/biln.ts +69 -0
- package/src/utils/cell-renderer.ts +7 -11
- package/src/utils/convert.ts +3 -2
- package/src/utils/monomer-lib/monomer-manager/monomer-manager.ts +48 -1
- package/src/utils/save-as-fasta.ts +25 -22
- package/src/utils/seq-helper/seq-handler.ts +139 -33
- package/src/utils/seq-helper/seq-helper.ts +1 -1
- package/src/widgets/representations.ts +1 -1
- package/src/widgets/to-atomic-level-widget.ts +12 -4
- package/test-console-output-1.log +1071 -3014
- package/test-record-1.mp4 +0 -0
|
@@ -10,6 +10,8 @@ import {ALIGNMENT, ALPHABET, NOTATION, TAGS as bioTAGS} from '@datagrok-librarie
|
|
|
10
10
|
import {ISeqHelper, getSeqHelper} from '@datagrok-libraries/bio/src/utils/seq-helper';
|
|
11
11
|
|
|
12
12
|
import {_testNeg, _testPos, DetectorTestData, DfReaderFunc, PosCol} from './utils/detectors-utils';
|
|
13
|
+
import { SeqTemps } from '@datagrok-libraries/bio/src/utils/macromolecule/seq-handler';
|
|
14
|
+
import { _testBilnDetection, detectorTestsDataForBiln } from './biln-tests';
|
|
13
15
|
|
|
14
16
|
/*
|
|
15
17
|
// snippet to list df columns of semType='Macromolecule' (false positive)
|
|
@@ -418,6 +420,11 @@ MWRSWY-CKHPMWRSWY-CKHP`;
|
|
|
418
420
|
}, seqHelper);
|
|
419
421
|
});
|
|
420
422
|
|
|
423
|
+
for (const bilnT of detectorTestsDataForBiln) {
|
|
424
|
+
test(bilnT.name, async () => {
|
|
425
|
+
await _testBilnDetection(bilnT.seqs, seqHelper, bilnT.negative);
|
|
426
|
+
});
|
|
427
|
+
}
|
|
421
428
|
// test('samplesFastaFasta', async () => {
|
|
422
429
|
// await _testDf(readSamples(Samples.fastaFasta), {
|
|
423
430
|
// 'sequence': new PosCol(NOTATION.FASTA, ALIGNMENT.SEQ, ALPHABET.PT, 20, false),
|
|
@@ -65,7 +65,7 @@ category('renderers', () => {
|
|
|
65
65
|
|
|
66
66
|
test('scatterPlotTooltip', async () => {
|
|
67
67
|
await _testScatterPlotTooltip();
|
|
68
|
-
}
|
|
68
|
+
});
|
|
69
69
|
|
|
70
70
|
async function _rendererMacromoleculeFasta() {
|
|
71
71
|
const csv: string = await grok.dapi.files.readAsText('System:AppData/Bio/samples/FASTA.csv');
|
package/src/tests/viewers.ts
CHANGED
|
@@ -6,19 +6,14 @@ import {category, test, testViewer} from '@datagrok-libraries/utils/src/test';
|
|
|
6
6
|
import {readDataframe} from './utils';
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
category('viewers', () => {
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
'VdRegions': 'GROK-13162',
|
|
21
|
-
}[v],
|
|
22
|
-
});
|
|
23
|
-
}
|
|
24
|
-
});
|
|
9
|
+
// category('viewers', () => {
|
|
10
|
+
// const viewers = DG.Func.find({package: 'Bio', tags: ['viewer']}).map((f) => f.friendlyName);
|
|
11
|
+
// for (const v of viewers) {
|
|
12
|
+
// test(v, async () => {
|
|
13
|
+
// const df = await readDataframe('samples/FASTA_DNA.csv');
|
|
14
|
+
// await df.meta.detectSemanticTypes();
|
|
15
|
+
// await grok.data.detectSemanticTypes(df);
|
|
16
|
+
// await testViewer(v, df, {detectSemanticTypes: true});
|
|
17
|
+
// });
|
|
18
|
+
// }
|
|
19
|
+
// });
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/* eslint-disable max-len */
|
|
2
|
+
/* eslint-disable max-len */
|
|
3
|
+
import * as grok from 'datagrok-api/grok';
|
|
4
|
+
import * as ui from 'datagrok-api/ui';
|
|
5
|
+
import * as DG from 'datagrok-api/dg';
|
|
6
|
+
|
|
7
|
+
/* eslint-disable max-len */
|
|
8
|
+
import {ISeqHelper} from '@datagrok-libraries/bio/src/utils/seq-helper';
|
|
9
|
+
import {INotationProvider, SplitterFunc} from '@datagrok-libraries/bio/src/utils/macromolecule/types';
|
|
10
|
+
import {NOTATION} from '@datagrok-libraries/bio/src/utils/macromolecule/consts';
|
|
11
|
+
import {CellRendererBackBase} from '@datagrok-libraries/bio/src/utils/cell-renderer-back-base';
|
|
12
|
+
import {MonomerPlacer} from '@datagrok-libraries/bio/src/utils/cell-renderer-monomer-placer';
|
|
13
|
+
import {monomerToShort, splitterAsBiln} from '@datagrok-libraries/bio/src/utils/macromolecule/utils';
|
|
14
|
+
import {_package} from '../package';
|
|
15
|
+
/* eslint-enable max-len */
|
|
16
|
+
|
|
17
|
+
export class BilnNotationProvider implements INotationProvider {
|
|
18
|
+
public readonly splitter: SplitterFunc;
|
|
19
|
+
|
|
20
|
+
get defaultGapOriginal(): string { return ''; }
|
|
21
|
+
|
|
22
|
+
constructor(
|
|
23
|
+
public readonly separator: string,
|
|
24
|
+
public readonly seqHelper: ISeqHelper,
|
|
25
|
+
public readonly seqCol: DG.Column
|
|
26
|
+
) {
|
|
27
|
+
this.splitter = splitterAsBiln.bind(this);
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
setUnits(): void {}
|
|
31
|
+
|
|
32
|
+
public getHelm(seq: string, _options?: any): string {
|
|
33
|
+
// return resPseudoHelm;
|
|
34
|
+
// generate helm from biln
|
|
35
|
+
const seqSplitted = this.splitter(seq);
|
|
36
|
+
const sh = this.seqHelper.getSeqHandler(this.seqCol);
|
|
37
|
+
return sh.getJoiner({notation: NOTATION.HELM})(seqSplitted);
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
public createCellRendererBack(gridCol: DG.GridColumn | null, tableCol: DG.Column<string>):
|
|
41
|
+
CellRendererBackBase<string> {
|
|
42
|
+
const maxLengthOfMonomer = _package.properties.maxMonomerLength || 4;
|
|
43
|
+
// (_package.bioProperties ? _package.bioProperties.maxMonomerLength : 4) ?? 50;
|
|
44
|
+
const back = new BilnCellRendererBack(gridCol, tableCol,
|
|
45
|
+
maxLengthOfMonomer, this.seqHelper);
|
|
46
|
+
|
|
47
|
+
back.init().then(() => {});
|
|
48
|
+
return back;
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
export class BilnCellRendererBack extends MonomerPlacer {
|
|
53
|
+
constructor(
|
|
54
|
+
gridCol: DG.GridColumn | null, tableCol: DG.Column,
|
|
55
|
+
maxLengthOfMonomer: number, seqHelper: ISeqHelper
|
|
56
|
+
) {
|
|
57
|
+
super(gridCol, tableCol, _package.logger, maxLengthOfMonomer, () => {
|
|
58
|
+
const sh = seqHelper.getSeqHandler(tableCol);
|
|
59
|
+
const {font, fontWidth} = MonomerPlacer.getFontSettings(tableCol);
|
|
60
|
+
return {
|
|
61
|
+
seqHandler: sh,
|
|
62
|
+
font: font,
|
|
63
|
+
fontCharWidth: fontWidth,
|
|
64
|
+
separatorWidth: 0,
|
|
65
|
+
monomerToShort: monomerToShort,
|
|
66
|
+
};
|
|
67
|
+
});
|
|
68
|
+
}
|
|
69
|
+
}
|
|
@@ -81,10 +81,9 @@ export class MacromoleculeSequenceCellRenderer extends DG.GridCellRenderer {
|
|
|
81
81
|
|
|
82
82
|
getRendererBack(gridCell: DG.GridCell): CellRendererBackBase<string> | null {
|
|
83
83
|
const [gridCol, tableCol, _temp] = getGridCellColTemp<string, any>(gridCell);
|
|
84
|
-
if (
|
|
84
|
+
if (_temp.rendererBack)
|
|
85
85
|
return _temp.rendererBack;
|
|
86
86
|
let back: CellRendererBackBase<string> | null = null;
|
|
87
|
-
|
|
88
87
|
if (this.seqHelper) {
|
|
89
88
|
const sh = this.seqHelper.getSeqHandler(tableCol);
|
|
90
89
|
back = sh.getRendererBack(gridCol, tableCol);
|
|
@@ -96,10 +95,8 @@ export class MacromoleculeSequenceCellRenderer extends DG.GridCellRenderer {
|
|
|
96
95
|
const colTemp: TempType = gridCell.cell.column.temp;
|
|
97
96
|
colTemp[tempTAGS.currentWord] = gridCell.cell.value;
|
|
98
97
|
gridCell.grid.invalidate();
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
back?.onClick(gridCell, _e);
|
|
102
|
-
}
|
|
98
|
+
const back = this.getRendererBack(gridCell);
|
|
99
|
+
back?.onClick(gridCell, _e);
|
|
103
100
|
}
|
|
104
101
|
|
|
105
102
|
override onMouseEnter(gridCell: DG.GridCell, e: MouseEvent) {
|
|
@@ -195,12 +192,11 @@ export class MacromoleculeSequenceCellRenderer extends DG.GridCellRenderer {
|
|
|
195
192
|
}
|
|
196
193
|
|
|
197
194
|
override render(g: CanvasRenderingContext2D, x: number, y: number, w: number, h: number, gridCell: DG.GridCell, cellStyle: DG.GridCellStyle): void {
|
|
198
|
-
|
|
199
|
-
|
|
195
|
+
const back = this.getRendererBack(gridCell);
|
|
196
|
+
if (back)
|
|
200
197
|
back?.render(g, x, y, w, h, gridCell, cellStyle);
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
this.renderInt(g, x, y, w, h, gridCell, cellStyle);
|
|
198
|
+
else
|
|
199
|
+
this.renderInt(g, x, y, w, h, gridCell, cellStyle);
|
|
204
200
|
}
|
|
205
201
|
}
|
|
206
202
|
|
package/src/utils/convert.ts
CHANGED
|
@@ -39,6 +39,7 @@ export function convert(col: DG.Column<string> | undefined, seqHelper: ISeqHelpe
|
|
|
39
39
|
NOTATION.FASTA,
|
|
40
40
|
NOTATION.SEPARATOR,
|
|
41
41
|
NOTATION.HELM,
|
|
42
|
+
NOTATION.BILN
|
|
42
43
|
];
|
|
43
44
|
const toggleColumn = (newCol: DG.Column) => {
|
|
44
45
|
srcCol = newCol;
|
|
@@ -47,7 +48,7 @@ export function convert(col: DG.Column<string> | undefined, seqHelper: ISeqHelpe
|
|
|
47
48
|
if (currentNotation === NOTATION.HELM)
|
|
48
49
|
separatorInput.value = '/'; // helm monomers can have - in the name like D-aThr;
|
|
49
50
|
dialogHeader.textContent = 'Current notation: ' + currentNotation;
|
|
50
|
-
filteredNotations = notations
|
|
51
|
+
filteredNotations = notations;//.filter((e) => e !== currentNotation); TEMPORARY DO NOT FORGET TO UNCOMMENT
|
|
51
52
|
targetNotationInput = ui.input.choice('Convert to', {
|
|
52
53
|
value: filteredNotations[0], items: filteredNotations,
|
|
53
54
|
onValueChanged: toggleSeparator
|
|
@@ -70,7 +71,7 @@ export function convert(col: DG.Column<string> | undefined, seqHelper: ISeqHelpe
|
|
|
70
71
|
});
|
|
71
72
|
|
|
72
73
|
const separatorArray = ['-', '.', '/'];
|
|
73
|
-
let filteredNotations = notations
|
|
74
|
+
let filteredNotations = notations;//.filter((e) => e !== currentNotation); // TEMPORARY DO NOT FORGET TO UNCOMMENT
|
|
74
75
|
|
|
75
76
|
const separatorInput = ui.input.choice('Separator', {value: separatorArray[0], items: separatorArray});
|
|
76
77
|
|
|
@@ -57,6 +57,53 @@ export async function standardiseMonomers(monomers: Monomer[]) {
|
|
|
57
57
|
return fixedMonomers;
|
|
58
58
|
}
|
|
59
59
|
|
|
60
|
+
/// matches molecules in the dataframe with monomers in the library by canonical smiles
|
|
61
|
+
export async function matchMoleculesWithMonomers(molDf: DG.DataFrame, molColName: string, monomerLib: IMonomerLib, polymerType: PolymerType = 'PEPTIDE'): Promise<DG.DataFrame> {
|
|
62
|
+
const converterFunc = DG.Func.find({package: 'Chem', name: 'convertMoleculeNotation'})[0];
|
|
63
|
+
if (!converterFunc)
|
|
64
|
+
throw new Error('Function convertMoleculeNotation not found, please install Chem package');
|
|
65
|
+
// first: standardize monomers
|
|
66
|
+
const monomers = monomerLib.getMonomerSymbolsByType(polymerType).map((s) => monomerLib.getMonomer(polymerType, s)!).filter((m) => m && (m.smiles || m.molfile));
|
|
67
|
+
const fixedMonomers = await standardiseMonomers(monomers);
|
|
68
|
+
const cappedSmilse = fixedMonomers.map((m, i) => ({sym: m.symbol, smiles: capSmiles(m.smiles ?? '', m.rgroups ?? []), original: m.smiles, source: monomers[i]?.lib?.source})).filter((s) => !!s?.smiles && !s.smiles.includes('[*:'));
|
|
69
|
+
// canonicalize all monomer smiles
|
|
70
|
+
const monomerSmilesCol = DG.Column.fromList(DG.COLUMN_TYPE.STRING, 'MonomerSmiles', cappedSmilse.map((m) => m.smiles!));
|
|
71
|
+
monomerSmilesCol.semType = DG.SEMTYPE.MOLECULE;
|
|
72
|
+
const canonicalizedMonomersSmilesCol: DG.Column = await converterFunc.apply({molecule: monomerSmilesCol, targetNotation: DG.chem.Notation.Smiles});
|
|
73
|
+
if (!canonicalizedMonomersSmilesCol || canonicalizedMonomersSmilesCol.length !== monomerSmilesCol.length)
|
|
74
|
+
throw new Error('Error canonicalizing monomer smiles');
|
|
75
|
+
canonicalizedMonomersSmilesCol.toList().forEach((s, i) => cappedSmilse[i].smiles = s);
|
|
76
|
+
|
|
77
|
+
const molecules = molDf.col(molColName)!;
|
|
78
|
+
const canonicalizedMoleculesCol: DG.Column = await converterFunc.apply({molecule: molecules, targetNotation: DG.chem.Notation.Smiles});
|
|
79
|
+
if (!canonicalizedMoleculesCol || canonicalizedMoleculesCol.length !== molecules.length)
|
|
80
|
+
throw new Error('Error canonicalizing molecules');
|
|
81
|
+
|
|
82
|
+
const canonicalizedMolecules = canonicalizedMoleculesCol.toList();
|
|
83
|
+
|
|
84
|
+
const resultDf = molDf.clone();
|
|
85
|
+
const matchingMonomerSmilesCol = resultDf.columns.addNewString(resultDf.columns.getUnusedName('Matched monomer smiles'));
|
|
86
|
+
matchingMonomerSmilesCol.semType = DG.SEMTYPE.MOLECULE;
|
|
87
|
+
const matchingMonomerSymbolCol = resultDf.columns.addNewString(resultDf.columns.getUnusedName('Matched monomer symbol'));
|
|
88
|
+
matchingMonomerSymbolCol.semType = 'Monomer';
|
|
89
|
+
const sourceLibCol = resultDf.columns.addNewString(resultDf.columns.getUnusedName('Matched monomer source'));
|
|
90
|
+
resultDf.columns.setOrder([molColName, matchingMonomerSymbolCol.name, matchingMonomerSmilesCol.name, sourceLibCol.name]);
|
|
91
|
+
|
|
92
|
+
for (let i = 0; i < canonicalizedMolecules.length; i++) {
|
|
93
|
+
const mol = canonicalizedMolecules[i];
|
|
94
|
+
if (!mol) continue;
|
|
95
|
+
for (let j = 0; j < cappedSmilse.length; j++) {
|
|
96
|
+
if (cappedSmilse[j].smiles === mol) {
|
|
97
|
+
matchingMonomerSmilesCol.set(i, cappedSmilse[j].original!, false);
|
|
98
|
+
matchingMonomerSymbolCol.set(i, cappedSmilse[j].sym, false);
|
|
99
|
+
sourceLibCol.set(i, cappedSmilse[j].source ?? '', false);
|
|
100
|
+
break;
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
return resultDf;
|
|
105
|
+
}
|
|
106
|
+
|
|
60
107
|
/** Standardizes the monomer library
|
|
61
108
|
* warning: throws error if the library is not valid or has invalid monomers
|
|
62
109
|
*/
|
|
@@ -127,7 +174,7 @@ export function getMonomersDataFrame(monomers: Monomer[]) {
|
|
|
127
174
|
monomers[i].id,
|
|
128
175
|
JSON.stringify(monomers[i].meta ?? {}),
|
|
129
176
|
monomers[i].lib?.source ?? '',
|
|
130
|
-
]);
|
|
177
|
+
], false);
|
|
131
178
|
// something is wrong with setting dates, so setting it manually for now
|
|
132
179
|
try {
|
|
133
180
|
if (date)
|
|
@@ -15,23 +15,21 @@ const FASTA_LINE_WIDTH = 60;
|
|
|
15
15
|
export function saveAsFastaUI(): void {
|
|
16
16
|
// Use grid for column order adjusted by user
|
|
17
17
|
const grid: DG.Grid = grok.shell.tv.grid;
|
|
18
|
+
const dataFrame: DG.DataFrame = grid.dataFrame;
|
|
18
19
|
|
|
19
|
-
const idGColList: DG.
|
|
20
|
-
.map((colI: number) =>
|
|
21
|
-
.filter((
|
|
22
|
-
const defaultIdGCol: DG.
|
|
23
|
-
.find((
|
|
24
|
-
const idDefaultValue = defaultIdGCol ? [defaultIdGCol
|
|
20
|
+
const idGColList: DG.Column[] = wu.count(0).take(dataFrame.columns.length)
|
|
21
|
+
.map((colI: number) => dataFrame.columns.byIndex(colI)!)
|
|
22
|
+
.filter((col: DG.Column) => col.semType !== DG.SEMTYPE.MACROMOLECULE).toArray();
|
|
23
|
+
const defaultIdGCol: DG.Column | undefined = idGColList
|
|
24
|
+
.find((col: DG.Column) => col.name.toLowerCase().indexOf('id') !== -1);
|
|
25
|
+
const idDefaultValue = defaultIdGCol ? [defaultIdGCol] : [];
|
|
25
26
|
|
|
26
|
-
const idGColListInput = ui.input.
|
|
27
|
-
|
|
28
|
-
items: idGColList.map((gcol: DG.GridColumn) => gcol.name)
|
|
29
|
-
});
|
|
27
|
+
const idGColListInput = ui.input.columns('Seq id columns', {table: dataFrame, value: idDefaultValue,
|
|
28
|
+
filter: (col: DG.Column) => col.semType !== DG.SEMTYPE.MACROMOLECULE});
|
|
30
29
|
|
|
31
|
-
const seqGColList: DG.
|
|
32
|
-
.map((colI: number) =>
|
|
33
|
-
.filter((
|
|
34
|
-
const col: DG.Column | null = gc.column;
|
|
30
|
+
const seqGColList: DG.Column[] = wu.count(0).take(dataFrame.columns.length)/* range from 0 to dataFrame.columns.length */
|
|
31
|
+
.map((colI: number) => dataFrame.columns.byIndex(colI)!)
|
|
32
|
+
.filter((col: DG.Column) => {
|
|
35
33
|
if (col && col.semType === DG.SEMTYPE.MACROMOLECULE) {
|
|
36
34
|
const sh = _package.seqHelper.getSeqHandler(col);
|
|
37
35
|
return sh.isFasta();
|
|
@@ -39,10 +37,17 @@ export function saveAsFastaUI(): void {
|
|
|
39
37
|
return false;
|
|
40
38
|
}).toArray();
|
|
41
39
|
|
|
42
|
-
const seqDefaultValue = seqGColList.length > 0 ? seqGColList[0]
|
|
43
|
-
const seqColInput = ui.input.
|
|
44
|
-
|
|
45
|
-
|
|
40
|
+
const seqDefaultValue = seqGColList.length > 0 ? seqGColList[0] : null;
|
|
41
|
+
const seqColInput = ui.input.column('Seq column', {
|
|
42
|
+
table: dataFrame,
|
|
43
|
+
value: seqDefaultValue!,
|
|
44
|
+
filter: (col) => {
|
|
45
|
+
if (col && col.semType === DG.SEMTYPE.MACROMOLECULE) {
|
|
46
|
+
const sh = _package.seqHelper.getSeqHandler(col);
|
|
47
|
+
return sh.isFasta();
|
|
48
|
+
}
|
|
49
|
+
return false;
|
|
50
|
+
}
|
|
46
51
|
});
|
|
47
52
|
|
|
48
53
|
const lineWidthInput = ui.input.int('FASTA line width', {value: FASTA_LINE_WIDTH});
|
|
@@ -54,10 +59,8 @@ export function saveAsFastaUI(): void {
|
|
|
54
59
|
lineWidthInput,
|
|
55
60
|
]))
|
|
56
61
|
.onOK(() => {
|
|
57
|
-
const valueIdColList: DG.Column[] = idGColListInput.value
|
|
58
|
-
|
|
59
|
-
const valueSeqCol: DG.Column | null = seqColInput.value ?
|
|
60
|
-
grid.columns.byName(seqColInput.value as string)!.column : null;
|
|
62
|
+
const valueIdColList: DG.Column[] = idGColListInput.value ?? [];
|
|
63
|
+
const valueSeqCol: DG.Column | null = seqColInput.value ?? null;
|
|
61
64
|
const valueLineWidth = lineWidthInput.value ?? FASTA_LINE_WIDTH;
|
|
62
65
|
|
|
63
66
|
if (!valueSeqCol)
|
|
@@ -6,7 +6,7 @@ import wu from 'wu';
|
|
|
6
6
|
|
|
7
7
|
/* eslint-disable max-len */
|
|
8
8
|
import {ALIGNMENT, ALPHABET, candidateAlphabets, getSplitterWithSeparator, NOTATION, positionSeparator, splitterAsFasta, splitterAsHelm, TAGS} from '@datagrok-libraries/bio/src/utils/macromolecule/index';
|
|
9
|
-
import {INotationProvider, ISeqSplitted, SeqColStats, SplitterFunc,} from '@datagrok-libraries/bio/src/utils/macromolecule/types';
|
|
9
|
+
import {INotationProvider, ISeqConnection, ISeqSplitted, SeqColStats, SplitterFunc,} from '@datagrok-libraries/bio/src/utils/macromolecule/types';
|
|
10
10
|
import {detectAlphabet, detectHelmAlphabet, splitterAsFastaSimple, StringListSeqSplitted} from '@datagrok-libraries/bio/src/utils/macromolecule/utils';
|
|
11
11
|
import {mmDistanceFunctions, MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
|
|
12
12
|
import {mmDistanceFunctionType} from '@datagrok-libraries/ml/src/macromolecule-distance-functions/types';
|
|
@@ -28,10 +28,22 @@ export class SeqHandler implements ISeqHandler {
|
|
|
28
28
|
protected readonly _units: string; // units, of the form fasta, separator
|
|
29
29
|
protected readonly _notation: NOTATION; // current notation (without :SEQ:NT, etc.)
|
|
30
30
|
protected readonly _defaultGapOriginal: string;
|
|
31
|
-
|
|
31
|
+
private _notationProvider: INotationProvider | null = null;
|
|
32
|
+
private _tempReadForNotProvider = false;
|
|
33
|
+
protected get notationProvider(): INotationProvider | null {
|
|
34
|
+
if (!this._tempReadForNotProvider) {
|
|
35
|
+
this._tempReadForNotProvider = true;
|
|
36
|
+
this._notationProvider = this._notationProvider ?? this._column.temp[SeqTemps.notationProvider] ?? null;
|
|
37
|
+
}
|
|
38
|
+
return this._notationProvider;
|
|
39
|
+
};
|
|
40
|
+
protected set notationProvider(v: INotationProvider | null) { this._notationProvider = v; };
|
|
32
41
|
|
|
33
42
|
private _splitter: SplitterFunc | null = null;
|
|
34
43
|
|
|
44
|
+
private _refinerPromise: Promise<void> = Promise.resolve();
|
|
45
|
+
public get refinerPromise(): Promise<void> { return this._refinerPromise; }
|
|
46
|
+
|
|
35
47
|
protected constructor(col: DG.Column<string>,
|
|
36
48
|
private readonly seqHelper: SeqHelper,
|
|
37
49
|
) {
|
|
@@ -44,7 +56,7 @@ export class SeqHandler implements ISeqHandler {
|
|
|
44
56
|
this._units = units!;
|
|
45
57
|
|
|
46
58
|
this._notation = this.getNotation();
|
|
47
|
-
if (this.isCustom()) {
|
|
59
|
+
if (this.isCustom() || this.isBiln()) {
|
|
48
60
|
// this.column.temp[SeqTemps.notationProvider] must be set at detector stage
|
|
49
61
|
this.notationProvider = this.column.temp[SeqTemps.notationProvider] ?? null;
|
|
50
62
|
}
|
|
@@ -52,7 +64,7 @@ export class SeqHandler implements ISeqHandler {
|
|
|
52
64
|
const defaultGapOriginal = this.isFasta() ? GapOriginals[NOTATION.FASTA] :
|
|
53
65
|
this.isSeparator() ? GapOriginals[NOTATION.SEPARATOR] :
|
|
54
66
|
this.isHelm() ? GapOriginals[NOTATION.HELM] :
|
|
55
|
-
this.isCustom() ? (this.notationProvider?.defaultGapOriginal ?? GapOriginals[NOTATION.SEPARATOR]) :
|
|
67
|
+
this.isCustom() || this.isBiln() ? (this.notationProvider?.defaultGapOriginal ?? GapOriginals[NOTATION.SEPARATOR]) :
|
|
56
68
|
undefined;
|
|
57
69
|
if (defaultGapOriginal == undefined)
|
|
58
70
|
throw new Error(`Unexpected defaultGapOriginal for notation '${this.notation}'`);
|
|
@@ -70,8 +82,8 @@ export class SeqHandler implements ISeqHandler {
|
|
|
70
82
|
this.seqHelper.setUnitsToSeparatorColumn(this, separator);
|
|
71
83
|
} else if (this.isHelm())
|
|
72
84
|
this.seqHelper.setUnitsToHelmColumn(this);
|
|
73
|
-
else if (this.isCustom())
|
|
74
|
-
this.notationProvider
|
|
85
|
+
else if (this.isCustom() || this.isBiln())
|
|
86
|
+
this.notationProvider?.setUnits(this);
|
|
75
87
|
else
|
|
76
88
|
throw new Error(`Unexpected units '${this.column.meta.units}'.`);
|
|
77
89
|
}
|
|
@@ -96,8 +108,8 @@ export class SeqHandler implements ISeqHandler {
|
|
|
96
108
|
|
|
97
109
|
this.columnVersion = this.column.version;
|
|
98
110
|
// refine separator only at this stage
|
|
99
|
-
if (this.isSeparator() && (!this.isCustom() || !this.notationProvider) && !col.temp['seqHandlerRefined']) {
|
|
100
|
-
this.refineSeparatorNotation();
|
|
111
|
+
if (this.isSeparator() && ((!this.isCustom() && !this.isBiln()) || !this.notationProvider) && !col.temp['seqHandlerRefined']) {
|
|
112
|
+
this._refinerPromise = this.refineSeparatorNotation();
|
|
101
113
|
col.temp['seqHandlerRefined'] = true;
|
|
102
114
|
}
|
|
103
115
|
}
|
|
@@ -138,16 +150,11 @@ export class SeqHandler implements ISeqHandler {
|
|
|
138
150
|
const stats = getStats(categoriesSample, 3, (s) => s.split(this.separator!));
|
|
139
151
|
let invalidateRequired = false;
|
|
140
152
|
|
|
141
|
-
const refinerList = [
|
|
142
|
-
{package: 'SequenceTranslator', name: 'refineNotationProviderForHarmonizedSequence'},
|
|
143
|
-
];
|
|
153
|
+
const refinerList = DG.Func.find({tags: ['notationRefiner']});
|
|
144
154
|
|
|
145
155
|
for (const refineFuncFind of refinerList) {
|
|
146
156
|
try {
|
|
147
|
-
const
|
|
148
|
-
if (funcList.length === 0) continue;
|
|
149
|
-
|
|
150
|
-
const funcFc = funcList[0].prepare({col: this.column, stats: stats, separator: this.separator});
|
|
157
|
+
const funcFc = refineFuncFind.prepare({col: this.column, stats: stats, separator: this.separator});
|
|
151
158
|
const refineRes = (await funcFc.call()).getOutputParamValue();
|
|
152
159
|
invalidateRequired ||= refineRes;
|
|
153
160
|
} catch (err) {
|
|
@@ -157,6 +164,8 @@ export class SeqHandler implements ISeqHandler {
|
|
|
157
164
|
|
|
158
165
|
if (invalidateRequired) {
|
|
159
166
|
// Applying custom notation provider MUST invalidate SeqHandler
|
|
167
|
+
// some things might still have the old seqHandler attached, so we need to make sure they have access to notationProvider
|
|
168
|
+
this._tempReadForNotProvider = false;
|
|
160
169
|
delete this.column.temp[SeqTemps.seqHandler];
|
|
161
170
|
|
|
162
171
|
this.column.fireValuesChanged();
|
|
@@ -371,7 +380,7 @@ export class SeqHandler implements ISeqHandler {
|
|
|
371
380
|
const seq = this.column.get(rowIdx);
|
|
372
381
|
if (this.notation === NOTATION.HELM)
|
|
373
382
|
resHelm = seq;
|
|
374
|
-
else if (this.notation === NOTATION.CUSTOM)
|
|
383
|
+
else if (this.notation === NOTATION.CUSTOM || this.notation === NOTATION.BILN)
|
|
375
384
|
resHelm = this.notationProvider!.getHelm(seq, {});
|
|
376
385
|
else
|
|
377
386
|
resHelm = this.getConverter(NOTATION.HELM)(seq);
|
|
@@ -440,6 +449,8 @@ export class SeqHandler implements ISeqHandler {
|
|
|
440
449
|
|
|
441
450
|
public isCustom(): boolean { return this.notation === NOTATION.CUSTOM; }
|
|
442
451
|
|
|
452
|
+
public isBiln(): boolean { return this.notation === NOTATION.BILN; }
|
|
453
|
+
|
|
443
454
|
public isRna(): boolean { return this.alphabet === ALPHABET.RNA; }
|
|
444
455
|
|
|
445
456
|
public isDna(): boolean { return this.alphabet === ALPHABET.DNA; }
|
|
@@ -471,6 +482,8 @@ export class SeqHandler implements ISeqHandler {
|
|
|
471
482
|
return NOTATION.HELM;
|
|
472
483
|
else if (this.units.toLowerCase().startsWith(NOTATION.CUSTOM))
|
|
473
484
|
return NOTATION.CUSTOM;
|
|
485
|
+
else if (this.units.toLowerCase().startsWith(NOTATION.BILN))
|
|
486
|
+
return NOTATION.BILN;
|
|
474
487
|
else
|
|
475
488
|
throw new Error(`Column '${this.column.name}' has unexpected notation '${this.units}'.`);
|
|
476
489
|
}
|
|
@@ -538,6 +551,12 @@ export class SeqHandler implements ISeqHandler {
|
|
|
538
551
|
newColumn.setTag(TAGS.alphabetSize, srcAlphabetSize);
|
|
539
552
|
}
|
|
540
553
|
|
|
554
|
+
// if it's biln, we need to set it as a separator column, later to be refined
|
|
555
|
+
if (tgtNotation === NOTATION.BILN) {
|
|
556
|
+
newColumn.setTag(TAGS.separator, '-');
|
|
557
|
+
newColumn.meta.units = NOTATION.SEPARATOR;
|
|
558
|
+
}
|
|
559
|
+
|
|
541
560
|
return newColumn;
|
|
542
561
|
}
|
|
543
562
|
|
|
@@ -828,6 +847,10 @@ export class SeqHandler implements ISeqHandler {
|
|
|
828
847
|
res = function(srcSS: ISeqSplitted): string { return joinToHelm(srcSS, wrappers, isDnaOrRna); };
|
|
829
848
|
break;
|
|
830
849
|
}
|
|
850
|
+
case NOTATION.BILN: {
|
|
851
|
+
res = function(srcSS: ISeqSplitted): string { return joinToBiln(srcSS); };
|
|
852
|
+
break;
|
|
853
|
+
}
|
|
831
854
|
default:
|
|
832
855
|
throw new Error(`Unexpected notation '${notation}'.`);
|
|
833
856
|
}
|
|
@@ -846,8 +869,10 @@ export class SeqHandler implements ISeqHandler {
|
|
|
846
869
|
return function(srcSeq: string) { return srcSh.convertToHelm(srcSeq); };
|
|
847
870
|
else if (tgtUnits === NOTATION.SEPARATOR)
|
|
848
871
|
return function(srcSeq: string) { return srcSh.convertToSeparator(srcSeq, tgtSeparator!); };
|
|
872
|
+
else if (tgtUnits === NOTATION.BILN)
|
|
873
|
+
return function(srcSeq: string) { return srcSh.convertToBiln(srcSeq); };
|
|
849
874
|
else
|
|
850
|
-
throw new Error();
|
|
875
|
+
throw new Error('Unexpected target units \'' + tgtUnits + '\'.');
|
|
851
876
|
}
|
|
852
877
|
|
|
853
878
|
/** Gets a column's UnitsHandler object from temp slot or creates a new and stores it to the temp slot. */
|
|
@@ -897,11 +922,17 @@ export class SeqHandler implements ISeqHandler {
|
|
|
897
922
|
|
|
898
923
|
const wrappers = this.getHelmWrappers();
|
|
899
924
|
|
|
900
|
-
const isDnaOrRna =
|
|
925
|
+
const isDnaOrRna = this.isDna() || this.isRna();
|
|
901
926
|
const srcSS = this.splitter(src);
|
|
902
927
|
return joinToHelm(srcSS, wrappers, isDnaOrRna);
|
|
903
928
|
}
|
|
904
929
|
|
|
930
|
+
private convertToBiln(src: string): string {
|
|
931
|
+
if (this.notation == NOTATION.BILN) return src;
|
|
932
|
+
const srcSS = this.splitter(src);
|
|
933
|
+
return joinToBiln(srcSS);
|
|
934
|
+
}
|
|
935
|
+
|
|
905
936
|
/** Splits Helm sequence adjusting nucleotides to single char symbols. (!) Removes lone phosphorus. */
|
|
906
937
|
private splitterAsHelmNucl(src: string): ISeqSplitted {
|
|
907
938
|
const srcMList: ISeqSplitted = this.splitter(src);
|
|
@@ -921,11 +952,11 @@ export class SeqHandler implements ISeqHandler {
|
|
|
921
952
|
|
|
922
953
|
// Custom notation provider
|
|
923
954
|
|
|
924
|
-
getRendererBack(gridCol: DG.GridColumn | null, tableCol: DG.Column<string>): CellRendererBackBase<string> {
|
|
955
|
+
getRendererBack(gridCol: DG.GridColumn | null, tableCol: DG.Column<string>): CellRendererBackBase<string> | null {
|
|
925
956
|
const temp = this.column.temp as GridCellRendererTemp<any>;
|
|
926
957
|
let res = temp.rendererBack;
|
|
927
958
|
if (!res)
|
|
928
|
-
res = temp.rendererBack = this.notationProvider
|
|
959
|
+
res = temp.rendererBack = this.notationProvider?.createCellRendererBack(gridCol, tableCol);
|
|
929
960
|
return res;
|
|
930
961
|
}
|
|
931
962
|
}
|
|
@@ -950,20 +981,95 @@ function joinToSeparator(seqS: ISeqSplitted, tgtSeparator: string, isHelm: boole
|
|
|
950
981
|
}
|
|
951
982
|
|
|
952
983
|
function joinToHelm(srcSS: ISeqSplitted, wrappers: string[], isDnaOrRna: boolean): string {
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
const
|
|
957
|
-
let
|
|
958
|
-
|
|
959
|
-
om =
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
984
|
+
if (!srcSS.graphInfo || !((srcSS.graphInfo.connections?.length ?? 0) > 0)) {
|
|
985
|
+
// no graph info - linear sequence
|
|
986
|
+
const [prefix, leftWrapper, rightWrapper, postfix] = wrappers;
|
|
987
|
+
const resOMList: string[] = new Array<string>(srcSS.length);
|
|
988
|
+
for (let posIdx: number = 0; posIdx < srcSS.length; ++posIdx) {
|
|
989
|
+
const cm = srcSS.getCanonical(posIdx);
|
|
990
|
+
let om: string = cm;
|
|
991
|
+
if (cm === GAP_SYMBOL)
|
|
992
|
+
om = GapOriginals[NOTATION.HELM];
|
|
993
|
+
else {
|
|
994
|
+
if (isDnaOrRna)
|
|
995
|
+
om = om.replace(HELM_WRAPPERS_REGEXP, '$1');
|
|
996
|
+
om = om.length === 1 ? `${leftWrapper}${om}${rightWrapper}` : `${leftWrapper}[${om}]${rightWrapper}`;
|
|
997
|
+
}
|
|
998
|
+
resOMList[posIdx] = om;
|
|
999
|
+
}
|
|
1000
|
+
return `${prefix}${resOMList.join('.')}${postfix}`;
|
|
1001
|
+
} else {
|
|
1002
|
+
// there is a graph info - so we will need to be a bit tricky here
|
|
1003
|
+
const seqType = isDnaOrRna ? 'RNA' : 'PEPTIDE';
|
|
1004
|
+
const postFix = '$$$'; // three dollar signs - one is placed at the end of sequences
|
|
1005
|
+
const disjointSequenceIdxs = srcSS.graphInfo.disjointSeqStarts;
|
|
1006
|
+
const leftWrapper = wrappers[1];
|
|
1007
|
+
const rightWrapper = wrappers[2];
|
|
1008
|
+
const disjointSequences: string[] = [];
|
|
1009
|
+
for (let i = 0; i < disjointSequenceIdxs.length; i++) {
|
|
1010
|
+
const startIdx = disjointSequenceIdxs[i];
|
|
1011
|
+
const endIdx = i + 1 < disjointSequenceIdxs.length ? disjointSequenceIdxs[i + 1] : srcSS.length;
|
|
1012
|
+
const resOMList: string[] = new Array<string>(endIdx - startIdx);
|
|
1013
|
+
for (let posIdx = startIdx; posIdx < endIdx; ++posIdx) {
|
|
1014
|
+
const cm = srcSS.getCanonical(posIdx);
|
|
1015
|
+
let om: string = cm;
|
|
1016
|
+
if (cm === GAP_SYMBOL)
|
|
1017
|
+
om = GapOriginals[NOTATION.HELM];
|
|
1018
|
+
else {
|
|
1019
|
+
if (isDnaOrRna)
|
|
1020
|
+
om = om.replace(HELM_WRAPPERS_REGEXP, '$1');
|
|
1021
|
+
om = om.length === 1 ? `${leftWrapper}${om}${rightWrapper}` : `${leftWrapper}[${om}]${rightWrapper}`;
|
|
1022
|
+
}
|
|
1023
|
+
resOMList[posIdx - startIdx] = om;
|
|
1024
|
+
}
|
|
1025
|
+
disjointSequences.push(`${seqType}${i + 1}{${resOMList.join('.')}}`);
|
|
1026
|
+
}
|
|
1027
|
+
// PEPTIDE2,PEPTIDE2,16:R2-1:R1|PEPTIDE3,PEPTIDE3,16:R2-1:R1|PEPTIDE3,PEPTIDE2,10:R3-1:R3|PEPTIDE1,PEPTIDE2,1:R2-9:R3$$$V2.0
|
|
1028
|
+
const sequencePart = disjointSequences.join('|');
|
|
1029
|
+
|
|
1030
|
+
const sequenceConnections = srcSS.graphInfo.connections.map((conn) => {
|
|
1031
|
+
return `${seqType}${conn.seqIndex1 + 1},${seqType}${conn.seqIndex2 + 1},${conn.monomerIndex1 + 1}:R${conn.rGroup1}-${conn.monomerIndex2 + 1}:R${conn.rGroup2}`;
|
|
1032
|
+
}).join('|');
|
|
1033
|
+
|
|
1034
|
+
return `${sequencePart}$${sequenceConnections}${postFix}V2.0`;
|
|
1035
|
+
}
|
|
1036
|
+
}
|
|
1037
|
+
|
|
1038
|
+
function joinToBiln(srcSS: ISeqSplitted): string {
|
|
1039
|
+
if (!srcSS.graphInfo || !((srcSS.graphInfo.connections?.length ?? 0) > 0)) {
|
|
1040
|
+
const resOMList: string[] = new Array<string>(srcSS.length);
|
|
1041
|
+
for (let posIdx: number = 0; posIdx < srcSS.length; ++posIdx) {
|
|
1042
|
+
resOMList[posIdx] = srcSS.getCanonical(posIdx);
|
|
1043
|
+
if (resOMList[posIdx]?.includes('-')) // Biln uses '-' as a separator, need to enclose in []
|
|
1044
|
+
resOMList[posIdx] = `[${resOMList[posIdx]}]`;
|
|
1045
|
+
}
|
|
1046
|
+
return resOMList.join('-'); // Biln uses '-' as a separator
|
|
1047
|
+
} else { // conversion happens only if there is a graph info
|
|
1048
|
+
const disjointSequenceIdxs = srcSS.graphInfo.disjointSeqStarts;
|
|
1049
|
+
const allSeqParts = new Array<string>(srcSS.length);
|
|
1050
|
+
for (let posIdx = 0; posIdx < srcSS.length; ++posIdx) {
|
|
1051
|
+
allSeqParts[posIdx] = srcSS.getCanonical(posIdx);
|
|
1052
|
+
if (allSeqParts[posIdx]?.includes('-')) // Biln uses '-' as a separator, need to enclose in []
|
|
1053
|
+
allSeqParts[posIdx] = `[${allSeqParts[posIdx]}]`;
|
|
964
1054
|
}
|
|
965
|
-
|
|
1055
|
+
for (let i = 0; i < srcSS.graphInfo.connections.length; i++) {
|
|
1056
|
+
const conn: ISeqConnection = srcSS.graphInfo.connections[i];
|
|
1057
|
+
const conId = `${i + 1}`;
|
|
1058
|
+
const seq1Idx = conn.seqIndex1;
|
|
1059
|
+
const seq2Idx = conn.seqIndex2;
|
|
1060
|
+
const monomer1Idx = disjointSequenceIdxs[seq1Idx] + conn.monomerIndex1;
|
|
1061
|
+
const monomer2Idx = disjointSequenceIdxs[seq2Idx] + conn.monomerIndex2;
|
|
1062
|
+
const seqPart1 = `${allSeqParts[monomer1Idx]}(${conId},${conn.rGroup1})`;
|
|
1063
|
+
const seqPart2 = `${allSeqParts[monomer2Idx]}(${conId},${conn.rGroup2})`;
|
|
1064
|
+
allSeqParts[monomer1Idx] = seqPart1;
|
|
1065
|
+
allSeqParts[monomer2Idx] = seqPart2;
|
|
1066
|
+
}
|
|
1067
|
+
|
|
1068
|
+
const disjointParts = disjointSequenceIdxs.map((startIdx, i) => {
|
|
1069
|
+
const endIdx = i + 1 < disjointSequenceIdxs.length ? disjointSequenceIdxs[i + 1] : srcSS.length;
|
|
1070
|
+
return allSeqParts.slice(startIdx, endIdx).join('-');
|
|
1071
|
+
});
|
|
1072
|
+
return disjointParts.join('.'); // Biln uses '-' as a separator and '.' between disjoint sequences
|
|
966
1073
|
}
|
|
967
|
-
return `${prefix}${resOMList.join('.')}${postfix}`;
|
|
968
1074
|
}
|
|
969
1075
|
|
|
@@ -136,7 +136,7 @@ export class SeqHelper implements ISeqHelper {
|
|
|
136
136
|
}
|
|
137
137
|
|
|
138
138
|
public setUnitsToSeparatorColumn(uh: SeqHandler, separator?: string) {
|
|
139
|
-
if (uh.column.semType !== DG.SEMTYPE.MACROMOLECULE
|
|
139
|
+
if (uh.column.semType !== DG.SEMTYPE.MACROMOLECULE)
|
|
140
140
|
throw new Error(`The column of notation '${NOTATION.SEPARATOR}' must be '${DG.SEMTYPE.MACROMOLECULE}'.`);
|
|
141
141
|
if (!separator)
|
|
142
142
|
throw new Error(`The column of notation '${NOTATION.SEPARATOR}' must have the separator tag.`);
|
|
@@ -109,7 +109,7 @@ export function getMacromoleculeColumnPropertyPanel(col: DG.Column): DG.Widget {
|
|
|
109
109
|
const units = col.meta.units;
|
|
110
110
|
|
|
111
111
|
// Don't show for formats that have their own complex renderers (like Helm).
|
|
112
|
-
if (units === NOTATION.HELM
|
|
112
|
+
if (units === NOTATION.HELM)
|
|
113
113
|
return false;
|
|
114
114
|
|
|
115
115
|
// For all other cases, including 'UN' (non-canonical), 'fasta', and 'separator' show the multiline toggle.
|