@datagrok/bio 1.4.0 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/package-test.js +1353 -148
- package/dist/package.js +1192 -73
- package/dist/vendors-node_modules_datagrok-libraries_ml_src_workers_dimensionality-reducer_js.js +8975 -0
- package/files/sample_FASTA.csv +66 -0
- package/files/sample_FASTA_with_activities.csv +66 -0
- package/files/samples/peptides_complex_msa.csv +10275 -0
- package/files/samples/peptides_simple_msa.csv +648 -0
- package/files/samples/sample_HELM.csv +541 -0
- package/files/samples/sample_MSA.csv +541 -0
- package/package.json +6 -3
- package/src/package-test.ts +2 -1
- package/src/package.ts +55 -39
- package/src/tests/sequence-space-test.ts +24 -0
- package/src/tests/utils.ts +14 -1
- package/src/utils/convert.ts +24 -0
- package/src/utils/multiple-sequence-alignment.ts +1 -32
- package/src/utils/sequence-space.ts +43 -0
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "@datagrok/bio",
|
|
3
3
|
"beta": false,
|
|
4
4
|
"friendlyName": "Bio",
|
|
5
|
-
"version": "1.4.0",
|
|
5
|
+
"version": "1.4.1",
|
|
6
6
|
"description": "Bio is a [package](https://datagrok.ai/help/develop/develop#packages) for the [Datagrok](https://datagrok.ai) platform",
|
|
7
7
|
"repository": {
|
|
8
8
|
"type": "git",
|
|
@@ -13,6 +13,7 @@
|
|
|
13
13
|
"@biowasm/aioli": ">=2.4.0",
|
|
14
14
|
"@datagrok-libraries/bio": "^2.2.0",
|
|
15
15
|
"@datagrok-libraries/utils": "^0.4.2",
|
|
16
|
+
"@datagrok-libraries/ml": "^2.0.2",
|
|
16
17
|
"cash-dom": "latest",
|
|
17
18
|
"datagrok-api": "^1.4.11",
|
|
18
19
|
"dayjs": "latest",
|
|
@@ -35,7 +36,8 @@
|
|
|
35
36
|
"scripts": {
|
|
36
37
|
"link-api": "npm link datagrok-api",
|
|
37
38
|
"link-bio": "npm link @datagrok-libraries/bio",
|
|
38
|
-
"link-
|
|
39
|
+
"link-ml": "npm link @datagrok-libraries/ml",
|
|
40
|
+
"link-all": "npm link datagrok-api @datagrok-libraries/utils @datagrok-libraries/bio @datagrok-libraries/ml",
|
|
39
41
|
"debug-sequences1": "grok publish --rebuild",
|
|
40
42
|
"release-sequences1": "grok publish --rebuild --release",
|
|
41
43
|
"build-sequences1": "webpack",
|
|
@@ -48,7 +50,8 @@
|
|
|
48
50
|
"lint": "eslint \"./src/**/*.ts\"",
|
|
49
51
|
"lint-fix": "eslint \"./src/**/*.ts\" --fix",
|
|
50
52
|
"test": "jest",
|
|
51
|
-
"test-local": "set HOST=localhost && jest"
|
|
53
|
+
"test-local": "set HOST=localhost && jest",
|
|
54
|
+
"build-bio-local": "npm --prefix ./../../js-api run build && npm --prefix ./../../libraries/utils run build && npm --prefix ./../../libraries/ml run build && npm run build && npm --prefix ./../../libraries/bio run build && npm run build"
|
|
52
55
|
},
|
|
53
56
|
"canEdit": [
|
|
54
57
|
"Developers"
|
package/src/package-test.ts
CHANGED
|
@@ -6,8 +6,9 @@ import './tests/WebLogo-test';
|
|
|
6
6
|
import './tests/Palettes-test';
|
|
7
7
|
import './tests/detectors-test';
|
|
8
8
|
import './tests/msa-tests';
|
|
9
|
+
import './tests/sequence-space-test';
|
|
9
10
|
|
|
10
|
-
export const
|
|
11
|
+
export const _package = new DG.Package();
|
|
11
12
|
export {tests};
|
|
12
13
|
|
|
13
14
|
/** For the 'test' function argument names are fixed as 'category' and 'test' because of way it is called. */
|
package/src/package.ts
CHANGED
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
import * as grok from 'datagrok-api/grok';
|
|
3
3
|
import * as ui from 'datagrok-api/ui';
|
|
4
4
|
import * as DG from 'datagrok-api/dg';
|
|
5
|
-
|
|
6
5
|
import {SequenceAlignment, Aligned} from './seq_align';
|
|
7
6
|
|
|
8
7
|
export const _package = new DG.Package();
|
|
@@ -10,10 +9,10 @@ export const _package = new DG.Package();
|
|
|
10
9
|
import {WebLogo} from '@datagrok-libraries/bio/src/viewers/web-logo';
|
|
11
10
|
import {VdRegionsViewer} from './viewers/vd-regions-viewer';
|
|
12
11
|
import {runKalign, testMSAEnoughMemory} from './utils/multiple-sequence-alignment';
|
|
12
|
+
import {convert} from './utils/convert';
|
|
13
13
|
import {TableView} from 'datagrok-api/dg';
|
|
14
|
-
import {
|
|
15
|
-
import {
|
|
16
|
-
import {Aminoacids} from '@datagrok-libraries/bio/src/aminoacids';
|
|
14
|
+
import { getEmbeddingColsNames, sequenceSpace } from './utils/sequence-space';
|
|
15
|
+
import { AvailableMetrics } from '@datagrok-libraries/ml/src/typed-metrics';
|
|
17
16
|
|
|
18
17
|
//name: sequenceAlignment
|
|
19
18
|
//input: string alignType {choices: ['Local alignment', 'Global alignment']}
|
|
@@ -60,12 +59,23 @@ export async function activityCliffs(df: DG.DataFrame, smiles: DG.Column, activi
|
|
|
60
59
|
//top-menu: Bio | Sequence Space...
|
|
61
60
|
//name: Sequence Space
|
|
62
61
|
//input: dataframe table
|
|
63
|
-
//input: column
|
|
64
|
-
//input: string methodName { choices:["UMAP", "t-SNE", "SPE"
|
|
65
|
-
//input: string similarityMetric { choices:["
|
|
62
|
+
//input: column macroMolecule { semType: Macromolecule }
|
|
63
|
+
//input: string methodName { choices:["UMAP", "t-SNE", "SPE"] }
|
|
64
|
+
//input: string similarityMetric { choices:["Levenshtein", "Tanimoto"] }
|
|
66
65
|
//input: bool plotEmbeddings = true
|
|
67
|
-
export async function
|
|
68
|
-
similarityMetric: string = '
|
|
66
|
+
export async function sequenceSpaceTopMenu(table: DG.DataFrame, macroMolecule: DG.Column, methodName: string,
|
|
67
|
+
similarityMetric: string = 'Levenshtein', plotEmbeddings: boolean) : Promise<void> {
|
|
68
|
+
const embedColsNames = getEmbeddingColsNames(table);
|
|
69
|
+
const sequenceSpaceRes = await sequenceSpace(macroMolecule, methodName, similarityMetric, embedColsNames);
|
|
70
|
+
const embeddings = sequenceSpaceRes.coordinates;
|
|
71
|
+
for (const col of embeddings)
|
|
72
|
+
table.columns.add(col);
|
|
73
|
+
if (plotEmbeddings) {
|
|
74
|
+
for (let v of grok.shell.views) {
|
|
75
|
+
if (v.name === table.name)
|
|
76
|
+
(v as DG.TableView).scatterPlot({x: embedColsNames[0], y: embedColsNames[1]});
|
|
77
|
+
}
|
|
78
|
+
}
|
|
69
79
|
};
|
|
70
80
|
|
|
71
81
|
//top-menu: Bio | MSA...
|
|
@@ -97,44 +107,50 @@ export async function compositionAnalysis(): Promise<void> {
|
|
|
97
107
|
}
|
|
98
108
|
}
|
|
99
109
|
|
|
110
|
+
// helper function for importFasta
|
|
111
|
+
function parseMacromolecule(
|
|
112
|
+
fileContent: string,
|
|
113
|
+
startOfSequence: number,
|
|
114
|
+
endOfSequence: number
|
|
115
|
+
): string {
|
|
116
|
+
const seq = fileContent.slice(startOfSequence, endOfSequence);
|
|
117
|
+
const seqArray = seq.split(/\s/);
|
|
118
|
+
return seqArray.join('');
|
|
119
|
+
}
|
|
120
|
+
|
|
100
121
|
//name: importFasta
|
|
101
122
|
//description: Opens FASTA file
|
|
102
123
|
//tags: file-handler
|
|
103
124
|
//meta.ext: fasta, fna, ffn, faa, frn, fa
|
|
104
125
|
//input: string content
|
|
105
126
|
//output: list tables
|
|
106
|
-
export function importFasta(
|
|
107
|
-
const regex = /^>(.*)$/gm;
|
|
108
|
-
const
|
|
109
|
-
const
|
|
110
|
-
let
|
|
111
|
-
let match;
|
|
112
|
-
while (match = regex.exec(
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
127
|
+
export function importFasta(fileContent: string): DG.DataFrame [] {
|
|
128
|
+
const regex = /^>(.*)$/gm; // match the line starting with >
|
|
129
|
+
const descriptionsArray = [];
|
|
130
|
+
const sequencesArray: string[] = [];
|
|
131
|
+
let startOfSequence = 0;
|
|
132
|
+
let match; // match.index is the beginning of the matched line
|
|
133
|
+
while (match = regex.exec(fileContent)) {
|
|
134
|
+
const description = fileContent.substring(match.index + 1, regex.lastIndex);
|
|
135
|
+
descriptionsArray.push(description);
|
|
136
|
+
if (startOfSequence !== 0)
|
|
137
|
+
sequencesArray.push(parseMacromolecule(fileContent, startOfSequence, match.index));
|
|
138
|
+
startOfSequence = regex.lastIndex + 1;
|
|
117
139
|
}
|
|
118
|
-
|
|
119
|
-
const
|
|
120
|
-
const sequenceCol = DG.Column.fromStrings('sequence',
|
|
121
|
-
|
|
122
|
-
const stats: { freq: { [m: string]: number }, sameLength: boolean } = WebLogo.getStats(sequenceCol, 5, WebLogo.splitterAsFasta);
|
|
123
|
-
const seqType = stats.sameLength ? 'SEQ.MSA' : 'SEQ';
|
|
124
|
-
const alphabetCandidates: [string, Set<string>][] = [
|
|
125
|
-
['NT', new Set(Object.keys(Nucleotides.Names)),],
|
|
126
|
-
['PT', new Set(Object.keys(Aminoacids.Names)),],
|
|
127
|
-
];
|
|
128
|
-
// Calculate likelihoods for alphabet_candidates
|
|
129
|
-
const alphabetCandidatesSim: number[] = alphabetCandidates.map(
|
|
130
|
-
(c) => WebLogo.getAlphabetSimilarity(stats.freq, c[1]));
|
|
131
|
-
const maxCos = Math.max(...alphabetCandidatesSim);
|
|
132
|
-
const alphabet = maxCos > 0.65 ? alphabetCandidates[alphabetCandidatesSim.indexOf(maxCos)][0] : 'UN';
|
|
133
|
-
sequenceCol.semType = mmSemType;
|
|
134
|
-
sequenceCol.setTag(DG.TAGS.UNITS, `fasta:${seqType}:${alphabet}`);
|
|
135
|
-
|
|
140
|
+
sequencesArray.push(parseMacromolecule(fileContent, startOfSequence, -1));
|
|
141
|
+
const descriptionsArrayCol = DG.Column.fromStrings('description', descriptionsArray);
|
|
142
|
+
const sequenceCol = DG.Column.fromStrings('sequence', sequencesArray);
|
|
143
|
+
sequenceCol.semType = 'Macromolecule';
|
|
136
144
|
return [DG.DataFrame.fromColumns([
|
|
137
|
-
|
|
145
|
+
descriptionsArrayCol,
|
|
138
146
|
sequenceCol,
|
|
139
147
|
])];
|
|
140
148
|
}
|
|
149
|
+
|
|
150
|
+
//name: Bio | Convert
|
|
151
|
+
//friendly-name: Bio | Convert
|
|
152
|
+
//tags: panel, bio
|
|
153
|
+
//input: column col {semType: Macromolecule}
|
|
154
|
+
export function convertPanel(col: DG.Column): void {
|
|
155
|
+
convert(col);
|
|
156
|
+
}
|
package/src/tests/sequence-space-test.ts
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import {before, category, test, expect} from '@datagrok-libraries/utils/src/test';
|
|
2
|
+
import * as DG from "datagrok-api/dg";
|
|
3
|
+
import { sequenceSpace } from '../utils/sequence-space';
|
|
4
|
+
import { readDataframe } from './utils';
|
|
5
|
+
//import * as grok from 'datagrok-api/grok';
|
|
6
|
+
|
|
7
|
+
category('sequenceSpace', async () => {
|
|
8
|
+
|
|
9
|
+
let testFastaDf: DG.DataFrame;
|
|
10
|
+
|
|
11
|
+
before(async () => {
|
|
12
|
+
//@ts-ignore
|
|
13
|
+
testFastaDf = await readDataframe('sample_FASTA.csv');
|
|
14
|
+
});
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
test('sequenceSpaceOpens', async () => {
|
|
18
|
+
//@ts-ignore
|
|
19
|
+
const res = await sequenceSpace(testFastaDf.col('Sequence')!, 't-SNE', 'Levenshtein', ['Embed_X', 'Embed_Y']);
|
|
20
|
+
expect(res.coordinates != undefined, true);
|
|
21
|
+
expect(res.distance != undefined, true);
|
|
22
|
+
});
|
|
23
|
+
|
|
24
|
+
});
|
package/src/tests/utils.ts
CHANGED
|
@@ -1,7 +1,20 @@
|
|
|
1
1
|
import * as DG from 'datagrok-api/dg';
|
|
2
|
-
|
|
2
|
+
import * as grok from "datagrok-api/grok";
|
|
3
3
|
import {expect} from '@datagrok-libraries/utils/src/test';
|
|
4
4
|
import {runKalign} from '../utils/multiple-sequence-alignment';
|
|
5
|
+
import { _package} from '../package-test';
|
|
6
|
+
|
|
7
|
+
export async function loadFileAsText(name: string): Promise<string> {
|
|
8
|
+
return await _package.files.readAsText(name);
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
export async function readDataframe(tableName: string): Promise<DG.DataFrame> {
|
|
12
|
+
const file = await loadFileAsText(tableName);
|
|
13
|
+
const df = DG.DataFrame.fromCsv(file);
|
|
14
|
+
df.name = tableName.replace('.csv', '');
|
|
15
|
+
return df;
|
|
16
|
+
}
|
|
17
|
+
|
|
5
18
|
|
|
6
19
|
/**
|
|
7
20
|
* Tests if a table has non zero rows and columns.
|
package/src/utils/convert.ts
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import * as DG from 'datagrok-api/dg';
|
|
2
|
+
import * as ui from 'datagrok-api/ui';
|
|
3
|
+
|
|
4
|
+
export function convert(col: DG.Column): void {
|
|
5
|
+
|
|
6
|
+
const current = col.tags[DG.TAGS.UNITS];
|
|
7
|
+
//TODO: read all notations
|
|
8
|
+
const notations = ['fasta:SEQ:NT', 'fasta:SEQ:PT', 'fasta:SEQ.MSA:NT', 'fasta:SEQ.MSA:PT', 'HELM'];
|
|
9
|
+
const choices = ui.choiceInput("convert to", "", notations.filter(e => e !== current));
|
|
10
|
+
|
|
11
|
+
ui.dialog('Convert sequence')
|
|
12
|
+
.add(
|
|
13
|
+
ui.div([
|
|
14
|
+
ui.h1('current notation'),
|
|
15
|
+
ui.div(current),
|
|
16
|
+
choices.root
|
|
17
|
+
])
|
|
18
|
+
)
|
|
19
|
+
.onOK(() => {
|
|
20
|
+
//TODO: create new converted column
|
|
21
|
+
//col.dataFrame.columns.add();
|
|
22
|
+
})
|
|
23
|
+
.show();
|
|
24
|
+
}
|
package/src/utils/multiple-sequence-alignment.ts
CHANGED
|
@@ -26,37 +26,6 @@ function _fastaToStrings(fasta: string): string[] {
|
|
|
26
26
|
return fasta.replace(/>sample\d+(\r\n|\r|\n)/g, '').split('\n');
|
|
27
27
|
}
|
|
28
28
|
|
|
29
|
-
/**
|
|
30
|
-
* Converts aligned sequence to semantic type format.
|
|
31
|
-
*
|
|
32
|
-
* @param {string} seq Source sequence.
|
|
33
|
-
* @return {string} Formatted sequence.
|
|
34
|
-
*/
|
|
35
|
-
function _castAligned(seq: string): string {
|
|
36
|
-
let delimited = '';
|
|
37
|
-
|
|
38
|
-
for (const char of seq)
|
|
39
|
-
delimited += char == '-' ? char : `-${char}`;
|
|
40
|
-
|
|
41
|
-
return delimited;
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
/**
|
|
45
|
-
* Formats a batch of sequences to correspond the semantic type.
|
|
46
|
-
*
|
|
47
|
-
* @param {string[]} alignment List of aligned sequences.
|
|
48
|
-
* @return {string[]} Formatted sequences.
|
|
49
|
-
*/
|
|
50
|
-
function _stringsToAligned(alignment: string[]): string[] {
|
|
51
|
-
const nItems = alignment.length;
|
|
52
|
-
const aligned = new Array<string>(nItems);
|
|
53
|
-
|
|
54
|
-
for (let i = 0; i < nItems; ++i)
|
|
55
|
-
aligned[i] = _castAligned(alignment[i]);
|
|
56
|
-
|
|
57
|
-
return aligned;
|
|
58
|
-
}
|
|
59
|
-
|
|
60
29
|
/**
|
|
61
30
|
* Runs Aioli environment with kalign tool.
|
|
62
31
|
*
|
|
@@ -86,7 +55,7 @@ export async function runKalign(col: DG.Column, isAligned = false) : Promise<DG.
|
|
|
86
55
|
console.warn(output);
|
|
87
56
|
|
|
88
57
|
const aligned = _fastaToStrings(buf).slice(0, sequences.length);
|
|
89
|
-
const alignedCol = DG.Column.fromStrings(`msa(${col.name})`, _stringsToAligned(aligned));
|
|
58
|
+
const alignedCol = DG.Column.fromStrings(`msa(${col.name})`, aligned);
|
|
90
59
|
alignedCol.setTag(DG.TAGS.UNITS, '');
|
|
91
60
|
alignedCol.semType = C.SEM_TYPES.Macro_Molecule;
|
|
92
61
|
return alignedCol;
|
package/src/utils/sequence-space.ts
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import * as DG from 'datagrok-api/dg';
|
|
2
|
+
import { AvailableMetrics } from '@datagrok-libraries/ml/src/typed-metrics';
|
|
3
|
+
import {reduceDimensinalityWithNormalization} from '@datagrok-libraries/ml/src/sequence-space';
|
|
4
|
+
import {BitArrayMetrics, StringMetrics} from '@datagrok-libraries/ml/src/typed-metrics';
|
|
5
|
+
import { Matrix } from '@datagrok-libraries/utils/src/type-declarations';
|
|
6
|
+
import BitArray from '@datagrok-libraries/utils/src/bit-array';
|
|
7
|
+
|
|
8
|
+
export interface ISequenceSpaceResult {
|
|
9
|
+
distance: Matrix;
|
|
10
|
+
coordinates: DG.ColumnList;
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
export async function sequenceSpace(molColumn: DG.Column, methodName: string, similarityMetric: string,
|
|
14
|
+
axes: string[], options?: any): Promise<ISequenceSpaceResult> {
|
|
15
|
+
let preparedData: any;
|
|
16
|
+
if (!(molColumn!.tags[DG.TAGS.UNITS] === 'HELM')) {
|
|
17
|
+
const sep = molColumn.getTag('separator');
|
|
18
|
+
const sepFinal = sep ? sep === '.' ? '\\\.' : sep: '-';
|
|
19
|
+
var regex = new RegExp(sepFinal, "g");
|
|
20
|
+
if (Object.keys(AvailableMetrics['String']).includes(similarityMetric)) {
|
|
21
|
+
preparedData = molColumn.toList().map((v) => v.replace(regex, '')) as string[];
|
|
22
|
+
} else {
|
|
23
|
+
preparedData = molColumn.toList().map((v) => v.replace(regex, '')) as string[];
|
|
24
|
+
}
|
|
25
|
+
} else {
|
|
26
|
+
preparedData = molColumn.toList();
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
const sequenceSpaceResult = await reduceDimensinalityWithNormalization(
|
|
30
|
+
preparedData,
|
|
31
|
+
methodName,
|
|
32
|
+
similarityMetric as StringMetrics|BitArrayMetrics,
|
|
33
|
+
options);
|
|
34
|
+
const cols: DG.Column[] = axes.map((name, index) => DG.Column.fromFloat32Array(name, sequenceSpaceResult.embedding[index]))
|
|
35
|
+
return {distance: sequenceSpaceResult.distance, coordinates: new DG.ColumnList(cols)};
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
export function getEmbeddingColsNames(df: DG.DataFrame){
|
|
40
|
+
const axes = ['Embed_X', 'Embed_Y'];
|
|
41
|
+
const colNameInd = df.columns.names().filter((it) => it.includes(axes[0])).length + 1;
|
|
42
|
+
return axes.map((it) => `${it}_${colNameInd}`);
|
|
43
|
+
}
|