@datagrok/bio 1.4.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "@datagrok/bio",
3
3
  "beta": false,
4
4
  "friendlyName": "Bio",
5
- "version": "1.4.0",
5
+ "version": "1.4.1",
6
6
  "description": "Bio is a [package](https://datagrok.ai/help/develop/develop#packages) for the [Datagrok](https://datagrok.ai) platform",
7
7
  "repository": {
8
8
  "type": "git",
@@ -13,6 +13,7 @@
13
13
  "@biowasm/aioli": ">=2.4.0",
14
14
  "@datagrok-libraries/bio": "^2.2.0",
15
15
  "@datagrok-libraries/utils": "^0.4.2",
16
+ "@datagrok-libraries/ml": "^2.0.2",
16
17
  "cash-dom": "latest",
17
18
  "datagrok-api": "^1.4.11",
18
19
  "dayjs": "latest",
@@ -35,7 +36,8 @@
35
36
  "scripts": {
36
37
  "link-api": "npm link datagrok-api",
37
38
  "link-bio": "npm link @datagrok-libraries/bio",
38
- "link-all": "npm link datagrok-api @datagrok-libraries/bio",
39
+ "link-ml": "npm link @datagrok-libraries/ml",
40
+ "link-all": "npm link datagrok-api @datagrok-libraries/utils @datagrok-libraries/bio @datagrok-libraries/ml",
39
41
  "debug-sequences1": "grok publish --rebuild",
40
42
  "release-sequences1": "grok publish --rebuild --release",
41
43
  "build-sequences1": "webpack",
@@ -48,7 +50,8 @@
48
50
  "lint": "eslint \"./src/**/*.ts\"",
49
51
  "lint-fix": "eslint \"./src/**/*.ts\" --fix",
50
52
  "test": "jest",
51
- "test-local": "set HOST=localhost && jest"
53
+ "test-local": "set HOST=localhost && jest",
54
+ "build-bio-local": "npm --prefix ./../../js-api run build && npm --prefix ./../../libraries/utils run build && npm --prefix ./../../libraries/ml run build && npm run build && npm --prefix ./../../libraries/bio run build && npm run build"
52
55
  },
53
56
  "canEdit": [
54
57
  "Developers"
@@ -6,8 +6,9 @@ import './tests/WebLogo-test';
6
6
  import './tests/Palettes-test';
7
7
  import './tests/detectors-test';
8
8
  import './tests/msa-tests';
9
+ import './tests/sequence-space-test';
9
10
 
10
- export const _packageTest = new DG.Package();
11
+ export const _package = new DG.Package();
11
12
  export {tests};
12
13
 
13
14
  /** For the 'test' function argument names are fixed as 'category' and 'test' because of way it is called. */
package/src/package.ts CHANGED
@@ -2,7 +2,6 @@
2
2
  import * as grok from 'datagrok-api/grok';
3
3
  import * as ui from 'datagrok-api/ui';
4
4
  import * as DG from 'datagrok-api/dg';
5
-
6
5
  import {SequenceAlignment, Aligned} from './seq_align';
7
6
 
8
7
  export const _package = new DG.Package();
@@ -10,10 +9,10 @@ export const _package = new DG.Package();
10
9
  import {WebLogo} from '@datagrok-libraries/bio/src/viewers/web-logo';
11
10
  import {VdRegionsViewer} from './viewers/vd-regions-viewer';
12
11
  import {runKalign, testMSAEnoughMemory} from './utils/multiple-sequence-alignment';
12
+ import {convert} from './utils/convert';
13
13
  import {TableView} from 'datagrok-api/dg';
14
- import {mmSemType} from './const';
15
- import {Nucleotides} from '@datagrok-libraries/bio/src/nucleotides';
16
- import {Aminoacids} from '@datagrok-libraries/bio/src/aminoacids';
14
+ import { getEmbeddingColsNames, sequenceSpace } from './utils/sequence-space';
15
+ import { AvailableMetrics } from '@datagrok-libraries/ml/src/typed-metrics';
17
16
 
18
17
  //name: sequenceAlignment
19
18
  //input: string alignType {choices: ['Local alignment', 'Global alignment']}
@@ -60,12 +59,23 @@ export async function activityCliffs(df: DG.DataFrame, smiles: DG.Column, activi
60
59
  //top-menu: Bio | Sequence Space...
61
60
  //name: Sequence Space
62
61
  //input: dataframe table
63
- //input: column smiles { semType: Macromolecule }
64
- //input: string methodName { choices:["UMAP", "t-SNE", "SPE", "pSPE", "OriginalSPE"] }
65
- //input: string similarityMetric { choices:["Tanimoto", "Asymmetric", "Cosine", "Sokal"] }
62
+ //input: column macroMolecule { semType: Macromolecule }
63
+ //input: string methodName { choices:["UMAP", "t-SNE", "SPE"] }
64
+ //input: string similarityMetric { choices:["Levenshtein", "Tanimoto"] }
66
65
  //input: bool plotEmbeddings = true
67
- export async function chemSpaceTopMenu(table: DG.DataFrame, smiles: DG.Column, methodName: string,
68
- similarityMetric: string = 'Tanimoto', plotEmbeddings: boolean): Promise<void> {
66
+ export async function sequenceSpaceTopMenu(table: DG.DataFrame, macroMolecule: DG.Column, methodName: string,
67
+ similarityMetric: string = 'Levenshtein', plotEmbeddings: boolean) : Promise<void> {
68
+ const embedColsNames = getEmbeddingColsNames(table);
69
+ const sequenceSpaceRes = await sequenceSpace(macroMolecule, methodName, similarityMetric, embedColsNames);
70
+ const embeddings = sequenceSpaceRes.coordinates;
71
+ for (const col of embeddings)
72
+ table.columns.add(col);
73
+ if (plotEmbeddings) {
74
+ for (let v of grok.shell.views) {
75
+ if (v.name === table.name)
76
+ (v as DG.TableView).scatterPlot({x: embedColsNames[0], y: embedColsNames[1]});
77
+ }
78
+ }
69
79
  };
70
80
 
71
81
  //top-menu: Bio | MSA...
@@ -97,44 +107,50 @@ export async function compositionAnalysis(): Promise<void> {
97
107
  }
98
108
  }
99
109
 
110
+ // helper function for importFasta
111
+ function parseMacromolecule(
112
+ fileContent: string,
113
+ startOfSequence: number,
114
+ endOfSequence: number
115
+ ): string {
116
+ const seq = fileContent.slice(startOfSequence, endOfSequence);
117
+ const seqArray = seq.split(/\s/);
118
+ return seqArray.join('');
119
+ }
120
+
100
121
  //name: importFasta
101
122
  //description: Opens FASTA file
102
123
  //tags: file-handler
103
124
  //meta.ext: fasta, fna, ffn, faa, frn, fa
104
125
  //input: string content
105
126
  //output: list tables
106
- export function importFasta(content: string): DG.DataFrame [] {
107
- const regex = /^>(.*)$/gm;
108
- const descriptions = [];
109
- const sequences = [];
110
- let index = 0;
111
- let match;
112
- while (match = regex.exec(content)) {
113
- descriptions.push(content.substring(match.index + 1, regex.lastIndex));
114
- if (index !== 0)
115
- sequences.push(content.substring(index, regex.lastIndex));
116
- index = regex.lastIndex + 1;
127
+ export function importFasta(fileContent: string): DG.DataFrame [] {
128
+ const regex = /^>(.*)$/gm; // match the line starting with >
129
+ const descriptionsArray = [];
130
+ const sequencesArray: string[] = [];
131
+ let startOfSequence = 0;
132
+ let match; // match.index is the beginning of the matched line
133
+ while (match = regex.exec(fileContent)) {
134
+ const description = fileContent.substring(match.index + 1, regex.lastIndex);
135
+ descriptionsArray.push(description);
136
+ if (startOfSequence !== 0)
137
+ sequencesArray.push(parseMacromolecule(fileContent, startOfSequence, match.index));
138
+ startOfSequence = regex.lastIndex + 1;
117
139
  }
118
- sequences.push(content.substring(index));
119
- const descriptionsCol = DG.Column.fromStrings('description', descriptions);
120
- const sequenceCol = DG.Column.fromStrings('sequence', sequences);
121
-
122
- const stats: { freq: { [m: string]: number }, sameLength: boolean } = WebLogo.getStats(sequenceCol, 5, WebLogo.splitterAsFasta);
123
- const seqType = stats.sameLength ? 'SEQ.MSA' : 'SEQ';
124
- const alphabetCandidates: [string, Set<string>][] = [
125
- ['NT', new Set(Object.keys(Nucleotides.Names)),],
126
- ['PT', new Set(Object.keys(Aminoacids.Names)),],
127
- ];
128
- // Calculate likelihoods for alphabet_candidates
129
- const alphabetCandidatesSim: number[] = alphabetCandidates.map(
130
- (c) => WebLogo.getAlphabetSimilarity(stats.freq, c[1]));
131
- const maxCos = Math.max(...alphabetCandidatesSim);
132
- const alphabet = maxCos > 0.65 ? alphabetCandidates[alphabetCandidatesSim.indexOf(maxCos)][0] : 'UN';
133
- sequenceCol.semType = mmSemType;
134
- sequenceCol.setTag(DG.TAGS.UNITS, `fasta:${seqType}:${alphabet}`);
135
-
140
+ sequencesArray.push(parseMacromolecule(fileContent, startOfSequence, -1));
141
+ const descriptionsArrayCol = DG.Column.fromStrings('description', descriptionsArray);
142
+ const sequenceCol = DG.Column.fromStrings('sequence', sequencesArray);
143
+ sequenceCol.semType = 'Macromolecule';
136
144
  return [DG.DataFrame.fromColumns([
137
- descriptionsCol,
145
+ descriptionsArrayCol,
138
146
  sequenceCol,
139
147
  ])];
140
148
  }
149
+
150
+ //name: Bio | Convert
151
+ //friendly-name: Bio | Convert
152
+ //tags: panel, bio
153
+ //input: column col {semType: Macromolecule}
154
+ export function convertPanel(col: DG.Column): void {
155
+ convert(col);
156
+ }
@@ -0,0 +1,24 @@
1
+ import {before, category, test, expect} from '@datagrok-libraries/utils/src/test';
2
+ import * as DG from "datagrok-api/dg";
3
+ import { sequenceSpace } from '../utils/sequence-space';
4
+ import { readDataframe } from './utils';
5
+ //import * as grok from 'datagrok-api/grok';
6
+
7
+ category('sequenceSpace', async () => {
8
+
9
+ let testFastaDf: DG.DataFrame;
10
+
11
+ before(async () => {
12
+ //@ts-ignore
13
+ testFastaDf = await readDataframe('sample_FASTA.csv');
14
+ });
15
+
16
+
17
+ test('sequenceSpaceOpens', async () => {
18
+ //@ts-ignore
19
+ const res = await sequenceSpace(testFastaDf.col('Sequence')!, 't-SNE', 'Levenshtein', ['Embed_X', 'Embed_Y']);
20
+ expect(res.coordinates != undefined, true);
21
+ expect(res.distance != undefined, true);
22
+ });
23
+
24
+ });
@@ -1,7 +1,20 @@
1
1
  import * as DG from 'datagrok-api/dg';
2
-
2
+ import * as grok from "datagrok-api/grok";
3
3
  import {expect} from '@datagrok-libraries/utils/src/test';
4
4
  import {runKalign} from '../utils/multiple-sequence-alignment';
5
+ import { _package} from '../package-test';
6
+
7
+ export async function loadFileAsText(name: string): Promise<string> {
8
+ return await _package.files.readAsText(name);
9
+ }
10
+
11
+ export async function readDataframe(tableName: string): Promise<DG.DataFrame> {
12
+ const file = await loadFileAsText(tableName);
13
+ const df = DG.DataFrame.fromCsv(file);
14
+ df.name = tableName.replace('.csv', '');
15
+ return df;
16
+ }
17
+
5
18
 
6
19
  /**
7
20
  * Tests if a table has non zero rows and columns.
@@ -0,0 +1,24 @@
1
+ import * as DG from 'datagrok-api/dg';
2
+ import * as ui from 'datagrok-api/ui';
3
+
4
+ export function convert(col: DG.Column): void {
5
+
6
+ const current = col.tags[DG.TAGS.UNITS];
7
+ //TODO: read all notations
8
+ const notations = ['fasta:SEQ:NT', 'fasta:SEQ:PT', 'fasta:SEQ.MSA:NT', 'fasta:SEQ.MSA:PT', 'HELM'];
9
+ const choices = ui.choiceInput("convert to", "", notations.filter(e => e !== current));
10
+
11
+ ui.dialog('Convert sequence')
12
+ .add(
13
+ ui.div([
14
+ ui.h1('current notation'),
15
+ ui.div(current),
16
+ choices.root
17
+ ])
18
+ )
19
+ .onOK(() => {
20
+ //TODO: create new converted column
21
+ //col.dataFrame.columns.add();
22
+ })
23
+ .show();
24
+ }
@@ -26,37 +26,6 @@ function _fastaToStrings(fasta: string): string[] {
26
26
  return fasta.replace(/>sample\d+(\r\n|\r|\n)/g, '').split('\n');
27
27
  }
28
28
 
29
- /**
30
- * Converts aligned sequence to semantic type format.
31
- *
32
- * @param {string} seq Source sequence.
33
- * @return {string} Formatted sequence.
34
- */
35
- function _castAligned(seq: string): string {
36
- let delimited = '';
37
-
38
- for (const char of seq)
39
- delimited += char == '-' ? char : `-${char}`;
40
-
41
- return delimited;
42
- }
43
-
44
- /**
45
- * Formats a batch of sequences to correspond the semantic type.
46
- *
47
- * @param {string[]} alignment List of aligned sequences.
48
- * @return {string[]} Formatted sequences.
49
- */
50
- function _stringsToAligned(alignment: string[]): string[] {
51
- const nItems = alignment.length;
52
- const aligned = new Array<string>(nItems);
53
-
54
- for (let i = 0; i < nItems; ++i)
55
- aligned[i] = _castAligned(alignment[i]);
56
-
57
- return aligned;
58
- }
59
-
60
29
  /**
61
30
  * Runs Aioli environment with kalign tool.
62
31
  *
@@ -86,7 +55,7 @@ export async function runKalign(col: DG.Column, isAligned = false) : Promise<DG.
86
55
  console.warn(output);
87
56
 
88
57
  const aligned = _fastaToStrings(buf).slice(0, sequences.length);
89
- const alignedCol = DG.Column.fromStrings(`msa(${col.name})`, _stringsToAligned(aligned));
58
+ const alignedCol = DG.Column.fromStrings(`msa(${col.name})`, aligned);
90
59
  alignedCol.setTag(DG.TAGS.UNITS, '');
91
60
  alignedCol.semType = C.SEM_TYPES.Macro_Molecule;
92
61
  return alignedCol;
@@ -0,0 +1,43 @@
1
+ import * as DG from 'datagrok-api/dg';
2
+ import { AvailableMetrics } from '@datagrok-libraries/ml/src/typed-metrics';
3
+ import {reduceDimensinalityWithNormalization} from '@datagrok-libraries/ml/src/sequence-space';
4
+ import {BitArrayMetrics, StringMetrics} from '@datagrok-libraries/ml/src/typed-metrics';
5
+ import { Matrix } from '@datagrok-libraries/utils/src/type-declarations';
6
+ import BitArray from '@datagrok-libraries/utils/src/bit-array';
7
+
8
+ export interface ISequenceSpaceResult {
9
+ distance: Matrix;
10
+ coordinates: DG.ColumnList;
11
+ }
12
+
13
+ export async function sequenceSpace(molColumn: DG.Column, methodName: string, similarityMetric: string,
14
+ axes: string[], options?: any): Promise<ISequenceSpaceResult> {
15
+ let preparedData: any;
16
+ if (!(molColumn!.tags[DG.TAGS.UNITS] === 'HELM')) {
17
+ const sep = molColumn.getTag('separator');
18
+ const sepFinal = sep ? sep === '.' ? '\\\.' : sep: '-';
19
+ var regex = new RegExp(sepFinal, "g");
20
+ if (Object.keys(AvailableMetrics['String']).includes(similarityMetric)) {
21
+ preparedData = molColumn.toList().map((v) => v.replace(regex, '')) as string[];
22
+ } else {
23
+ preparedData = molColumn.toList().map((v) => v.replace(regex, '')) as string[];
24
+ }
25
+ } else {
26
+ preparedData = molColumn.toList();
27
+ }
28
+
29
+ const sequenceSpaceResult = await reduceDimensinalityWithNormalization(
30
+ preparedData,
31
+ methodName,
32
+ similarityMetric as StringMetrics|BitArrayMetrics,
33
+ options);
34
+ const cols: DG.Column[] = axes.map((name, index) => DG.Column.fromFloat32Array(name, sequenceSpaceResult.embedding[index]))
35
+ return {distance: sequenceSpaceResult.distance, coordinates: new DG.ColumnList(cols)};
36
+ }
37
+
38
+
39
+ export function getEmbeddingColsNames(df: DG.DataFrame){
40
+ const axes = ['Embed_X', 'Embed_Y'];
41
+ const colNameInd = df.columns.names().filter((it) => it.includes(axes[0])).length + 1;
42
+ return axes.map((it) => `${it}_${colNameInd}`);
43
+ }