@datagrok/bio 2.11.3 → 2.11.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -5,7 +5,7 @@
5
5
  "name": "Leonid Stolbov",
6
6
  "email": "lstolbov@datagrok.ai"
7
7
  },
8
- "version": "2.11.3",
8
+ "version": "2.11.5",
9
9
  "description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
10
10
  "repository": {
11
11
  "type": "git",
@@ -34,9 +34,9 @@
34
34
  ],
35
35
  "dependencies": {
36
36
  "@biowasm/aioli": "^3.1.0",
37
- "@datagrok-libraries/bio": "^5.39.0",
37
+ "@datagrok-libraries/bio": "^5.39.1",
38
38
  "@datagrok-libraries/chem-meta": "^1.0.1",
39
- "@datagrok-libraries/ml": "^6.3.51",
39
+ "@datagrok-libraries/ml": "^6.3.53",
40
40
  "@datagrok-libraries/tutorials": "^1.3.6",
41
41
  "@datagrok-libraries/utils": "^4.0.17",
42
42
  "cash-dom": "^8.0.0",
@@ -55,21 +55,19 @@ export async function sequenceSpaceByFingerprints(spaceParams: ISequenceSpacePar
55
55
  return result;
56
56
  }
57
57
 
58
- export async function getSequenceSpace(spaceParams: ISequenceSpaceParams,
59
- progressFunc?: (epochNum: number, epochsLength: number, embedding: number[][]) => void
60
- ): Promise<ISequenceSpaceResult> {
61
- const ncUH = UnitsHandler.getOrCreate(spaceParams.seqCol);
62
-
63
- //const distanceFName = ncUH.isMsa() ? MmDistanceFunctionsNames.HAMMING : MmDistanceFunctionsNames.LEVENSHTEIN;
64
- const seqList = spaceParams.seqCol.toList();
65
-
58
+ export async function getEncodedSeqSpaceCol(
59
+ seqCol: DG.Column, similarityMetric: BitArrayMetrics | MmDistanceFunctionsNames
60
+ ): Promise<{seqList:string[], options: {[_:string]: any}}> {
61
+ // encodes sequences using utf characters to also support multichar and non fasta sequences
62
+ const ncUH = UnitsHandler.getOrCreate(seqCol);
63
+ const seqList = seqCol.toList();
66
64
  const splitter = ncUH.getSplitter();
67
65
  const seqColLength = seqList.length;
68
66
  let charCodeCounter = 36;
69
67
  const charCodeMap = new Map<string, string>();
70
68
  for (let i = 0; i < seqColLength; i++) {
71
69
  const seq = seqList[i];
72
- if (seqList[i] === null || spaceParams.seqCol.isNone(i)) {
70
+ if (seqList[i] === null || seqCol.isNone(i)) {
73
71
  seqList[i] = null;
74
72
  continue;
75
73
  }
@@ -84,8 +82,8 @@ export async function getSequenceSpace(spaceParams: ISequenceSpaceParams,
84
82
  seqList[i] += charCodeMap.get(char)!;
85
83
  }
86
84
  }
87
-
88
- if (spaceParams.similarityMetric === MmDistanceFunctionsNames.MONOMER_CHEMICAL_DISTANCE) {
85
+ let options = {};
86
+ if (similarityMetric === MmDistanceFunctionsNames.MONOMER_CHEMICAL_DISTANCE) {
89
87
  const monomers = Array.from(charCodeMap.keys());
90
88
  const monomerRes = await calculateMonomerSimilarity(monomers);
91
89
  // the substitution matrix contains similarity, but we need distances
@@ -98,10 +96,34 @@ export async function getSequenceSpace(spaceParams: ISequenceSpaceParams,
98
96
  Object.entries(monomerRes.alphabetIndexes).forEach(([key, value]) => {
99
97
  monomerHashToMatrixMap[charCodeMap.get(key)!] = value;
100
98
  });
101
- spaceParams.options.distanceFnArgs = {scoringMatrix: monomerRes.scoringMatrix,
99
+ // sets distance function args in place.
100
+ options = {scoringMatrix: monomerRes.scoringMatrix,
102
101
  alphabetIndexes: monomerHashToMatrixMap} satisfies mmDistanceFunctionArgs;
103
102
  }
103
+ // else if (similarityMetric === MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH) {
104
+ // const alphabetIndexes: any = {};
105
+ // let i = 0;
106
+ // charCodeMap.forEach((value) => {
107
+ // alphabetIndexes[value] = i;
108
+ // i++;
109
+ // });
110
+ // options = {alphabetIndexes};
111
+ // }
112
+ return {seqList, options};
113
+ }
114
+
115
+ export async function getSequenceSpace(spaceParams: ISequenceSpaceParams,
116
+ progressFunc?: (epochNum: number, epochsLength: number, embedding: number[][]) => void
117
+ ): Promise<ISequenceSpaceResult> {
118
+ const ncUH = UnitsHandler.getOrCreate(spaceParams.seqCol);
119
+ if (ncUH.isHelm())
120
+ return await sequenceSpaceByFingerprints(spaceParams);
121
+
122
+
123
+ const {seqList, options} = await getEncodedSeqSpaceCol(spaceParams.seqCol, spaceParams.similarityMetric);
104
124
 
125
+ spaceParams.options = spaceParams.options ?? {};
126
+ spaceParams.options.distanceFnArgs = options;
105
127
  const sequenceSpaceResult = await reduceDimensinalityWithNormalization(
106
128
  seqList,
107
129
  spaceParams.methodName,
@@ -13,6 +13,7 @@ import {getDendrogramService, IDendrogramService} from '@datagrok-libraries/bio/
13
13
  import {handleError} from './utils';
14
14
  import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
15
15
  import {DimReductionMethods} from '@datagrok-libraries/ml/src/reduce-dimensionality';
16
+ import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
16
17
 
17
18
  const dataFn: string = 'data/sample_FASTA_PT_activity.csv';
18
19
 
@@ -53,7 +54,7 @@ export async function demoBio01bUI() {
53
54
  .step('Find activity cliffs', async () => {
54
55
  activityCliffsViewer = (await activityCliffs(
55
56
  df, df.getCol('Sequence'), df.getCol('Activity'),
56
- 80, dimRedMethod)) as DG.ScatterPlotViewer;
57
+ 80, dimRedMethod, MmDistanceFunctionsNames.LEVENSHTEIN)) as DG.ScatterPlotViewer;
57
58
  view.dockManager.dock(activityCliffsViewer, DG.DOCK_TYPE.RIGHT, null, 'Activity Cliffs', 0.35);
58
59
 
59
60
  // Show grid viewer with the cliffs
package/src/package.ts CHANGED
@@ -11,10 +11,10 @@ import {DimReductionMethods, ITSNEOptions, IUMAPOptions} from '@datagrok-librari
11
11
  import {SequenceSpaceFunctionEditor} from '@datagrok-libraries/ml/src/functionEditors/seq-space-editor';
12
12
  import {ActivityCliffsFunctionEditor} from '@datagrok-libraries/ml/src/functionEditors/activity-cliffs-editor';
13
13
  import {
14
- ISequenceSpaceParams, getActivityCliffs, SequenceSpaceFunc
14
+ ISequenceSpaceParams, getActivityCliffs, SequenceSpaceFunc, CLIFFS_COL_ENCODE_FN
15
15
  } from '@datagrok-libraries/ml/src/viewers/activity-cliffs';
16
16
  import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
17
- import {BitArrayMetrics, BitArrayMetricsNames} from '@datagrok-libraries/ml/src/typed-metrics';
17
+ import {BitArrayMetrics} from '@datagrok-libraries/ml/src/typed-metrics';
18
18
  import {
19
19
  TAGS as bioTAGS, ALPHABET, NOTATION,
20
20
  } from '@datagrok-libraries/bio/src/utils/macromolecule';
@@ -35,14 +35,14 @@ import {
35
35
  import {VdRegionsViewer} from './viewers/vd-regions-viewer';
36
36
  import {SequenceAlignment} from './seq_align';
37
37
  import {
38
- ISequenceSpaceResult, getEmbeddingColsNames, getSequenceSpace, sequenceSpaceByFingerprints
38
+ ISequenceSpaceResult, getEmbeddingColsNames, getEncodedSeqSpaceCol, getSequenceSpace, sequenceSpaceByFingerprints
39
39
  } from './analysis/sequence-space';
40
40
  import {
41
41
  createLinesGrid, createPropPanelElement, createTooltipElement, getChemSimilaritiesMatrix,
42
42
  } from './analysis/sequence-activity-cliffs';
43
43
  import {SequenceSimilarityViewer} from './analysis/sequence-similarity-viewer';
44
44
  import {SequenceDiversityViewer} from './analysis/sequence-diversity-viewer';
45
- import {SubstructureSearchDialog} from './substructure-search/substructure-search';
45
+ import {MONOMERIC_COL_TAGS, SubstructureSearchDialog, invalidateMols} from './substructure-search/substructure-search';
46
46
  import {convert} from './utils/convert';
47
47
  import {getMacromoleculeColumnPropertyPanel} from './widgets/representations';
48
48
  import {saveAsFastaUI} from './utils/save-as-fasta';
@@ -79,6 +79,7 @@ import {errInfo} from './utils/err-info';
79
79
  import {SHOW_SCATTERPLOT_PROGRESS} from '@datagrok-libraries/ml/src/functionEditors/seq-space-base-editor';
80
80
  import {DIMENSIONALITY_REDUCER_TERMINATE_EVENT}
81
81
  from '@datagrok-libraries/ml/src/workers/dimensionality-reducing-worker-creator';
82
+ import BitArray from '@datagrok-libraries/utils/src/bit-array';
82
83
 
83
84
  export const _package = new BioPackage();
84
85
 
@@ -389,12 +390,14 @@ export async function getRegionTopMenu(
389
390
  //input: column activities
390
391
  //input: double similarity = 80 [Similarity cutoff]
391
392
  //input: string methodName { choices:["UMAP", "t-SNE"] }
393
+ //input: string similarityMetric { choices:["Hamming", "Levenshtein", "Monomer chemical distance"] }
392
394
  //input: object options {optional: true}
393
395
  //output: viewer result
394
396
  //editor: Bio:SeqActivityCliffsEditor
395
397
  export async function activityCliffs(df: DG.DataFrame, macroMolecule: DG.Column<string>, activities: DG.Column,
396
- similarity: number, methodName: DimReductionMethods, options?: (IUMAPOptions | ITSNEOptions) & Options,
397
- ): Promise<DG.Viewer | undefined> {
398
+ similarity: number, methodName: DimReductionMethods,
399
+ similarityMetric: MmDistanceFunctionsNames | BitArrayMetrics,
400
+ options?: (IUMAPOptions | ITSNEOptions) & Options): Promise<DG.Viewer | undefined> {
398
401
  if (!checkInputColumnUI(macroMolecule, 'Activity Cliffs'))
399
402
  return;
400
403
  const axesNames = getEmbeddingColsNames(df);
@@ -404,21 +407,27 @@ export async function activityCliffs(df: DG.DataFrame, macroMolecule: DG.Column<
404
407
  'separator': macroMolecule.getTag(bioTAGS.separator),
405
408
  'alphabet': macroMolecule.getTag(bioTAGS.alphabet),
406
409
  };
410
+ let cliffsEncodeFunction: (seqCol: DG.Column, similarityMetric: MmDistanceFunctionsNames | BitArrayMetrics) => any =
411
+ getEncodedSeqSpaceCol;
407
412
  const ncUH = UnitsHandler.getOrCreate(macroMolecule);
408
- let columnDistanceMetric: BitArrayMetrics | MmDistanceFunctionsNames = BitArrayMetricsNames.Tanimoto;
409
- let seqCol = macroMolecule;
410
- let sequenceSpaceFunc: SequenceSpaceFunc = sequenceSpaceByFingerprints;
411
- if (ncUH.isFasta() || (ncUH.isSeparator() && ncUH.alphabet && ncUH.alphabet !== ALPHABET.UN)) {
412
- if (ncUH.isFasta()) {
413
- columnDistanceMetric = ncUH.getDistanceFunctionName();
414
- } else {
415
- seqCol = ncUH.convert(NOTATION.FASTA);
416
- const uh = UnitsHandler.getOrCreate(seqCol);
417
- columnDistanceMetric = uh.getDistanceFunctionName();
418
- tags.units = NOTATION.FASTA;
419
- }
420
- sequenceSpaceFunc = getSequenceSpace;
413
+ const columnDistanceMetric: MmDistanceFunctionsNames | BitArrayMetrics = similarityMetric;
414
+ const seqCol = macroMolecule;
415
+
416
+ let sequenceSpaceFunc: SequenceSpaceFunc = getSequenceSpace;
417
+ if (ncUH.isHelm()) {
418
+ sequenceSpaceFunc = sequenceSpaceByFingerprints;
419
+ cliffsEncodeFunction = async (seqCol: DG.Column, similarityMetric: MmDistanceFunctionsNames | BitArrayMetrics) => {
420
+ await invalidateMols(seqCol, false);
421
+ const molecularCol = seqCol.temp[MONOMERIC_COL_TAGS.MONOMERIC_MOLS];
422
+ const fingerPrints: DG.Column =
423
+ await grok.functions.call('Chem:getMorganFingerprints', {molColumn: molecularCol});
424
+ const fingerPrintsBitArray = fingerPrints.toList().map((f: DG.BitSet) =>
425
+ BitArray.fromUint32Array(f.length, new Uint32Array(f.getBuffer().buffer)));
426
+ return {seqList: fingerPrintsBitArray, options: {}};
427
+ };
421
428
  }
429
+
430
+
422
431
  const runCliffs = async () => {
423
432
  const sp = await getActivityCliffs(
424
433
  df,
@@ -437,25 +446,26 @@ export async function activityCliffs(df: DG.DataFrame, macroMolecule: DG.Column<
437
446
  createTooltipElement,
438
447
  createPropPanelElement,
439
448
  createLinesGrid,
440
- options);
449
+ {...(options ?? {}), [CLIFFS_COL_ENCODE_FN]: cliffsEncodeFunction});
441
450
  return sp;
442
451
  };
443
452
 
444
- const allowedRowCount = 20000;
445
- const fastRowCount = methodName === DimReductionMethods.UMAP ? 5000 : 2000;
453
+ const allowedRowCount = methodName === DimReductionMethods.UMAP ? 200_000 : 20_000;
454
+ const fastRowCount = methodName === DimReductionMethods.UMAP ? 5_000 : 2_000;
446
455
  if (df.rowCount > allowedRowCount) {
447
456
  grok.shell.warning(`Too many rows, maximum for sequence activity cliffs is ${allowedRowCount}`);
448
457
  return;
449
458
  }
450
459
 
451
- return new Promise<DG.Viewer>((resolve, reject) => {
460
+ return new Promise<DG.Viewer | undefined>((resolve, reject) => {
452
461
  if (df.rowCount > fastRowCount && !options?.[BYPASS_LARGE_DATA_WARNING]) {
453
462
  ui.dialog().add(ui.divText(`Activity cliffs analysis might take several minutes.
454
463
  Do you want to continue?`))
455
464
  .onOK(async () => {
456
- const progressBar = DG.TaskBarProgressIndicator.create(`Running sequence activity cliffs ...`);
457
- runCliffs().then((res) => resolve(res)).catch((err) => reject(err)).finally(() => { progressBar.close(); });
465
+ //const progressBar = DG.TaskBarProgressIndicator.create(`Running sequence activity cliffs ...`);
466
+ runCliffs().then((res) => resolve(res)).catch((err) => reject(err)).finally(() => {});
458
467
  })
468
+ .onCancel(() => { resolve(undefined); })
459
469
  .show();
460
470
  } else {
461
471
  runCliffs().then((res) => resolve(res)).catch((err) => reject(err));
@@ -525,7 +535,7 @@ export async function sequenceSpaceTopMenu(
525
535
  methodName: methodName,
526
536
  similarityMetric: similarityMetric,
527
537
  embedAxesNames: embedColsNames,
528
- options: {...options, sparseMatrixThreshold: sparseMatrixThreshold ?? 0.8,
538
+ options: {...options, sparseMatrixThreshold: sparseMatrixThreshold ?? 0.5,
529
539
  usingSparseMatrix: table.rowCount > 20000},
530
540
  };
531
541
 
@@ -1084,10 +1094,10 @@ export async function demoBioHelmMsaSequenceSpace(): Promise<void> {
1084
1094
  await demoBio05UI();
1085
1095
  }
1086
1096
 
1087
- //name: enumeratorColumnChoice
1097
+ //name: polyToolColumnChoice
1088
1098
  //input: dataframe df [Input data table]
1089
1099
  //input: column macroMolecule
1090
- export async function enumeratorColumnChoice(df: DG.DataFrame, macroMolecule: DG.Column): Promise<void> {
1100
+ export async function polyToolColumnChoice(df: DG.DataFrame, macroMolecule: DG.Column): Promise<void> {
1091
1101
  _setPeptideColumn(macroMolecule);
1092
1102
  await grok.data.detectSemanticTypes(df);
1093
1103
  }
@@ -8,6 +8,8 @@ import {_testActivityCliffsOpen} from './activity-cliffs-utils';
8
8
  import {DimReductionMethods} from '@datagrok-libraries/ml/src/reduce-dimensionality';
9
9
 
10
10
  import {_package} from '../package-test';
11
+ import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
12
+ import {BitArrayMetricsNames} from '@datagrok-libraries/ml/src/typed-metrics';
11
13
 
12
14
 
13
15
  category('activityCliffs', async () => {
@@ -39,7 +41,7 @@ category('activityCliffs', async () => {
39
41
  const cliffsNum = DG.Test.isInBenchmark ? 6 : 3;
40
42
 
41
43
  await _testActivityCliffsOpen(actCliffsDf, DimReductionMethods.UMAP,
42
- 'sequence', 'Activity', 90, cliffsNum);
44
+ 'sequence', 'Activity', 90, cliffsNum, MmDistanceFunctionsNames.LEVENSHTEIN);
43
45
  });
44
46
 
45
47
  test('activityCliffsWithEmptyRows', async () => {
@@ -49,7 +51,7 @@ category('activityCliffs', async () => {
49
51
  viewList.push(actCliffsTableViewWithEmptyRows);
50
52
 
51
53
  await _testActivityCliffsOpen(actCliffsDfWithEmptyRows, DimReductionMethods.UMAP,
52
- 'sequence', 'Activity', 90, 3);
54
+ 'sequence', 'Activity', 90, 3, MmDistanceFunctionsNames.LEVENSHTEIN);
53
55
  });
54
56
 
55
57
  test('Helm', async () => {
@@ -57,6 +59,6 @@ category('activityCliffs', async () => {
57
59
  const view = grok.shell.addTableView(df);
58
60
 
59
61
  await _testActivityCliffsOpen(df, DimReductionMethods.UMAP,
60
- 'HELM', 'Activity', 90, 53);
62
+ 'HELM', 'Activity', 90, 53, BitArrayMetricsNames.Tanimoto);
61
63
  });
62
64
  });
@@ -4,14 +4,17 @@ import * as grok from 'datagrok-api/grok';
4
4
  import {expect} from '@datagrok-libraries/utils/src/test';
5
5
  import {activityCliffs, BYPASS_LARGE_DATA_WARNING} from '../package';
6
6
  import {DimReductionMethods} from '@datagrok-libraries/ml/src/reduce-dimensionality';
7
+ import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
8
+ import {BitArrayMetrics} from '@datagrok-libraries/ml/src/typed-metrics';
7
9
 
8
10
  export async function _testActivityCliffsOpen(df: DG.DataFrame, drMethod: DimReductionMethods,
9
- seqColName: string, activityColName: string, similarityThr: number, tgtNumberCliffs: number
11
+ seqColName: string, activityColName: string, similarityThr: number, tgtNumberCliffs: number,
12
+ similarityMetric: MmDistanceFunctionsNames | BitArrayMetrics
10
13
  ): Promise<void> {
11
14
  await grok.data.detectSemanticTypes(df);
12
15
  const scatterPlot = await activityCliffs(
13
16
  df, df.getCol(seqColName), df.getCol(activityColName),
14
- similarityThr, drMethod, {[`${BYPASS_LARGE_DATA_WARNING}`]: true});
17
+ similarityThr, drMethod, similarityMetric, {[`${BYPASS_LARGE_DATA_WARNING}`]: true});
15
18
  // const scatterPlot = (await grok.functions.call('Bio:activityCliffs', {
16
19
  // table: df, molecules: df.getCol(colName), activities: df.getCol('Activity'),
17
20
  // similarity: 50, methodName: method
@@ -8,9 +8,7 @@ import {IMonomerLib, Monomer} from '@datagrok-libraries/bio/src/types/index';
8
8
  import {MolfileHandler} from '@datagrok-libraries/chem-meta/src/parsing-utils/molfile-handler';
9
9
  import {
10
10
  createJsonMonomerLibFromSdf,
11
- getJsonMonomerLibForEnumerator,
12
11
  IMonomerLibHelper,
13
- isValidEnumeratorLib,
14
12
  } from '@datagrok-libraries/bio/src/monomer-works/monomer-utils';
15
13
  import {
16
14
  HELM_REQUIRED_FIELDS as REQ, HELM_OPTIONAL_FIELDS as OPT, HELM_POLYMER_TYPE
@@ -18,12 +16,8 @@ import {
18
16
 
19
17
  import {_package} from '../package';
20
18
 
21
- const _HELM_REQUIRED_FIELDS_ARRAY = [
22
- REQ.SYMBOL, REQ.NAME, REQ.MOLFILE, REQ.AUTHOR, REQ.ID,
23
- REQ.RGROUPS, REQ.SMILES, REQ.POLYMER_TYPE, REQ.MONOMER_TYPE, REQ.CREATE_DATE,
24
- ] as const;
19
+ import {PolyToolMonomerLibHandler} from '@datagrok-libraries/bio/src/utils/poly-tool/monomer-lib-handler';
25
20
 
26
- const _HELM_OPTIONAL_FIELDS_ARRAY = [OPT.NATURAL_ANALOG, OPT.META] as const;
27
21
  // -- Monomer libraries --
28
22
  export const LIB_STORAGE_NAME = 'Libraries';
29
23
  export const LIB_PATH = 'System:AppData/Bio/libraries/';
@@ -291,8 +285,9 @@ export class MonomerLibHelper implements IMonomerLibHelper {
291
285
  }
292
286
  const df = await fileSource.readCsv(fileName);
293
287
  const json = toJson(df);
294
- if (isValidEnumeratorLib(json))
295
- rawLibData = getJsonMonomerLibForEnumerator(json);
288
+ const polyToolMonomerLib = new PolyToolMonomerLibHandler(json);
289
+ if (polyToolMonomerLib.isValid())
290
+ rawLibData = polyToolMonomerLib.getJsonMonomerLib();
296
291
  else
297
292
  throw new Error('Invalid format of CSV monomer lib');
298
293
  } else {