@datagrok/bio 2.22.0 → 2.22.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -5,7 +5,7 @@
5
5
  "name": "Leonid Stolbov",
6
6
  "email": "lstolbov@datagrok.ai"
7
7
  },
8
- "version": "2.22.0",
8
+ "version": "2.22.3",
9
9
  "description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
10
10
  "repository": {
11
11
  "type": "git",
@@ -44,18 +44,18 @@
44
44
  ],
45
45
  "dependencies": {
46
46
  "@biowasm/aioli": "^3.1.0",
47
- "@datagrok-libraries/bio": "^5.53.4",
47
+ "@datagrok-libraries/bio": "^5.54.1",
48
48
  "@datagrok-libraries/chem-meta": "^1.2.7",
49
49
  "@datagrok-libraries/math": "^1.2.4",
50
50
  "@datagrok-libraries/ml": "^6.10.2",
51
51
  "@datagrok-libraries/tutorials": "^1.6.1",
52
52
  "@datagrok-libraries/utils": "^4.5.7",
53
- "datagrok-api": "^1.25.0",
54
53
  "@webgpu/types": "^0.1.40",
55
54
  "ajv": "^8.12.0",
56
55
  "ajv-errors": "^3.0.0",
57
56
  "cash-dom": "^8.0.0",
58
57
  "css-loader": "^6.7.3",
58
+ "datagrok-api": "^1.25.0",
59
59
  "dayjs": "^1.11.4",
60
60
  "fastest-levenshtein": "^1.0.16",
61
61
  "openchemlib": "^7.2.3",
@@ -67,13 +67,13 @@
67
67
  "devDependencies": {
68
68
  "@datagrok-libraries/helm-web-editor": "^1.1.14",
69
69
  "@datagrok-libraries/js-draw-lite": "^0.0.10",
70
- "@datagrok/chem": "^1.13.0",
70
+ "@datagrok/chem": "^1.15.0",
71
71
  "@datagrok/dendrogram": "^1.2.33",
72
72
  "@types/node": "^17.0.24",
73
73
  "@types/wu": "^2.1.44",
74
74
  "@typescript-eslint/eslint-plugin": "^8.8.1",
75
75
  "@typescript-eslint/parser": "^8.8.1",
76
- "datagrok-tools": "latest",
76
+ "datagrok-tools": "^4.14.29",
77
77
  "eslint": "^8.57.1",
78
78
  "eslint-config-google": "^0.14.0",
79
79
  "eslint-plugin-rxjs": "^5.0.3",
@@ -93,7 +93,7 @@
93
93
  "debug-sequences1": "webpack && grok publish",
94
94
  "release-sequences1": "webpack && grok publish --release",
95
95
  "build-sequences1": "webpack",
96
- "build": "webpack",
96
+ "build": " grok api && grok check --soft && webpack",
97
97
  "build-all": "npm --prefix ./../../libraries/chem-meta run build && npm --prefix ./../../js-api run build && npm --prefix ./../../libraries/utils run build && npm --prefix ./../../libraries/math run build && npm --prefix ./../../libraries/ml run build && npm --prefix ./../../libraries/bio run build && npm --prefix ./../../libraries/tutorials run build && npm run build",
98
98
  "debug-sequences1-local": "webpack && grok publish local",
99
99
  "release-sequences1-local": "webpack && grok publish local --release",
package/scripts/embed.py CHANGED
@@ -7,7 +7,13 @@ from rdkit.Chem import AllChem
7
7
  from rdkit import Chem
8
8
  mol = AllChem.MolFromMolBlock(molecule) if ("M END" in molecule) else AllChem.MolFromSmiles(molecule)
9
9
 
10
- AllChem.EmbedMolecule(mol, AllChem.ETKDG())
11
- #AllChem.UFFOptimizeMolecule(mol)
12
- #mol = Chem.RemoveHs(mol)
13
10
  sdf = Chem.MolToMolBlock(mol)
11
+ try:
12
+ AllChem.EmbedMolecule(mol, AllChem.ETKDG())
13
+ AllChem.UFFOptimizeMolecule(mol)
14
+ mol = Chem.RemoveHs(mol)
15
+ sdf = Chem.MolToMolBlock(mol)
16
+ except Exception as e:
17
+ pass
18
+ # mol = Chem.RemoveHs(mol)
19
+
@@ -13,13 +13,17 @@ import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule
13
13
  import {DistanceMatrixService, dmLinearIndex} from '@datagrok-libraries/ml/src/distance-matrix';
14
14
  import {MmcrTemps} from '@datagrok-libraries/bio/src/utils/cell-renderer-consts';
15
15
 
16
+ const MAX_SAMPLE_SIZE = 10000;
17
+
16
18
  export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
17
- diverseColumnLabel: string | null; // Use postfix Label to prevent activating table column selection editor
19
+ diverseColumnLabel: string | null;
18
20
 
19
21
  renderMolIds: number[] | null = null;
20
22
  columnNames = [];
21
23
  computeCompleted = new Subject<boolean>();
22
24
 
25
+ private sampledIndices: number[] | null = null;
26
+
23
27
  constructor(
24
28
  private readonly seqHelper: ISeqHelper,
25
29
  ) {
@@ -32,8 +36,7 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
32
36
  return;
33
37
  if (this.dataFrame) {
34
38
  if (computeData && this.targetColumn) {
35
- const sh = this.seqHelper.getSeqHandler(this.targetColumn);
36
- await (sh.isFasta() ? this.computeByMM() : this.computeByChem());
39
+ await this.computeByMM();
37
40
 
38
41
  const diverseColumnName: string = this.diverseColumnLabel != null ? this.diverseColumnLabel :
39
42
  `diverse (${this.targetColumnName})`;
@@ -57,30 +60,89 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
57
60
  }
58
61
  }
59
62
 
60
- private async computeByChem() {
61
- const monomericMols = await getMonomericMols(this.targetColumn!, this.seqHelper);
62
- //need to create df to calculate fingerprints
63
- const _monomericMolsDf = DG.DataFrame.fromColumns([monomericMols]);
64
- this.renderMolIds = await grok.functions.call('Chem:callChemDiversitySearch', {
65
- col: monomericMols,
66
- metricName: this.distanceMetric,
67
- limit: this.limit,
68
- fingerprint: this.fingerprint,
69
- });
70
- }
71
-
72
63
  private async computeByMM() {
73
- const encodedSequences =
74
- (await getEncodedSeqSpaceCol(this.targetColumn!, MmDistanceFunctionsNames.LEVENSHTEIN)).seqList;
64
+ const totalLength = this.targetColumn!.length;
65
+ let workingIndices: number[];
66
+
67
+ // Determine if we need to sample the data
68
+ if (this.requiresSampling && totalLength > MAX_SAMPLE_SIZE) {
69
+ workingIndices = this.createRandomSample(totalLength, MAX_SAMPLE_SIZE);
70
+ this.sampledIndices = workingIndices;
71
+ } else {
72
+ workingIndices = Array.from({length: totalLength}, (_, i) => i);
73
+ this.sampledIndices = null;
74
+ }
75
+
76
+ const distanceFunction = this.distanceMetric as MmDistanceFunctionsNames;
77
+
78
+ // Call with individual parameters instead of params object
79
+ const encodedResult = await getEncodedSeqSpaceCol(
80
+ this.targetColumn!,
81
+ distanceFunction,
82
+ this.fingerprint,
83
+ this.gapOpen,
84
+ this.gapExtend
85
+ );
86
+ const fullEncodedSequences = encodedResult.seqList;
87
+ const options = encodedResult.options;
88
+
89
+ // Extract only the sequences we need for the working set
90
+ const workingEncodedSequences = workingIndices.map((idx) => fullEncodedSequences[idx]);
91
+
75
92
  const distanceMatrixService = new DistanceMatrixService(true, false);
76
- const distanceMatrixData = await distanceMatrixService.calc(encodedSequences, MmDistanceFunctionsNames.LEVENSHTEIN);
93
+ const distanceMatrixData = await distanceMatrixService.calc(
94
+ workingEncodedSequences,
95
+ distanceFunction,
96
+ true, // normalize
97
+ options
98
+ );
77
99
  distanceMatrixService.terminate();
78
- const len = this.targetColumn!.length;
79
- const linearizeFunc = dmLinearIndex(len);
80
- this.renderMolIds = getDiverseSubset(len, Math.min(len, this.limit),
100
+
101
+ const workingLength = workingIndices.length;
102
+ const linearizeFunc = dmLinearIndex(workingLength);
103
+
104
+ const diverseIndicesInWorkingSet = getDiverseSubset(
105
+ workingLength,
106
+ Math.min(workingLength, this.limit),
81
107
  (i1: number, i2: number) => {
82
- return this.targetColumn!.isNone(i1) || this.targetColumn!.isNone(i2) ? 0 :
83
- distanceMatrixData[linearizeFunc(i1, i2)];
84
- });
108
+ return distanceMatrixData[linearizeFunc(i1, i2)];
109
+ }
110
+ );
111
+
112
+ // Map back to original indices
113
+ this.renderMolIds = diverseIndicesInWorkingSet.map((workingIndex) => workingIndices[workingIndex]);
85
114
  }
115
+
116
+ private createRandomSample(totalLength: number, sampleSize: number): number[] {
117
+ const validIndices: number[] = [];
118
+ for (let i = 0; i < totalLength; i++) {
119
+ if (!this.targetColumn!.isNone(i))
120
+ validIndices.push(i);
121
+ }
122
+
123
+ if (validIndices.length <= sampleSize)
124
+ return validIndices;
125
+
126
+ for (let i = validIndices.length - 1; i > 0; i--) {
127
+ const j = Math.floor(Math.random() * (i + 1));
128
+ const temp = validIndices[i];
129
+ validIndices[i] = validIndices[j];
130
+ validIndices[j] = temp;
131
+ }
132
+
133
+ return validIndices.slice(0, sampleSize);
134
+ }
135
+
136
+ // // Helper method to get information about sampling (useful for debugging/info)
137
+ // getSamplingInfo(): {isSampled: boolean, originalSize?: number, sampleSize?: number, sampledIndices?: number[]} {
138
+ // if (this.sampledIndices) {
139
+ // return {
140
+ // isSampled: true,
141
+ // originalSize: this.targetColumn?.length,
142
+ // sampleSize: this.sampledIndices.length,
143
+ // sampledIndices: this.sampledIndices
144
+ // };
145
+ // }
146
+ // return {isSampled: false};
147
+ // }
86
148
  }
@@ -2,30 +2,62 @@ import * as ui from 'datagrok-api/ui';
2
2
  import * as DG from 'datagrok-api/dg';
3
3
  import * as grok from 'datagrok-api/grok';
4
4
 
5
- import {CHEM_SIMILARITY_METRICS} from '@datagrok-libraries/ml/src/distance-metrics-methods';
5
+ import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
6
6
  import {TAGS as bioTAGS} from '@datagrok-libraries/bio/src/utils/macromolecule';
7
7
  import {SearchBaseViewer} from '@datagrok-libraries/ml/src/viewers/search-base-viewer';
8
8
 
9
- const MAX_ROWS_FOR_DISTANCE_MATRIX = 22000;
9
+ const MAX_ROWS_FOR_DISTANCE_MATRIX = 10000;
10
10
 
11
11
  export class SequenceSearchBaseViewer extends SearchBaseViewer {
12
12
  distanceMetric: string;
13
13
  fingerprint: string;
14
- metricsProperties = ['distanceMetric', 'fingerprint'];
15
- fingerprintChoices = ['Morgan', 'Pattern'];
14
+ gapOpen: number;
15
+ gapExtend: number;
16
+
17
+ metricsProperties = ['distanceMetric', 'fingerprint', 'gapOpen', 'gapExtend'];
18
+ fingerprintChoices = ['Morgan', 'RDKit', 'Pattern', 'AtomPair', 'MACCS', 'TopologicalTorsion'];
19
+ distanceFunctionChoices = [
20
+ MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH,
21
+ MmDistanceFunctionsNames.HAMMING,
22
+ MmDistanceFunctionsNames.LEVENSHTEIN,
23
+ MmDistanceFunctionsNames.MONOMER_CHEMICAL_DISTANCE
24
+ ];
25
+
16
26
  tags = [DG.TAGS.UNITS, bioTAGS.aligned, bioTAGS.separator, bioTAGS.alphabet, 'cell.renderer'];
17
27
  preComputeDistanceMatrix: boolean = false;
28
+ requiresSampling: boolean = false;
18
29
 
19
30
  constructor(name: string, semType: string) {
20
31
  super(name, semType);
21
- this.fingerprint = this.string('fingerprint', this.fingerprintChoices[0], {choices: this.fingerprintChoices});
22
- this.distanceMetric = this.string('distanceMetric', CHEM_SIMILARITY_METRICS[0], {choices: CHEM_SIMILARITY_METRICS});
32
+
33
+ this.distanceMetric = this.string('distanceMetric', MmDistanceFunctionsNames.HAMMING, {
34
+ choices: this.distanceFunctionChoices
35
+ });
36
+
37
+ this.fingerprint = this.string('fingerprint', this.fingerprintChoices[0], {
38
+ choices: this.fingerprintChoices
39
+ });
40
+
41
+ this.gapOpen = this.float('gapOpen', 1);
42
+ this.gapExtend = this.float('gapExtend', 0.6);
23
43
  }
24
44
 
25
45
  async onTableAttached(): Promise<void> {
26
46
  super.onTableAttached();
27
47
 
28
- if (this.dataFrame)
29
- this.preComputeDistanceMatrix = this.dataFrame.rowCount <= MAX_ROWS_FOR_DISTANCE_MATRIX;
48
+ if (this.dataFrame) {
49
+ const rowCount = this.dataFrame.rowCount;
50
+ this.preComputeDistanceMatrix = rowCount <= MAX_ROWS_FOR_DISTANCE_MATRIX;
51
+ this.requiresSampling = rowCount > MAX_ROWS_FOR_DISTANCE_MATRIX;
52
+ }
53
+ }
54
+
55
+ needsGapPenalties(): boolean {
56
+ return this.distanceMetric === MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH;
57
+ }
58
+
59
+ needsFingerprint(): boolean {
60
+ return this.distanceMetric === MmDistanceFunctionsNames.MONOMER_CHEMICAL_DISTANCE ||
61
+ this.distanceMetric === MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH;
30
62
  }
31
63
  }
@@ -1,3 +1,5 @@
1
+ /* eslint-disable max-len */
2
+ /* eslint-disable space-infix-ops */
1
3
  import * as grok from 'datagrok-api/grok';
2
4
  import * as ui from 'datagrok-api/ui';
3
5
  import * as DG from 'datagrok-api/dg';
@@ -34,6 +36,12 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
34
36
  analysisGrid?: DG.Grid;
35
37
  subInited: boolean = false;
36
38
 
39
+ // Track last parameters to avoid unnecessary recomputation
40
+ private lastDistanceMetric: string = '';
41
+ private lastFingerprint: string = '';
42
+ private lastGapOpen: number = 0;
43
+ private lastGapExtend: number = 0;
44
+
37
45
  constructor(
38
46
  private readonly seqHelper: ISeqHelper,
39
47
  demo?: boolean,
@@ -55,10 +63,16 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
55
63
  return;
56
64
  if (this.targetColumn) {
57
65
  this.curIdx = this.dataFrame!.currentRowIdx == -1 ? 0 : this.dataFrame!.currentRowIdx;
58
- if (computeData && !this.gridSelect) {
59
- this.targetMoleculeIdx = (this.dataFrame!.currentRowIdx ?? -1) < 0 ? 0 : this.dataFrame!.currentRowIdx;
60
66
 
61
- await this.computeByMM();
67
+ // Force recomputation if parameters changed
68
+ const parametersChanged =
69
+ this.lastDistanceMetric !== this.distanceMetric ||
70
+ this.lastFingerprint !== this.fingerprint ||
71
+ this.lastGapOpen !== this.gapOpen ||
72
+ this.lastGapExtend !== this.gapExtend;
73
+
74
+ if ((computeData && !this.gridSelect) || parametersChanged) {
75
+ this.targetMoleculeIdx = (this.dataFrame!.currentRowIdx ?? -1) < 0 ? 0 : this.dataFrame!.currentRowIdx; await this.computeByMM();
62
76
  const similarColumnName: string = this.similarColumnLabel != null ? this.similarColumnLabel :
63
77
  `similar (${this.targetColumn})`;
64
78
  this.molCol = DG.Column.string(similarColumnName,
@@ -106,14 +120,40 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
106
120
  private async computeByMM() {
107
121
  const len = this.targetColumn!.length;
108
122
  const actualLimit = Math.min(this.limit, len - 1);
109
- if (!this.knn || this.kPrevNeighbors !== actualLimit) {
110
- const encodedSequences =
111
- (await getEncodedSeqSpaceCol(this.targetColumn!, MmDistanceFunctionsNames.LEVENSHTEIN)).seqList;
112
123
 
124
+ // Check if need to recalculate knn due to parameter changes
125
+ const needsRecalculation = !this.knn ||
126
+ this.kPrevNeighbors !== actualLimit ||
127
+ this.lastDistanceMetric !== this.distanceMetric ||
128
+ this.lastFingerprint !== this.fingerprint ||
129
+ this.lastGapOpen !== this.gapOpen ||
130
+ this.lastGapExtend !== this.gapExtend;
131
+
132
+ if (needsRecalculation) {
133
+ const distanceFunction = this.distanceMetric as MmDistanceFunctionsNames;
134
+
135
+ // Call with individual parameters instead of params object
136
+ const encodedResult = await getEncodedSeqSpaceCol(
137
+ this.targetColumn!,
138
+ distanceFunction,
139
+ this.fingerprint,
140
+ this.gapOpen,
141
+ this.gapExtend
142
+ );
143
+ const encodedSequences = encodedResult.seqList;
144
+ const options = encodedResult.options;
145
+
146
+ // Store current parameters for next comparison
147
+ this.lastDistanceMetric = this.distanceMetric;
148
+ this.lastFingerprint = this.fingerprint;
149
+ this.lastGapOpen = this.gapOpen;
150
+ this.lastGapExtend = this.gapExtend;
113
151
  this.kPrevNeighbors = actualLimit;
152
+
114
153
  this.knn = await (new SparseMatrixService()
115
- .getKNN(encodedSequences, MmDistanceFunctionsNames.LEVENSHTEIN, Math.min(this.limit, len - 1)));
154
+ .getKNN(encodedSequences, distanceFunction, actualLimit, options));
116
155
  }
156
+
117
157
  const indexWScore = new Array(actualLimit).fill(0).map((_, i) => ({
118
158
  idx: this.knn!.knnIndexes[this.targetMoleculeIdx][i],
119
159
  score: 1 - this.knn!.knnDistances[this.targetMoleculeIdx][i],
@@ -15,9 +15,13 @@ export interface ISequenceSpaceResult {
15
15
  }
16
16
 
17
17
  export async function getEncodedSeqSpaceCol(
18
- seqCol: DG.Column, similarityMetric: BitArrayMetrics | MmDistanceFunctionsNames, fingerprintType: string = 'Morgan'
18
+ seqCol: DG.Column,
19
+ similarityMetric: BitArrayMetrics | MmDistanceFunctionsNames,
20
+ fingerprintType: string = 'Morgan',
21
+ gapOpen: number = 1,
22
+ gapExtend: number = 0.6
19
23
  ): Promise<{ seqList: string[], options: { [_: string]: any } }> {
20
- // encodes sequences using utf characters to also support multichar and non fasta sequences
24
+ // encodes sequences using utf characters to also support multichar and non fasta sequences
21
25
  const rowCount = seqCol.length;
22
26
  const sh = _package.seqHelper.getSeqHandler(seqCol);
23
27
  const encList = Array<string>(rowCount);
@@ -25,6 +29,7 @@ export async function getEncodedSeqSpaceCol(
25
29
  const charCodeMap = new Map<string, string>();
26
30
  const seqColCats = seqCol.categories;
27
31
  const seqColRawData = seqCol.getRawData();
32
+
28
33
  for (let rowIdx = 0; rowIdx < rowCount; rowIdx++) {
29
34
  const catI = seqColRawData[rowIdx];
30
35
  const seq = seqColCats[catI];
@@ -44,7 +49,10 @@ export async function getEncodedSeqSpaceCol(
44
49
  encList[rowIdx] += charCodeMap.get(char)!;
45
50
  }
46
51
  }
52
+
47
53
  let options = {} as mmDistanceFunctionArgs;
54
+
55
+ // Handle fingerprint-based distance functions
48
56
  if (
49
57
  similarityMetric === MmDistanceFunctionsNames.MONOMER_CHEMICAL_DISTANCE ||
50
58
  similarityMetric === MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH
@@ -56,9 +64,21 @@ export async function getEncodedSeqSpaceCol(
56
64
  Object.entries(monomerRes.alphabetIndexes).forEach(([key, value]) => {
57
65
  monomerHashToMatrixMap[charCodeMap.get(key)!] = value;
58
66
  });
67
+
59
68
  // sets distance function args in place.
60
- const maxLength = encList.reduce((acc, val) => Math.max(acc, val.length), 0);
61
- options = {scoringMatrix: monomerRes.scoringMatrix, alphabetIndexes: monomerHashToMatrixMap, maxLength};
69
+ const maxLength = encList.reduce((acc, val) => Math.max(acc, val?.length || 0), 0);
70
+ options = {
71
+ scoringMatrix: monomerRes.scoringMatrix,
72
+ alphabetIndexes: monomerHashToMatrixMap,
73
+ maxLength
74
+ } as mmDistanceFunctionArgs;
75
+
76
+ // Add gap penalties only for Needleman-Wunsch
77
+ if (similarityMetric === MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH) {
78
+ (options as any).gapOpen = gapOpen;
79
+ (options as any).gapExtend = gapExtend;
80
+ }
62
81
  }
82
+
63
83
  return {seqList: encList, options};
64
84
  }
@@ -77,7 +77,7 @@ export async function demoBio05UI(): Promise<void> {
77
77
  const method: string = pepseaMethods[0];
78
78
  const gapOpen: number = 1.53;
79
79
  const gapExtend: number = 0;
80
- msaHelmCol = (await runPepsea(helmCol, msaHelmColName, method, gapOpen, gapExtend, undefined))!;
80
+ msaHelmCol = (await runPepsea(df, helmCol, msaHelmColName, method, gapOpen, gapExtend, undefined))!;
81
81
  if (!msaHelmCol)
82
82
  throw new Error(`Empty MSA result.`);
83
83
  df.columns.add(msaHelmCol);