@datagrok/bio 2.22.0 → 2.22.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +5 -0
- package/detectors.js +56 -1
- package/dist/package-test.js +3 -3
- package/dist/package-test.js.map +1 -1
- package/dist/package.js +3 -3
- package/dist/package.js.map +1 -1
- package/package.json +6 -6
- package/scripts/embed.py +9 -3
- package/src/analysis/sequence-diversity-viewer.ts +86 -24
- package/src/analysis/sequence-search-base-viewer.ts +40 -8
- package/src/analysis/sequence-similarity-viewer.ts +47 -7
- package/src/analysis/sequence-space.ts +24 -4
- package/src/demo/bio05-helm-msa-sequence-space.ts +1 -1
- package/src/package-api.ts +417 -0
- package/src/package.g.ts +1 -0
- package/src/package.ts +14 -5
- package/src/tests/msa-tests.ts +1 -1
- package/src/tests/pepsea-tests.ts +3 -3
- package/src/tests/similarity-diversity-tests.ts +5 -5
- package/src/utils/context-menu.ts +7 -6
- package/src/utils/helm-to-molfile/converter/mol-wrapper.ts +1 -1
- package/src/utils/monomer-lib/library-file-manager/ui.ts +9 -1
- package/src/utils/multiple-sequence-alignment-ui.ts +20 -9
- package/src/utils/multiple-sequence-alignment.ts +22 -7
- package/src/utils/pepsea.ts +34 -18
- package/src/widgets/representations.ts +31 -58
- package/src/widgets/sequence-scrolling-widget.ts +184 -176
- package/src/widgets/to-atomic-level-widget.ts +94 -23
- package/test-console-output-1.log +620 -621
- package/test-record-1.mp4 +0 -0
package/package.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
"name": "Leonid Stolbov",
|
|
6
6
|
"email": "lstolbov@datagrok.ai"
|
|
7
7
|
},
|
|
8
|
-
"version": "2.22.0",
|
|
8
|
+
"version": "2.22.3",
|
|
9
9
|
"description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
|
|
10
10
|
"repository": {
|
|
11
11
|
"type": "git",
|
|
@@ -44,18 +44,18 @@
|
|
|
44
44
|
],
|
|
45
45
|
"dependencies": {
|
|
46
46
|
"@biowasm/aioli": "^3.1.0",
|
|
47
|
-
"@datagrok-libraries/bio": "^5.
|
|
47
|
+
"@datagrok-libraries/bio": "^5.54.1",
|
|
48
48
|
"@datagrok-libraries/chem-meta": "^1.2.7",
|
|
49
49
|
"@datagrok-libraries/math": "^1.2.4",
|
|
50
50
|
"@datagrok-libraries/ml": "^6.10.2",
|
|
51
51
|
"@datagrok-libraries/tutorials": "^1.6.1",
|
|
52
52
|
"@datagrok-libraries/utils": "^4.5.7",
|
|
53
|
-
"datagrok-api": "^1.25.0",
|
|
54
53
|
"@webgpu/types": "^0.1.40",
|
|
55
54
|
"ajv": "^8.12.0",
|
|
56
55
|
"ajv-errors": "^3.0.0",
|
|
57
56
|
"cash-dom": "^8.0.0",
|
|
58
57
|
"css-loader": "^6.7.3",
|
|
58
|
+
"datagrok-api": "^1.25.0",
|
|
59
59
|
"dayjs": "^1.11.4",
|
|
60
60
|
"fastest-levenshtein": "^1.0.16",
|
|
61
61
|
"openchemlib": "^7.2.3",
|
|
@@ -67,13 +67,13 @@
|
|
|
67
67
|
"devDependencies": {
|
|
68
68
|
"@datagrok-libraries/helm-web-editor": "^1.1.14",
|
|
69
69
|
"@datagrok-libraries/js-draw-lite": "^0.0.10",
|
|
70
|
-
"@datagrok/chem": "^1.
|
|
70
|
+
"@datagrok/chem": "^1.15.0",
|
|
71
71
|
"@datagrok/dendrogram": "^1.2.33",
|
|
72
72
|
"@types/node": "^17.0.24",
|
|
73
73
|
"@types/wu": "^2.1.44",
|
|
74
74
|
"@typescript-eslint/eslint-plugin": "^8.8.1",
|
|
75
75
|
"@typescript-eslint/parser": "^8.8.1",
|
|
76
|
-
"datagrok-tools": "
|
|
76
|
+
"datagrok-tools": "^4.14.29",
|
|
77
77
|
"eslint": "^8.57.1",
|
|
78
78
|
"eslint-config-google": "^0.14.0",
|
|
79
79
|
"eslint-plugin-rxjs": "^5.0.3",
|
|
@@ -93,7 +93,7 @@
|
|
|
93
93
|
"debug-sequences1": "webpack && grok publish",
|
|
94
94
|
"release-sequences1": "webpack && grok publish --release",
|
|
95
95
|
"build-sequences1": "webpack",
|
|
96
|
-
"build": "webpack",
|
|
96
|
+
"build": " grok api && grok check --soft && webpack",
|
|
97
97
|
"build-all": "npm --prefix ./../../libraries/chem-meta run build && npm --prefix ./../../js-api run build && npm --prefix ./../../libraries/utils run build && npm --prefix ./../../libraries/math run build && npm --prefix ./../../libraries/ml run build && npm --prefix ./../../libraries/bio run build && npm --prefix ./../../libraries/tutorials run build && npm run build",
|
|
98
98
|
"debug-sequences1-local": "webpack && grok publish local",
|
|
99
99
|
"release-sequences1-local": "webpack && grok publish local --release",
|
package/scripts/embed.py
CHANGED
|
@@ -7,7 +7,13 @@ from rdkit.Chem import AllChem
|
|
|
7
7
|
from rdkit import Chem
|
|
8
8
|
mol = AllChem.MolFromMolBlock(molecule) if ("M END" in molecule) else AllChem.MolFromSmiles(molecule)
|
|
9
9
|
|
|
10
|
-
AllChem.EmbedMolecule(mol, AllChem.ETKDG())
|
|
11
|
-
#AllChem.UFFOptimizeMolecule(mol)
|
|
12
|
-
#mol = Chem.RemoveHs(mol)
|
|
13
10
|
sdf = Chem.MolToMolBlock(mol)
|
|
11
|
+
try:
|
|
12
|
+
AllChem.EmbedMolecule(mol, AllChem.ETKDG())
|
|
13
|
+
AllChem.UFFOptimizeMolecule(mol)
|
|
14
|
+
mol = Chem.RemoveHs(mol)
|
|
15
|
+
sdf = Chem.MolToMolBlock(mol)
|
|
16
|
+
except Exception as e:
|
|
17
|
+
pass
|
|
18
|
+
# mol = Chem.RemoveHs(mol)
|
|
19
|
+
|
package/src/analysis/sequence-diversity-viewer.ts
CHANGED
|
@@ -13,13 +13,17 @@ import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule
|
|
|
13
13
|
import {DistanceMatrixService, dmLinearIndex} from '@datagrok-libraries/ml/src/distance-matrix';
|
|
14
14
|
import {MmcrTemps} from '@datagrok-libraries/bio/src/utils/cell-renderer-consts';
|
|
15
15
|
|
|
16
|
+
const MAX_SAMPLE_SIZE = 10000;
|
|
17
|
+
|
|
16
18
|
export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
|
|
17
|
-
diverseColumnLabel: string | null;
|
|
19
|
+
diverseColumnLabel: string | null;
|
|
18
20
|
|
|
19
21
|
renderMolIds: number[] | null = null;
|
|
20
22
|
columnNames = [];
|
|
21
23
|
computeCompleted = new Subject<boolean>();
|
|
22
24
|
|
|
25
|
+
private sampledIndices: number[] | null = null;
|
|
26
|
+
|
|
23
27
|
constructor(
|
|
24
28
|
private readonly seqHelper: ISeqHelper,
|
|
25
29
|
) {
|
|
@@ -32,8 +36,7 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
|
|
|
32
36
|
return;
|
|
33
37
|
if (this.dataFrame) {
|
|
34
38
|
if (computeData && this.targetColumn) {
|
|
35
|
-
|
|
36
|
-
await (sh.isFasta() ? this.computeByMM() : this.computeByChem());
|
|
39
|
+
await this.computeByMM();
|
|
37
40
|
|
|
38
41
|
const diverseColumnName: string = this.diverseColumnLabel != null ? this.diverseColumnLabel :
|
|
39
42
|
`diverse (${this.targetColumnName})`;
|
|
@@ -57,30 +60,89 @@ export class SequenceDiversityViewer extends SequenceSearchBaseViewer {
|
|
|
57
60
|
}
|
|
58
61
|
}
|
|
59
62
|
|
|
60
|
-
private async computeByChem() {
|
|
61
|
-
const monomericMols = await getMonomericMols(this.targetColumn!, this.seqHelper);
|
|
62
|
-
//need to create df to calculate fingerprints
|
|
63
|
-
const _monomericMolsDf = DG.DataFrame.fromColumns([monomericMols]);
|
|
64
|
-
this.renderMolIds = await grok.functions.call('Chem:callChemDiversitySearch', {
|
|
65
|
-
col: monomericMols,
|
|
66
|
-
metricName: this.distanceMetric,
|
|
67
|
-
limit: this.limit,
|
|
68
|
-
fingerprint: this.fingerprint,
|
|
69
|
-
});
|
|
70
|
-
}
|
|
71
|
-
|
|
72
63
|
private async computeByMM() {
|
|
73
|
-
const
|
|
74
|
-
|
|
64
|
+
const totalLength = this.targetColumn!.length;
|
|
65
|
+
let workingIndices: number[];
|
|
66
|
+
|
|
67
|
+
// Determine if we need to sample the data
|
|
68
|
+
if (this.requiresSampling && totalLength > MAX_SAMPLE_SIZE) {
|
|
69
|
+
workingIndices = this.createRandomSample(totalLength, MAX_SAMPLE_SIZE);
|
|
70
|
+
this.sampledIndices = workingIndices;
|
|
71
|
+
} else {
|
|
72
|
+
workingIndices = Array.from({length: totalLength}, (_, i) => i);
|
|
73
|
+
this.sampledIndices = null;
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
const distanceFunction = this.distanceMetric as MmDistanceFunctionsNames;
|
|
77
|
+
|
|
78
|
+
// Call with individual parameters instead of params object
|
|
79
|
+
const encodedResult = await getEncodedSeqSpaceCol(
|
|
80
|
+
this.targetColumn!,
|
|
81
|
+
distanceFunction,
|
|
82
|
+
this.fingerprint,
|
|
83
|
+
this.gapOpen,
|
|
84
|
+
this.gapExtend
|
|
85
|
+
);
|
|
86
|
+
const fullEncodedSequences = encodedResult.seqList;
|
|
87
|
+
const options = encodedResult.options;
|
|
88
|
+
|
|
89
|
+
// Extract only the sequences we need for the working set
|
|
90
|
+
const workingEncodedSequences = workingIndices.map((idx) => fullEncodedSequences[idx]);
|
|
91
|
+
|
|
75
92
|
const distanceMatrixService = new DistanceMatrixService(true, false);
|
|
76
|
-
const distanceMatrixData = await distanceMatrixService.calc(
|
|
93
|
+
const distanceMatrixData = await distanceMatrixService.calc(
|
|
94
|
+
workingEncodedSequences,
|
|
95
|
+
distanceFunction,
|
|
96
|
+
true, // normalize
|
|
97
|
+
options
|
|
98
|
+
);
|
|
77
99
|
distanceMatrixService.terminate();
|
|
78
|
-
|
|
79
|
-
const
|
|
80
|
-
|
|
100
|
+
|
|
101
|
+
const workingLength = workingIndices.length;
|
|
102
|
+
const linearizeFunc = dmLinearIndex(workingLength);
|
|
103
|
+
|
|
104
|
+
const diverseIndicesInWorkingSet = getDiverseSubset(
|
|
105
|
+
workingLength,
|
|
106
|
+
Math.min(workingLength, this.limit),
|
|
81
107
|
(i1: number, i2: number) => {
|
|
82
|
-
return
|
|
83
|
-
|
|
84
|
-
|
|
108
|
+
return distanceMatrixData[linearizeFunc(i1, i2)];
|
|
109
|
+
}
|
|
110
|
+
);
|
|
111
|
+
|
|
112
|
+
// Map back to original indices
|
|
113
|
+
this.renderMolIds = diverseIndicesInWorkingSet.map((workingIndex) => workingIndices[workingIndex]);
|
|
85
114
|
}
|
|
115
|
+
|
|
116
|
+
private createRandomSample(totalLength: number, sampleSize: number): number[] {
|
|
117
|
+
const validIndices: number[] = [];
|
|
118
|
+
for (let i = 0; i < totalLength; i++) {
|
|
119
|
+
if (!this.targetColumn!.isNone(i))
|
|
120
|
+
validIndices.push(i);
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
if (validIndices.length <= sampleSize)
|
|
124
|
+
return validIndices;
|
|
125
|
+
|
|
126
|
+
for (let i = validIndices.length - 1; i > 0; i--) {
|
|
127
|
+
const j = Math.floor(Math.random() * (i + 1));
|
|
128
|
+
const temp = validIndices[i];
|
|
129
|
+
validIndices[i] = validIndices[j];
|
|
130
|
+
validIndices[j] = temp;
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
return validIndices.slice(0, sampleSize);
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
// // Helper method to get information about sampling (useful for debugging/info)
|
|
137
|
+
// getSamplingInfo(): {isSampled: boolean, originalSize?: number, sampleSize?: number, sampledIndices?: number[]} {
|
|
138
|
+
// if (this.sampledIndices) {
|
|
139
|
+
// return {
|
|
140
|
+
// isSampled: true,
|
|
141
|
+
// originalSize: this.targetColumn?.length,
|
|
142
|
+
// sampleSize: this.sampledIndices.length,
|
|
143
|
+
// sampledIndices: this.sampledIndices
|
|
144
|
+
// };
|
|
145
|
+
// }
|
|
146
|
+
// return {isSampled: false};
|
|
147
|
+
// }
|
|
86
148
|
}
|
package/src/analysis/sequence-search-base-viewer.ts
CHANGED
|
@@ -2,30 +2,62 @@ import * as ui from 'datagrok-api/ui';
|
|
|
2
2
|
import * as DG from 'datagrok-api/dg';
|
|
3
3
|
import * as grok from 'datagrok-api/grok';
|
|
4
4
|
|
|
5
|
-
import {
|
|
5
|
+
import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
|
|
6
6
|
import {TAGS as bioTAGS} from '@datagrok-libraries/bio/src/utils/macromolecule';
|
|
7
7
|
import {SearchBaseViewer} from '@datagrok-libraries/ml/src/viewers/search-base-viewer';
|
|
8
8
|
|
|
9
|
-
const MAX_ROWS_FOR_DISTANCE_MATRIX =
|
|
9
|
+
const MAX_ROWS_FOR_DISTANCE_MATRIX = 10000;
|
|
10
10
|
|
|
11
11
|
export class SequenceSearchBaseViewer extends SearchBaseViewer {
|
|
12
12
|
distanceMetric: string;
|
|
13
13
|
fingerprint: string;
|
|
14
|
-
|
|
15
|
-
|
|
14
|
+
gapOpen: number;
|
|
15
|
+
gapExtend: number;
|
|
16
|
+
|
|
17
|
+
metricsProperties = ['distanceMetric', 'fingerprint', 'gapOpen', 'gapExtend'];
|
|
18
|
+
fingerprintChoices = ['Morgan', 'RDKit', 'Pattern', 'AtomPair', 'MACCS', 'TopologicalTorsion'];
|
|
19
|
+
distanceFunctionChoices = [
|
|
20
|
+
MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH,
|
|
21
|
+
MmDistanceFunctionsNames.HAMMING,
|
|
22
|
+
MmDistanceFunctionsNames.LEVENSHTEIN,
|
|
23
|
+
MmDistanceFunctionsNames.MONOMER_CHEMICAL_DISTANCE
|
|
24
|
+
];
|
|
25
|
+
|
|
16
26
|
tags = [DG.TAGS.UNITS, bioTAGS.aligned, bioTAGS.separator, bioTAGS.alphabet, 'cell.renderer'];
|
|
17
27
|
preComputeDistanceMatrix: boolean = false;
|
|
28
|
+
requiresSampling: boolean = false;
|
|
18
29
|
|
|
19
30
|
constructor(name: string, semType: string) {
|
|
20
31
|
super(name, semType);
|
|
21
|
-
|
|
22
|
-
this.distanceMetric = this.string('distanceMetric',
|
|
32
|
+
|
|
33
|
+
this.distanceMetric = this.string('distanceMetric', MmDistanceFunctionsNames.HAMMING, {
|
|
34
|
+
choices: this.distanceFunctionChoices
|
|
35
|
+
});
|
|
36
|
+
|
|
37
|
+
this.fingerprint = this.string('fingerprint', this.fingerprintChoices[0], {
|
|
38
|
+
choices: this.fingerprintChoices
|
|
39
|
+
});
|
|
40
|
+
|
|
41
|
+
this.gapOpen = this.float('gapOpen', 1);
|
|
42
|
+
this.gapExtend = this.float('gapExtend', 0.6);
|
|
23
43
|
}
|
|
24
44
|
|
|
25
45
|
async onTableAttached(): Promise<void> {
|
|
26
46
|
super.onTableAttached();
|
|
27
47
|
|
|
28
|
-
if (this.dataFrame)
|
|
29
|
-
|
|
48
|
+
if (this.dataFrame) {
|
|
49
|
+
const rowCount = this.dataFrame.rowCount;
|
|
50
|
+
this.preComputeDistanceMatrix = rowCount <= MAX_ROWS_FOR_DISTANCE_MATRIX;
|
|
51
|
+
this.requiresSampling = rowCount > MAX_ROWS_FOR_DISTANCE_MATRIX;
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
needsGapPenalties(): boolean {
|
|
56
|
+
return this.distanceMetric === MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH;
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
needsFingerprint(): boolean {
|
|
60
|
+
return this.distanceMetric === MmDistanceFunctionsNames.MONOMER_CHEMICAL_DISTANCE ||
|
|
61
|
+
this.distanceMetric === MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH;
|
|
30
62
|
}
|
|
31
63
|
}
|
package/src/analysis/sequence-similarity-viewer.ts
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
/* eslint-disable max-len */
|
|
2
|
+
/* eslint-disable space-infix-ops */
|
|
1
3
|
import * as grok from 'datagrok-api/grok';
|
|
2
4
|
import * as ui from 'datagrok-api/ui';
|
|
3
5
|
import * as DG from 'datagrok-api/dg';
|
|
@@ -34,6 +36,12 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
|
34
36
|
analysisGrid?: DG.Grid;
|
|
35
37
|
subInited: boolean = false;
|
|
36
38
|
|
|
39
|
+
// Track last parameters to avoid unnecessary recomputation
|
|
40
|
+
private lastDistanceMetric: string = '';
|
|
41
|
+
private lastFingerprint: string = '';
|
|
42
|
+
private lastGapOpen: number = 0;
|
|
43
|
+
private lastGapExtend: number = 0;
|
|
44
|
+
|
|
37
45
|
constructor(
|
|
38
46
|
private readonly seqHelper: ISeqHelper,
|
|
39
47
|
demo?: boolean,
|
|
@@ -55,10 +63,16 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
|
55
63
|
return;
|
|
56
64
|
if (this.targetColumn) {
|
|
57
65
|
this.curIdx = this.dataFrame!.currentRowIdx == -1 ? 0 : this.dataFrame!.currentRowIdx;
|
|
58
|
-
if (computeData && !this.gridSelect) {
|
|
59
|
-
this.targetMoleculeIdx = (this.dataFrame!.currentRowIdx ?? -1) < 0 ? 0 : this.dataFrame!.currentRowIdx;
|
|
60
66
|
|
|
61
|
-
|
|
67
|
+
// Force recomputation if parameters changed
|
|
68
|
+
const parametersChanged =
|
|
69
|
+
this.lastDistanceMetric !== this.distanceMetric ||
|
|
70
|
+
this.lastFingerprint !== this.fingerprint ||
|
|
71
|
+
this.lastGapOpen !== this.gapOpen ||
|
|
72
|
+
this.lastGapExtend !== this.gapExtend;
|
|
73
|
+
|
|
74
|
+
if ((computeData && !this.gridSelect) || parametersChanged) {
|
|
75
|
+
this.targetMoleculeIdx = (this.dataFrame!.currentRowIdx ?? -1) < 0 ? 0 : this.dataFrame!.currentRowIdx; await this.computeByMM();
|
|
62
76
|
const similarColumnName: string = this.similarColumnLabel != null ? this.similarColumnLabel :
|
|
63
77
|
`similar (${this.targetColumn})`;
|
|
64
78
|
this.molCol = DG.Column.string(similarColumnName,
|
|
@@ -106,14 +120,40 @@ export class SequenceSimilarityViewer extends SequenceSearchBaseViewer {
|
|
|
106
120
|
private async computeByMM() {
|
|
107
121
|
const len = this.targetColumn!.length;
|
|
108
122
|
const actualLimit = Math.min(this.limit, len - 1);
|
|
109
|
-
if (!this.knn || this.kPrevNeighbors !== actualLimit) {
|
|
110
|
-
const encodedSequences =
|
|
111
|
-
(await getEncodedSeqSpaceCol(this.targetColumn!, MmDistanceFunctionsNames.LEVENSHTEIN)).seqList;
|
|
112
123
|
|
|
124
|
+
// Check if need to recalculate knn due to parameter changes
|
|
125
|
+
const needsRecalculation = !this.knn ||
|
|
126
|
+
this.kPrevNeighbors !== actualLimit ||
|
|
127
|
+
this.lastDistanceMetric !== this.distanceMetric ||
|
|
128
|
+
this.lastFingerprint !== this.fingerprint ||
|
|
129
|
+
this.lastGapOpen !== this.gapOpen ||
|
|
130
|
+
this.lastGapExtend !== this.gapExtend;
|
|
131
|
+
|
|
132
|
+
if (needsRecalculation) {
|
|
133
|
+
const distanceFunction = this.distanceMetric as MmDistanceFunctionsNames;
|
|
134
|
+
|
|
135
|
+
// Call with individual parameters instead of params object
|
|
136
|
+
const encodedResult = await getEncodedSeqSpaceCol(
|
|
137
|
+
this.targetColumn!,
|
|
138
|
+
distanceFunction,
|
|
139
|
+
this.fingerprint,
|
|
140
|
+
this.gapOpen,
|
|
141
|
+
this.gapExtend
|
|
142
|
+
);
|
|
143
|
+
const encodedSequences = encodedResult.seqList;
|
|
144
|
+
const options = encodedResult.options;
|
|
145
|
+
|
|
146
|
+
// Store current parameters for next comparison
|
|
147
|
+
this.lastDistanceMetric = this.distanceMetric;
|
|
148
|
+
this.lastFingerprint = this.fingerprint;
|
|
149
|
+
this.lastGapOpen = this.gapOpen;
|
|
150
|
+
this.lastGapExtend = this.gapExtend;
|
|
113
151
|
this.kPrevNeighbors = actualLimit;
|
|
152
|
+
|
|
114
153
|
this.knn = await (new SparseMatrixService()
|
|
115
|
-
.getKNN(encodedSequences,
|
|
154
|
+
.getKNN(encodedSequences, distanceFunction, actualLimit, options));
|
|
116
155
|
}
|
|
156
|
+
|
|
117
157
|
const indexWScore = new Array(actualLimit).fill(0).map((_, i) => ({
|
|
118
158
|
idx: this.knn!.knnIndexes[this.targetMoleculeIdx][i],
|
|
119
159
|
score: 1 - this.knn!.knnDistances[this.targetMoleculeIdx][i],
|
package/src/analysis/sequence-space.ts
CHANGED
|
@@ -15,9 +15,13 @@ export interface ISequenceSpaceResult {
|
|
|
15
15
|
}
|
|
16
16
|
|
|
17
17
|
export async function getEncodedSeqSpaceCol(
|
|
18
|
-
seqCol: DG.Column,
|
|
18
|
+
seqCol: DG.Column,
|
|
19
|
+
similarityMetric: BitArrayMetrics | MmDistanceFunctionsNames,
|
|
20
|
+
fingerprintType: string = 'Morgan',
|
|
21
|
+
gapOpen: number = 1,
|
|
22
|
+
gapExtend: number = 0.6
|
|
19
23
|
): Promise<{ seqList: string[], options: { [_: string]: any } }> {
|
|
20
|
-
// encodes sequences using utf characters to also support multichar and non fasta sequences
|
|
24
|
+
// encodes sequences using utf characters to also support multichar and non fasta sequences
|
|
21
25
|
const rowCount = seqCol.length;
|
|
22
26
|
const sh = _package.seqHelper.getSeqHandler(seqCol);
|
|
23
27
|
const encList = Array<string>(rowCount);
|
|
@@ -25,6 +29,7 @@ export async function getEncodedSeqSpaceCol(
|
|
|
25
29
|
const charCodeMap = new Map<string, string>();
|
|
26
30
|
const seqColCats = seqCol.categories;
|
|
27
31
|
const seqColRawData = seqCol.getRawData();
|
|
32
|
+
|
|
28
33
|
for (let rowIdx = 0; rowIdx < rowCount; rowIdx++) {
|
|
29
34
|
const catI = seqColRawData[rowIdx];
|
|
30
35
|
const seq = seqColCats[catI];
|
|
@@ -44,7 +49,10 @@ export async function getEncodedSeqSpaceCol(
|
|
|
44
49
|
encList[rowIdx] += charCodeMap.get(char)!;
|
|
45
50
|
}
|
|
46
51
|
}
|
|
52
|
+
|
|
47
53
|
let options = {} as mmDistanceFunctionArgs;
|
|
54
|
+
|
|
55
|
+
// Handle fingerprint-based distance functions
|
|
48
56
|
if (
|
|
49
57
|
similarityMetric === MmDistanceFunctionsNames.MONOMER_CHEMICAL_DISTANCE ||
|
|
50
58
|
similarityMetric === MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH
|
|
@@ -56,9 +64,21 @@ export async function getEncodedSeqSpaceCol(
|
|
|
56
64
|
Object.entries(monomerRes.alphabetIndexes).forEach(([key, value]) => {
|
|
57
65
|
monomerHashToMatrixMap[charCodeMap.get(key)!] = value;
|
|
58
66
|
});
|
|
67
|
+
|
|
59
68
|
// sets distance function args in place.
|
|
60
|
-
const maxLength = encList.reduce((acc, val) => Math.max(acc, val
|
|
61
|
-
options = {
|
|
69
|
+
const maxLength = encList.reduce((acc, val) => Math.max(acc, val?.length || 0), 0);
|
|
70
|
+
options = {
|
|
71
|
+
scoringMatrix: monomerRes.scoringMatrix,
|
|
72
|
+
alphabetIndexes: monomerHashToMatrixMap,
|
|
73
|
+
maxLength
|
|
74
|
+
} as mmDistanceFunctionArgs;
|
|
75
|
+
|
|
76
|
+
// Add gap penalties only for Needleman-Wunsch
|
|
77
|
+
if (similarityMetric === MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH) {
|
|
78
|
+
(options as any).gapOpen = gapOpen;
|
|
79
|
+
(options as any).gapExtend = gapExtend;
|
|
80
|
+
}
|
|
62
81
|
}
|
|
82
|
+
|
|
63
83
|
return {seqList: encList, options};
|
|
64
84
|
}
|
package/src/demo/bio05-helm-msa-sequence-space.ts
CHANGED
|
@@ -77,7 +77,7 @@ export async function demoBio05UI(): Promise<void> {
|
|
|
77
77
|
const method: string = pepseaMethods[0];
|
|
78
78
|
const gapOpen: number = 1.53;
|
|
79
79
|
const gapExtend: number = 0;
|
|
80
|
-
msaHelmCol = (await runPepsea(helmCol, msaHelmColName, method, gapOpen, gapExtend, undefined))!;
|
|
80
|
+
msaHelmCol = (await runPepsea(df, helmCol, msaHelmColName, method, gapOpen, gapExtend, undefined))!;
|
|
81
81
|
if (!msaHelmCol)
|
|
82
82
|
throw new Error(`Empty MSA result.`);
|
|
83
83
|
df.columns.add(msaHelmCol);
|