@datagrok/bio 2.11.3 → 2.11.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -5,7 +5,7 @@
5
5
  "name": "Leonid Stolbov",
6
6
  "email": "lstolbov@datagrok.ai"
7
7
  },
8
- "version": "2.11.3",
8
+ "version": "2.11.6",
9
9
  "description": "Bioinformatics support (import/export of sequences, conversion, visualization, analysis). [See more](https://github.com/datagrok-ai/public/blob/master/packages/Bio/README.md) for details.",
10
10
  "repository": {
11
11
  "type": "git",
@@ -34,9 +34,9 @@
34
34
  ],
35
35
  "dependencies": {
36
36
  "@biowasm/aioli": "^3.1.0",
37
- "@datagrok-libraries/bio": "^5.39.0",
37
+ "@datagrok-libraries/bio": "^5.39.1",
38
38
  "@datagrok-libraries/chem-meta": "^1.0.1",
39
- "@datagrok-libraries/ml": "^6.3.51",
39
+ "@datagrok-libraries/ml": "^6.3.53",
40
40
  "@datagrok-libraries/tutorials": "^1.3.6",
41
41
  "@datagrok-libraries/utils": "^4.0.17",
42
42
  "cash-dom": "^8.0.0",
@@ -55,21 +55,19 @@ export async function sequenceSpaceByFingerprints(spaceParams: ISequenceSpacePar
55
55
  return result;
56
56
  }
57
57
 
58
- export async function getSequenceSpace(spaceParams: ISequenceSpaceParams,
59
- progressFunc?: (epochNum: number, epochsLength: number, embedding: number[][]) => void
60
- ): Promise<ISequenceSpaceResult> {
61
- const ncUH = UnitsHandler.getOrCreate(spaceParams.seqCol);
62
-
63
- //const distanceFName = ncUH.isMsa() ? MmDistanceFunctionsNames.HAMMING : MmDistanceFunctionsNames.LEVENSHTEIN;
64
- const seqList = spaceParams.seqCol.toList();
65
-
58
+ export async function getEncodedSeqSpaceCol(
59
+ seqCol: DG.Column, similarityMetric: BitArrayMetrics | MmDistanceFunctionsNames
60
+ ): Promise<{seqList:string[], options: {[_:string]: any}}> {
61
+ // encodes sequences using utf charachters to also support multichar and non fasta sequences
62
+ const ncUH = UnitsHandler.getOrCreate(seqCol);
63
+ const seqList = seqCol.toList();
66
64
  const splitter = ncUH.getSplitter();
67
65
  const seqColLength = seqList.length;
68
66
  let charCodeCounter = 36;
69
67
  const charCodeMap = new Map<string, string>();
70
68
  for (let i = 0; i < seqColLength; i++) {
71
69
  const seq = seqList[i];
72
- if (seqList[i] === null || spaceParams.seqCol.isNone(i)) {
70
+ if (seqList[i] === null || seqCol.isNone(i)) {
73
71
  seqList[i] = null;
74
72
  continue;
75
73
  }
@@ -84,8 +82,8 @@ export async function getSequenceSpace(spaceParams: ISequenceSpaceParams,
84
82
  seqList[i] += charCodeMap.get(char)!;
85
83
  }
86
84
  }
87
-
88
- if (spaceParams.similarityMetric === MmDistanceFunctionsNames.MONOMER_CHEMICAL_DISTANCE) {
85
+ let options = {};
86
+ if (similarityMetric === MmDistanceFunctionsNames.MONOMER_CHEMICAL_DISTANCE) {
89
87
  const monomers = Array.from(charCodeMap.keys());
90
88
  const monomerRes = await calculateMonomerSimilarity(monomers);
91
89
  // the susbstitution matrix contains similarity, but we need distances
@@ -98,10 +96,34 @@ export async function getSequenceSpace(spaceParams: ISequenceSpaceParams,
98
96
  Object.entries(monomerRes.alphabetIndexes).forEach(([key, value]) => {
99
97
  monomerHashToMatrixMap[charCodeMap.get(key)!] = value;
100
98
  });
101
- spaceParams.options.distanceFnArgs = {scoringMatrix: monomerRes.scoringMatrix,
99
+ // sets distance function args in place.
100
+ options = {scoringMatrix: monomerRes.scoringMatrix,
102
101
  alphabetIndexes: monomerHashToMatrixMap} satisfies mmDistanceFunctionArgs;
103
102
  }
103
+ // else if (similarityMetric === MmDistanceFunctionsNames.NEEDLEMANN_WUNSCH) {
104
+ // const alphabetIndexes: any = {};
105
+ // let i = 0;
106
+ // charCodeMap.forEach((value) => {
107
+ // alphabetIndexes[value] = i;
108
+ // i++;
109
+ // });
110
+ // options = {alphabetIndexes};
111
+ // }
112
+ return {seqList, options};
113
+ }
114
+
115
+ export async function getSequenceSpace(spaceParams: ISequenceSpaceParams,
116
+ progressFunc?: (epochNum: number, epochsLength: number, embedding: number[][]) => void
117
+ ): Promise<ISequenceSpaceResult> {
118
+ const ncUH = UnitsHandler.getOrCreate(spaceParams.seqCol);
119
+ if (ncUH.isHelm())
120
+ return await sequenceSpaceByFingerprints(spaceParams);
121
+
122
+
123
+ const {seqList, options} = await getEncodedSeqSpaceCol(spaceParams.seqCol, spaceParams.similarityMetric);
104
124
 
125
+ spaceParams.options = spaceParams.options ?? {};
126
+ spaceParams.options.distanceFnArgs = options;
105
127
  const sequenceSpaceResult = await reduceDimensinalityWithNormalization(
106
128
  seqList,
107
129
  spaceParams.methodName,
@@ -13,6 +13,7 @@ import {getDendrogramService, IDendrogramService} from '@datagrok-libraries/bio/
13
13
  import {handleError} from './utils';
14
14
  import {DemoScript} from '@datagrok-libraries/tutorials/src/demo-script';
15
15
  import {DimReductionMethods} from '@datagrok-libraries/ml/src/reduce-dimensionality';
16
+ import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
16
17
 
17
18
  const dataFn: string = 'data/sample_FASTA_PT_activity.csv';
18
19
 
@@ -53,7 +54,7 @@ export async function demoBio01bUI() {
53
54
  .step('Find activity cliffs', async () => {
54
55
  activityCliffsViewer = (await activityCliffs(
55
56
  df, df.getCol('Sequence'), df.getCol('Activity'),
56
- 80, dimRedMethod)) as DG.ScatterPlotViewer;
57
+ 80, dimRedMethod, MmDistanceFunctionsNames.LEVENSHTEIN)) as DG.ScatterPlotViewer;
57
58
  view.dockManager.dock(activityCliffsViewer, DG.DOCK_TYPE.RIGHT, null, 'Activity Cliffs', 0.35);
58
59
 
59
60
  // Show grid viewer with the cliffs
package/src/package.ts CHANGED
@@ -11,10 +11,10 @@ import {DimReductionMethods, ITSNEOptions, IUMAPOptions} from '@datagrok-librari
11
11
  import {SequenceSpaceFunctionEditor} from '@datagrok-libraries/ml/src/functionEditors/seq-space-editor';
12
12
  import {ActivityCliffsFunctionEditor} from '@datagrok-libraries/ml/src/functionEditors/activity-cliffs-editor';
13
13
  import {
14
- ISequenceSpaceParams, getActivityCliffs, SequenceSpaceFunc
14
+ ISequenceSpaceParams, getActivityCliffs, SequenceSpaceFunc, CLIFFS_COL_ENCODE_FN
15
15
  } from '@datagrok-libraries/ml/src/viewers/activity-cliffs';
16
16
  import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
17
- import {BitArrayMetrics, BitArrayMetricsNames} from '@datagrok-libraries/ml/src/typed-metrics';
17
+ import {BitArrayMetrics} from '@datagrok-libraries/ml/src/typed-metrics';
18
18
  import {
19
19
  TAGS as bioTAGS, ALPHABET, NOTATION,
20
20
  } from '@datagrok-libraries/bio/src/utils/macromolecule';
@@ -35,14 +35,14 @@ import {
35
35
  import {VdRegionsViewer} from './viewers/vd-regions-viewer';
36
36
  import {SequenceAlignment} from './seq_align';
37
37
  import {
38
- ISequenceSpaceResult, getEmbeddingColsNames, getSequenceSpace, sequenceSpaceByFingerprints
38
+ ISequenceSpaceResult, getEmbeddingColsNames, getEncodedSeqSpaceCol, getSequenceSpace, sequenceSpaceByFingerprints
39
39
  } from './analysis/sequence-space';
40
40
  import {
41
41
  createLinesGrid, createPropPanelElement, createTooltipElement, getChemSimilaritiesMatrix,
42
42
  } from './analysis/sequence-activity-cliffs';
43
43
  import {SequenceSimilarityViewer} from './analysis/sequence-similarity-viewer';
44
44
  import {SequenceDiversityViewer} from './analysis/sequence-diversity-viewer';
45
- import {SubstructureSearchDialog} from './substructure-search/substructure-search';
45
+ import {MONOMERIC_COL_TAGS, SubstructureSearchDialog, invalidateMols} from './substructure-search/substructure-search';
46
46
  import {convert} from './utils/convert';
47
47
  import {getMacromoleculeColumnPropertyPanel} from './widgets/representations';
48
48
  import {saveAsFastaUI} from './utils/save-as-fasta';
@@ -75,10 +75,12 @@ import {GetRegionApp} from './apps/get-region-app';
75
75
  import {GetRegionFuncEditor} from './utils/get-region-func-editor';
76
76
  import {sequenceToMolfile} from './utils/sequence-to-mol';
77
77
  import {errInfo} from './utils/err-info';
78
+ import {detectMacromoleculeProbeDo} from './utils/detect-macromolecule-probe';
78
79
 
79
80
  import {SHOW_SCATTERPLOT_PROGRESS} from '@datagrok-libraries/ml/src/functionEditors/seq-space-base-editor';
80
81
  import {DIMENSIONALITY_REDUCER_TERMINATE_EVENT}
81
82
  from '@datagrok-libraries/ml/src/workers/dimensionality-reducing-worker-creator';
83
+ import BitArray from '@datagrok-libraries/utils/src/bit-array';
82
84
 
83
85
  export const _package = new BioPackage();
84
86
 
@@ -389,12 +391,14 @@ export async function getRegionTopMenu(
389
391
  //input: column activities
390
392
  //input: double similarity = 80 [Similarity cutoff]
391
393
  //input: string methodName { choices:["UMAP", "t-SNE"] }
394
+ //input: string similarityMetric { choices:["Hamming", "Levenshtein", "Monomer chemical distance"] }
392
395
  //input: object options {optional: true}
393
396
  //output: viewer result
394
397
  //editor: Bio:SeqActivityCliffsEditor
395
398
  export async function activityCliffs(df: DG.DataFrame, macroMolecule: DG.Column<string>, activities: DG.Column,
396
- similarity: number, methodName: DimReductionMethods, options?: (IUMAPOptions | ITSNEOptions) & Options,
397
- ): Promise<DG.Viewer | undefined> {
399
+ similarity: number, methodName: DimReductionMethods,
400
+ similarityMetric: MmDistanceFunctionsNames | BitArrayMetrics,
401
+ options?: (IUMAPOptions | ITSNEOptions) & Options): Promise<DG.Viewer | undefined> {
398
402
  if (!checkInputColumnUI(macroMolecule, 'Activity Cliffs'))
399
403
  return;
400
404
  const axesNames = getEmbeddingColsNames(df);
@@ -404,21 +408,26 @@ export async function activityCliffs(df: DG.DataFrame, macroMolecule: DG.Column<
404
408
  'separator': macroMolecule.getTag(bioTAGS.separator),
405
409
  'alphabet': macroMolecule.getTag(bioTAGS.alphabet),
406
410
  };
411
+ let cliffsEncodeFunction: (seqCol: DG.Column, similarityMetric: MmDistanceFunctionsNames | BitArrayMetrics) => any =
412
+ getEncodedSeqSpaceCol;
407
413
  const ncUH = UnitsHandler.getOrCreate(macroMolecule);
408
- let columnDistanceMetric: BitArrayMetrics | MmDistanceFunctionsNames = BitArrayMetricsNames.Tanimoto;
409
- let seqCol = macroMolecule;
410
- let sequenceSpaceFunc: SequenceSpaceFunc = sequenceSpaceByFingerprints;
411
- if (ncUH.isFasta() || (ncUH.isSeparator() && ncUH.alphabet && ncUH.alphabet !== ALPHABET.UN)) {
412
- if (ncUH.isFasta()) {
413
- columnDistanceMetric = ncUH.getDistanceFunctionName();
414
- } else {
415
- seqCol = ncUH.convert(NOTATION.FASTA);
416
- const uh = UnitsHandler.getOrCreate(seqCol);
417
- columnDistanceMetric = uh.getDistanceFunctionName();
418
- tags.units = NOTATION.FASTA;
419
- }
420
- sequenceSpaceFunc = getSequenceSpace;
414
+ const columnDistanceMetric: MmDistanceFunctionsNames | BitArrayMetrics = similarityMetric;
415
+ const seqCol = macroMolecule;
416
+
417
+ let sequenceSpaceFunc: SequenceSpaceFunc = getSequenceSpace;
418
+ if (ncUH.isHelm()) {
419
+ sequenceSpaceFunc = sequenceSpaceByFingerprints;
420
+ cliffsEncodeFunction = async (seqCol: DG.Column, similarityMetric: MmDistanceFunctionsNames | BitArrayMetrics) => {
421
+ await invalidateMols(seqCol, false);
422
+ const molecularCol = seqCol.temp[MONOMERIC_COL_TAGS.MONOMERIC_MOLS];
423
+ const fingerPrints: DG.Column =
424
+ await grok.functions.call('Chem:getMorganFingerprints', {molColumn: molecularCol});
425
+ const fingerPrintsBitArray = fingerPrints.toList().map((f: DG.BitSet) =>
426
+ BitArray.fromUint32Array(f.length, new Uint32Array(f.getBuffer().buffer)));
427
+ return {seqList: fingerPrintsBitArray, options: {}};
428
+ };
421
429
  }
430
+
422
431
  const runCliffs = async () => {
423
432
  const sp = await getActivityCliffs(
424
433
  df,
@@ -437,25 +446,26 @@ export async function activityCliffs(df: DG.DataFrame, macroMolecule: DG.Column<
437
446
  createTooltipElement,
438
447
  createPropPanelElement,
439
448
  createLinesGrid,
440
- options);
449
+ {...(options ?? {}), [CLIFFS_COL_ENCODE_FN]: cliffsEncodeFunction});
441
450
  return sp;
442
451
  };
443
452
 
444
- const allowedRowCount = 20000;
445
- const fastRowCount = methodName === DimReductionMethods.UMAP ? 5000 : 2000;
453
+ const allowedRowCount = methodName === DimReductionMethods.UMAP ? 200_000 : 20_000;
454
+ const fastRowCount = methodName === DimReductionMethods.UMAP ? 5_000 : 2_000;
446
455
  if (df.rowCount > allowedRowCount) {
447
456
  grok.shell.warning(`Too many rows, maximum for sequence activity cliffs is ${allowedRowCount}`);
448
457
  return;
449
458
  }
450
459
 
451
- return new Promise<DG.Viewer>((resolve, reject) => {
460
+ const pi = DG.TaskBarProgressIndicator.create(`Running sequence activity cliffs ...`);
461
+ return new Promise<DG.Viewer | undefined>((resolve, reject) => {
452
462
  if (df.rowCount > fastRowCount && !options?.[BYPASS_LARGE_DATA_WARNING]) {
453
463
  ui.dialog().add(ui.divText(`Activity cliffs analysis might take several minutes.
454
464
  Do you want to continue?`))
455
465
  .onOK(async () => {
456
- const progressBar = DG.TaskBarProgressIndicator.create(`Running sequence activity cliffs ...`);
457
- runCliffs().then((res) => resolve(res)).catch((err) => reject(err)).finally(() => { progressBar.close(); });
466
+ runCliffs().then((res) => resolve(res)).catch((err) => reject(err));
458
467
  })
468
+ .onCancel(() => { resolve(undefined); })
459
469
  .show();
460
470
  } else {
461
471
  runCliffs().then((res) => resolve(res)).catch((err) => reject(err));
@@ -464,7 +474,7 @@ export async function activityCliffs(df: DG.DataFrame, macroMolecule: DG.Column<
464
474
  const [errMsg, errStack] = errInfo(err);
465
475
  _package.logger.error(errMsg, undefined, errStack);
466
476
  throw err;
467
- });
477
+ }).finally(() => { pi.close(); });
468
478
  }
469
479
 
470
480
  //top-menu: Bio | Analyze | Sequence Space...
@@ -516,6 +526,7 @@ export async function sequenceSpaceTopMenu(
516
526
  const progress = (_nEpoch / epochsLength * 100);
517
527
  pg.update(progress, `Running sequence space ... ${progress.toFixed(0)}%`);
518
528
  }
529
+
519
530
  const embedColsNames = getEmbeddingColsNames(table);
520
531
  const withoutEmptyValues = DG.DataFrame.fromColumns([macroMolecule]).clone();
521
532
  const emptyValsIdxs = removeEmptyStringRows(withoutEmptyValues, macroMolecule);
@@ -525,7 +536,7 @@ export async function sequenceSpaceTopMenu(
525
536
  methodName: methodName,
526
537
  similarityMetric: similarityMetric,
527
538
  embedAxesNames: embedColsNames,
528
- options: {...options, sparseMatrixThreshold: sparseMatrixThreshold ?? 0.8,
539
+ options: {...options, sparseMatrixThreshold: sparseMatrixThreshold ?? 0.5,
529
540
  usingSparseMatrix: table.rowCount > 20000},
530
541
  };
531
542
 
@@ -1084,10 +1095,10 @@ export async function demoBioHelmMsaSequenceSpace(): Promise<void> {
1084
1095
  await demoBio05UI();
1085
1096
  }
1086
1097
 
1087
- //name: enumeratorColumnChoice
1098
+ //name: polyToolColumnChoice
1088
1099
  //input: dataframe df [Input data table]
1089
1100
  //input: column macroMolecule
1090
- export async function enumeratorColumnChoice(df: DG.DataFrame, macroMolecule: DG.Column): Promise<void> {
1101
+ export async function polyToolColumnChoice(df: DG.DataFrame, macroMolecule: DG.Column): Promise<void> {
1091
1102
  _setPeptideColumn(macroMolecule);
1092
1103
  await grok.data.detectSemanticTypes(df);
1093
1104
  }
@@ -1099,3 +1110,14 @@ export async function sdfToJsonLib(table: DG.DataFrame) {
1099
1110
  const jsonMonomerLibrary = JSON.stringify(_jsonMonomerLibrary);
1100
1111
  DG.Utils.download(`${table.name}.json`, jsonMonomerLibrary);
1101
1112
  }
1113
+
1114
+ // -- Utils --
1115
+
1116
+ //name: detectMacromoleculeProbe
1117
+ //input: file file
1118
+ //input: string colName = ''
1119
+ //input: int probeCount = 100
1120
+ export async function detectMacromoleculeProbe(file: DG.FileInfo, colName: string, probeCount: number): Promise<void> {
1121
+ const csv: string = await file.readAsString();
1122
+ await detectMacromoleculeProbeDo(csv, colName, probeCount);
1123
+ }
@@ -8,6 +8,8 @@ import {_testActivityCliffsOpen} from './activity-cliffs-utils';
8
8
  import {DimReductionMethods} from '@datagrok-libraries/ml/src/reduce-dimensionality';
9
9
 
10
10
  import {_package} from '../package-test';
11
+ import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
12
+ import {BitArrayMetricsNames} from '@datagrok-libraries/ml/src/typed-metrics';
11
13
 
12
14
 
13
15
  category('activityCliffs', async () => {
@@ -39,7 +41,7 @@ category('activityCliffs', async () => {
39
41
  const cliffsNum = DG.Test.isInBenchmark ? 6 : 3;
40
42
 
41
43
  await _testActivityCliffsOpen(actCliffsDf, DimReductionMethods.UMAP,
42
- 'sequence', 'Activity', 90, cliffsNum);
44
+ 'sequence', 'Activity', 90, cliffsNum, MmDistanceFunctionsNames.LEVENSHTEIN);
43
45
  });
44
46
 
45
47
  test('activityCliffsWithEmptyRows', async () => {
@@ -49,7 +51,7 @@ category('activityCliffs', async () => {
49
51
  viewList.push(actCliffsTableViewWithEmptyRows);
50
52
 
51
53
  await _testActivityCliffsOpen(actCliffsDfWithEmptyRows, DimReductionMethods.UMAP,
52
- 'sequence', 'Activity', 90, 3);
54
+ 'sequence', 'Activity', 90, 3, MmDistanceFunctionsNames.LEVENSHTEIN);
53
55
  });
54
56
 
55
57
  test('Helm', async () => {
@@ -57,6 +59,6 @@ category('activityCliffs', async () => {
57
59
  const view = grok.shell.addTableView(df);
58
60
 
59
61
  await _testActivityCliffsOpen(df, DimReductionMethods.UMAP,
60
- 'HELM', 'Activity', 90, 53);
62
+ 'HELM', 'Activity', 90, 53, BitArrayMetricsNames.Tanimoto);
61
63
  });
62
64
  });
@@ -4,14 +4,17 @@ import * as grok from 'datagrok-api/grok';
4
4
  import {expect} from '@datagrok-libraries/utils/src/test';
5
5
  import {activityCliffs, BYPASS_LARGE_DATA_WARNING} from '../package';
6
6
  import {DimReductionMethods} from '@datagrok-libraries/ml/src/reduce-dimensionality';
7
+ import {MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
8
+ import {BitArrayMetrics} from '@datagrok-libraries/ml/src/typed-metrics';
7
9
 
8
10
  export async function _testActivityCliffsOpen(df: DG.DataFrame, drMethod: DimReductionMethods,
9
- seqColName: string, activityColName: string, similarityThr: number, tgtNumberCliffs: number
11
+ seqColName: string, activityColName: string, similarityThr: number, tgtNumberCliffs: number,
12
+ similarityMetric: MmDistanceFunctionsNames | BitArrayMetrics
10
13
  ): Promise<void> {
11
14
  await grok.data.detectSemanticTypes(df);
12
15
  const scatterPlot = await activityCliffs(
13
16
  df, df.getCol(seqColName), df.getCol(activityColName),
14
- similarityThr, drMethod, {[`${BYPASS_LARGE_DATA_WARNING}`]: true});
17
+ similarityThr, drMethod, similarityMetric, {[`${BYPASS_LARGE_DATA_WARNING}`]: true});
15
18
  // const scatterPlot = (await grok.functions.call('Bio:activityCliffs', {
16
19
  // table: df, molecules: df.getCol(colName), activities: df.getCol('Activity'),
17
20
  // similarity: 50, methodName: method
@@ -41,79 +41,79 @@ category('converters', () => {
41
41
  }
42
42
 
43
43
  const _csvTxts: { [key: string]: string } = {
44
- fastaPt: `seq
45
- FWPHEY
46
- YNRQWYV
47
- MKPSEYV`,
48
- separatorPt: `seq
49
- F-W-P-H-E-Y
50
- Y-N-R-Q-W-Y-V
51
- M-K-P-S-E-Y-V`,
52
- helmPt: `seq
53
- PEPTIDE1{F.W.P.H.E.Y}$$$$
54
- PEPTIDE1{Y.N.R.Q.W.Y.V}$$$$
55
- PEPTIDE1{M.K.P.S.E.Y.V}$$$$`,
56
- fastaDna: `seq
57
- ACGTC
58
- CAGTGT
59
- TTCAAC`,
60
- separatorDna: `seq
61
- A/C/G/T/C
62
- C/A/G/T/G/T
63
- T/T/C/A/A/C`,
64
- helmDna: `seq
65
- RNA1{d(A)p.d(C)p.d(G)p.d(T)p.d(C)p}$$$$
66
- RNA1{d(C)p.d(A)p.d(G)p.d(T)p.d(G)p.d(T)p}$$$$
67
- RNA1{d(T)p.d(T)p.d(C)p.d(A)p.d(A)p.d(C)p}$$$$`,
68
- fastaRna: `seq
69
- ACGUC
70
- CAGUGU
71
- UUCAAC`,
72
- separatorRna: `seq
73
- A*C*G*U*C
74
- C*A*G*U*G*U
75
- U*U*C*A*A*C`,
76
- helmRna: `seq
77
- RNA1{r(A)p.r(C)p.r(G)p.r(U)p.r(C)p}$$$$
78
- RNA1{r(C)p.r(A)p.r(G)p.r(U)p.r(G)p.r(U)p}$$$$
79
- RNA1{r(U)p.r(U)p.r(C)p.r(A)p.r(A)p.r(C)p}$$$$`,
80
- fastaGaps: `seq
81
- FW-PH-EYY
82
- FYNRQWYV-
83
- FKP-Q-SEYV`,
84
- separatorGaps: `seq
85
- F/W//P/H//E/Y/Y
86
- F/Y/N/R/Q/W/Y/V/
87
- F/K/P//Q//S/E/Y/V`,
88
- helmGaps: `seq
89
- PEPTIDE1{F.W.*.P.H.*.E.Y.Y}$$$$
90
- PEPTIDE1{F.Y.N.R.Q.W.Y.V.*}$$$$
91
- PEPTIDE1{F.K.P.*.Q.*.S.E.Y.V}$$$$`,
44
+ [Samples.fastaPt]: `seq
45
+ FWPHEYFWPHEY
46
+ YNRQWYVYNRQWYV
47
+ MKPSEYVMKPSEYV`,
48
+ [Samples.separatorPt]: `seq
49
+ F-W-P-H-E-Y-F-W-P-H-E-Y
50
+ Y-N-R-Q-W-Y-V-Y-N-R-Q-W-Y-V
51
+ M-K-P-S-E-Y-V-M-K-P-S-E-Y-V`,
52
+ [Samples.helmPt]: `seq
53
+ PEPTIDE1{F.W.P.H.E.Y.F.W.P.H.E.Y}$$$$
54
+ PEPTIDE1{Y.N.R.Q.W.Y.V.Y.N.R.Q.W.Y.V}$$$$
55
+ PEPTIDE1{M.K.P.S.E.Y.V.M.K.P.S.E.Y.V}$$$$`,
56
+ [Samples.fastaDna]: `seq
57
+ ACGTCACGTC
58
+ CAGTGTCAGTGT
59
+ TTCAACTTCAAC`,
60
+ [Samples.separatorDna]: `seq
61
+ A/C/G/T/C/A/C/G/T/C
62
+ C/A/G/T/G/T/C/A/G/T/G/T
63
+ T/T/C/A/A/C/T/T/C/A/A/C`,
64
+ [Samples.helmDna]: `seq
65
+ RNA1{d(A)p.d(C)p.d(G)p.d(T)p.d(C)p.d(A)p.d(C)p.d(G)p.d(T)p.d(C)p}$$$$
66
+ RNA1{d(C)p.d(A)p.d(G)p.d(T)p.d(G)p.d(T)p.d(C)p.d(A)p.d(G)p.d(T)p.d(G)p.d(T)p}$$$$
67
+ RNA1{d(T)p.d(T)p.d(C)p.d(A)p.d(A)p.d(C)p.d(T)p.d(T)p.d(C)p.d(A)p.d(A)p.d(C)p}$$$$`,
68
+ [Samples.fastaRna]: `seq
69
+ ACGUCACGUC
70
+ CAGUGUCAGUGU
71
+ UUCAACUUCAAC`,
72
+ [Samples.separatorRna]: `seq
73
+ A*C*G*U*C*A*C*G*U*C
74
+ C*A*G*U*G*U*C*A*G*U*G*U
75
+ U*U*C*A*A*C*U*U*C*A*A*C`,
76
+ [Samples.helmRna]: `seq
77
+ RNA1{r(A)p.r(C)p.r(G)p.r(U)p.r(C)p.r(A)p.r(C)p.r(G)p.r(U)p.r(C)p}$$$$
78
+ RNA1{r(C)p.r(A)p.r(G)p.r(U)p.r(G)p.r(U)p.r(C)p.r(A)p.r(G)p.r(U)p.r(G)p.r(U)p}$$$$
79
+ RNA1{r(U)p.r(U)p.r(C)p.r(A)p.r(A)p.r(C)p.r(U)p.r(U)p.r(C)p.r(A)p.r(A)p.r(C)p}$$$$`,
80
+ [Samples.fastaGaps]: `seq
81
+ FW-PH-EYYFW-PH-EYY
82
+ FYNRQWYV-FYNRQWYV-
83
+ FKP-Q-SEYVFKP-Q-SEYV`,
84
+ [Samples.separatorGaps]: `seq
85
+ F/W//P/H//E/Y/Y/F/W//P/H//E/Y/Y
86
+ F/Y/N/R/Q/W/Y/V//F/Y/N/R/Q/W/Y/V/
87
+ F/K/P//Q//S/E/Y/V/F/K/P//Q//S/E/Y/V`,
88
+ [Samples.helmGaps]: `seq
89
+ PEPTIDE1{F.W.*.P.H.*.E.Y.Y.F.W.*.P.H.*.E.Y.Y}$$$$
90
+ PEPTIDE1{F.Y.N.R.Q.W.Y.V.*.F.Y.N.R.Q.W.Y.V.*}$$$$
91
+ PEPTIDE1{F.K.P.*.Q.*.S.E.Y.V.F.K.P.*.Q.*.S.E.Y.V}$$$$`,
92
92
 
93
- fastaUn: `seq
94
- [meI][hHis][Aca]NT[dE][Thr_PO3H2][Aca]D
95
- [meI][hHis][Aca][Cys_SEt]T[dK][Thr_PO3H2][Aca][Tyr_PO3H2]
96
- [Lys_Boc][hHis][Aca][Cys_SEt]T[dK][Thr_PO3H2][Aca][Tyr_PO3H2]`,
97
- separatorUn: `seq
98
- meI-hHis-Aca-N-T-dE-Thr_PO3H2-Aca-D
99
- meI-hHis-Aca-Cys_SEt-T-dK-Thr_PO3H2-Aca-Tyr_PO3H2
100
- Lys_Boc-hHis-Aca-Cys_SEt-T-dK-Thr_PO3H2-Aca-Tyr_PO3H2`,
101
- helmUn: `seq
102
- PEPTIDE1{meI.hHis.Aca.N.T.dE.Thr_PO3H2.Aca.D}$$$$
103
- PEPTIDE1{meI.hHis.Aca.Cys_SEt.T.dK.Thr_PO3H2.Aca.Tyr_PO3H2}$$$$
104
- PEPTIDE1{Lys_Boc.hHis.Aca.Cys_SEt.T.dK.Thr_PO3H2.Aca.Tyr_PO3H2}$$$$`,
105
- helmLoneDeoxyribose: `seq
106
- RNA1{d(A).d(C).d(G).d(T).d(C)}$$$$
107
- RNA1{d(C).d(A).d(G).d(T).d(G).d(T)p}$$$$
108
- RNA1{d(T).d(T).d(C).d(A).d(A).d(C)p}$$$$`,
109
- helmLoneRibose: `seq
110
- RNA1{r(A).r(C).r(G).r(U).r(C)}$$$$
111
- RNA1{r(C).r(A).r(G).r(U).r(G).r(U)p}$$$$
112
- RNA1{r(U).r(U).r(C).r(A).r(A).r(C)p}$$$$`,
113
- helmLonePhosphorus: `seq
114
- RNA1{p.p.r(A)p.r(C)p.r(G)p.r(U)p.r(C)p}$$$$
115
- RNA1{p.p.r(C)p.r(A)p.p.r(G)p.r(U)p.r(G)p.r(U)p}$$$$
116
- RNA1{p.r(U)p.r(U)p.r(C)p.r(A)p.r(A)p.r(C)p.p.p}$$$$`,
93
+ [Samples.fastaUn]: `seq
94
+ [meI][hHis][Aca]NT[dE][Thr_PO3H2][Aca]D[meI][hHis][Aca]NT[dE][Thr_PO3H2][Aca]D
95
+ [meI][hHis][Aca][Cys_SEt]T[dK][Thr_PO3H2][Aca][Tyr_PO3H2][meI][hHis][Aca][Cys_SEt]T[dK][Thr_PO3H2][Aca][Tyr_PO3H2]
96
+ [Lys_Boc][hHis][Aca][Cys_SEt]T[dK][Thr_PO3H2][Aca][Tyr_PO3H2][Lys_Boc][hHis][Aca][Cys_SEt]T[dK][Thr_PO3H2][Aca]`,
97
+ [Samples.separatorUn]: `seq
98
+ meI-hHis-Aca-N-T-dE-Thr_PO3H2-Aca-D-meI-hHis-Aca-N-T-dE-Thr_PO3H2-Aca-D
99
+ meI-hHis-Aca-Cys_SEt-T-dK-Thr_PO3H2-Aca-Tyr_PO3H2-meI-hHis-Aca-Cys_SEt-T-dK-Thr_PO3H2-Aca-Tyr_PO3H2
100
+ Lys_Boc-hHis-Aca-Cys_SEt-T-dK-Thr_PO3H2-Aca-Tyr_PO3H2-Lys_Boc-hHis-Aca-Cys_SEt-T-dK-Thr_PO3H2-Aca`,
101
+ [Samples.helmUn]: `seq
102
+ PEPTIDE1{meI.hHis.Aca.N.T.dE.Thr_PO3H2.Aca.D.meI.hHis.Aca.N.T.dE.Thr_PO3H2.Aca.D}$$$$
103
+ PEPTIDE1{meI.hHis.Aca.Cys_SEt.T.dK.Thr_PO3H2.Aca.Tyr_PO3H2.meI.hHis.Aca.Cys_SEt.T.dK.Thr_PO3H2.Aca.Tyr_PO3H2}$$$$
104
+ PEPTIDE1{Lys_Boc.hHis.Aca.Cys_SEt.T.dK.Thr_PO3H2.Aca.Tyr_PO3H2.Lys_Boc.hHis.Aca.Cys_SEt.T.dK.Thr_PO3H2.Aca}$$$$`,
105
+ [Samples.helmLoneDeoxyribose]: `seq
106
+ RNA1{d(A).d(C).d(G).d(T).d(C).d(A).d(C).d(G).d(T).d(C)}$$$$
107
+ RNA1{d(C).d(A).d(G).d(T).d(G).d(T)p.d(C).d(A).d(G).d(T).d(G).d(T)p}$$$$
108
+ RNA1{d(T).d(T).d(C).d(A).d(A).d(C)p.d(T).d(T).d(C).d(A).d(A).d(C)p}$$$$`,
109
+ [Samples.helmLoneRibose]: `seq
110
+ RNA1{r(A).r(C).r(G).r(U).r(C).r(A).r(C).r(G).r(U).r(C)}$$$$
111
+ RNA1{r(C).r(A).r(G).r(U).r(G).r(U)p.r(C).r(A).r(G).r(U).r(G).r(U)p}$$$$
112
+ RNA1{r(U).r(U).r(C).r(A).r(A).r(C)p.r(U).r(U).r(C).r(A).r(A).r(C)p}$$$$`,
113
+ [Samples.helmLonePhosphorus]: `seq
114
+ RNA1{p.p.r(A)p.r(C)p.r(G)p.r(U)p.r(C)p.r(A)p.r(C)p.r(G)p.r(U)p.r(C)p}$$$$
115
+ RNA1{p.p.r(C)p.r(A)p.p.r(G)p.r(U)p.r(G)p.r(U)p.r(C)p.r(A)p.p.r(G)p.r(U)p.r(G)p.r(U)p}$$$$
116
+ RNA1{p.r(U)p.r(U)p.r(C)p.r(A)p.r(A)p.r(C)p.r(U)p.r(U)p.r(C)p.r(A)p.r(A)p.r(C)p.p.p}$$$$`,
117
117
  };
118
118
 
119
119
  /** Also detects semantic types
@@ -39,11 +39,11 @@ category('detectorsBenchmark', () => {
39
39
  });
40
40
 
41
41
  test('separatorDnaShorts50Many1E6', async () => {
42
- await detectMacromoleculeBenchmark(10, NOTATION.SEPARATOR, ALPHABET.DNA, 50, 1E6, '/');
42
+ await detectMacromoleculeBenchmark(20, NOTATION.SEPARATOR, ALPHABET.DNA, 50, 1E6, '/');
43
43
  });
44
44
 
45
45
  test('separatorDnaLong1e6Few50', async () => {
46
- await detectMacromoleculeBenchmark(10, NOTATION.SEPARATOR, ALPHABET.DNA, 1E6, 50, '/');
46
+ await detectMacromoleculeBenchmark(20, NOTATION.SEPARATOR, ALPHABET.DNA, 1E6, 50, '/');
47
47
  });
48
48
 
49
49
  async function detectMacromoleculeBenchmark(
@@ -71,63 +71,63 @@ CCCCN1C(=O)CN=C(c2cc(F)ccc12)C3CCCCC3
71
71
  C1CCCCC1
72
72
  CCCCCC`;
73
73
  [csvTests.fastaDna1]: string = `seq
74
- ACGTC
75
- CAGTGT
76
- TTCAAC`;
74
+ ACGTCACGTC
75
+ CAGTGTCAGTGT
76
+ TTCAACTTCAAC`;
77
77
  [csvTests.fastaRna1]: string = `seq
78
- ACGUC
79
- CAGUGU
80
- UUCAAC`;
78
+ ACGUCACGUC
79
+ CAGUGUCAGUGU
80
+ UUCAACUUCAAC`;
81
81
  /** Pure amino acids sequence */
82
82
  [csvTests.fastaPt1]: string = `seq
83
83
  FWPHEY
84
84
  YNRQWYV
85
85
  MKPSEYV`;
86
86
  [csvTests.fastaUn]: string = `seq
87
- [meI][hHis][Aca]NT[dE][Thr_PO3H2][Aca]D
88
- [meI][hHis][Aca][Cys_SEt]T[dK][Thr_PO3H2][Aca][Tyr_PO3H2]
89
- [Lys_Boc][hHis][Aca][Cys_SEt]T[dK][Thr_PO3H2][Aca][Tyr_PO3H2]`;
87
+ [meI][hHis][Aca]NT[dE][Thr_PO3H2][Aca]DN
88
+ [meI][hHis][Aca][Cys_SEt]T[dK][Thr_PO3H2][Aca][Tyr_PO3H2][Aca]
89
+ [Lys_Boc][hHis][Aca][Cys_SEt]T[dK][Thr_PO3H2][Aca][Tyr_PO3H2][Aca]`;
90
90
  [csvTests.sepDna]: string = `seq
91
- A*C*G*T*C
92
- C*A*G*T*G*T
93
- T*T*C*A*A*C`;
91
+ A*C*G*T*C*A*C*G*T*C
92
+ C*A*G*T*G*T*C*A*G*T*G*T
93
+ T*T*C*A*A*C*T*T*C*A*A*C`;
94
94
  [csvTests.sepRna]: string = `seq
95
- A*C*G*U*C
96
- C*A*G*U*G*U
97
- U*U*C*A*A*C`;
95
+ A*C*G*U*C*A*C*G*U*C
96
+ C*A*G*U*G*U*C*A*G*U*G*U
97
+ U*U*C*A*A*C*U*U*C*A*A*C`;
98
98
  [csvTests.sepPt]: string = `seq
99
- F-W-P-H-E-Y
100
- Y-N-R-Q-W-Y-V
101
- M-K-P-S-E-Y-V`;
99
+ F-W-P-H-E-Y-F-W-P-H-E-Y
100
+ Y-N-R-Q-W-Y-V-Y-N-R-Q-W-Y-V
101
+ M-K-P-S-E-Y-V-M-K-P-S-E-Y-V`;
102
102
  [csvTests.sepUn1]: string = `seq
103
- abc-dfgg-abc1-cfr3-rty-wert
104
- rut12-her2-rty-wert-abc-abc1-dfgg
105
- rut12-rty-her2-abc-cfr3-wert-rut12`;
103
+ abc-dfgg-abc1-cfr3-rty-wert-cfr3-rty-wert
104
+ rut12-her2-rty-wert-abc-abc1-dfgg-abc-abc1-dfgg
105
+ rut12-rty-her2-abc-cfr3-wert-rut12-cfr3-wert-rut12`;
106
106
  [csvTests.sepUn2]: string = `seq
107
- abc/dfgg/abc1/cfr3/rty/wert
108
- rut12/her2/rty/wert//abc/abc1/dfgg
109
- rut12/rty/her2/abc/cfr3//wert/rut12`;
107
+ abc/dfgg/abc1/cfr3/rty/wert/abc/dfgg/abc1/cfr3/rty/wert
108
+ rut12/her2/rty/wert//abc/abc1/dfgg/rut12/her2/rty/wert//abc/abc1/dfgg
109
+ rut12/rty/her2/abc/cfr3//wert/rut12/rut12/rty/her2/abc/cfr3//wert/rut12`;
110
110
  [csvTests.sepMsaDna1]: string = `seq
111
- A-C--G-T--C-T
112
- C-A-C--T--G-T
113
- A-C-C-G-T-A-C-T`;
111
+ A-C--G-T--C-T-A-C--G-T--C-T
112
+ C-A-C--T--G-T-C-A-C--T--G-T
113
+ A-C-C-G-T-A-C-T-A-C-C-G-T-A-C-T`;
114
114
  [csvTests.sepMsaUnWEmpty]: string = `seq
115
- m1-M-m3-mon4-mon5-N-T-MON8-N9
116
- m1-mon2-m3-mon4-mon5-Num--MON8-N9
115
+ m1-M-m3-mon4-mon5-N-T-MON8-N9-m1-M-m3-mon4-mon5-N-T-MON8-N9
116
+ m1-mon2-m3-mon4-mon5-Num--MON8-N9-m1-mon2-m3-mon4-mon5-Num--MON8-N9
117
117
 
118
- mon1-M-mon3-mon4-mon5---MON8-N9`;
118
+ mon1-M-mon3-mon4-mon5---MON8-N9-mon1-M-mon3-mon4-mon5---MON8-N9`;
119
119
  [csvTests.sepComplex]: string = `seq
120
120
  Ac(1)-F-K(AEEA-AEEA-R-Ac)-L-mF-V-Y-mNle-D-W-N-mF-C(1)-G-NH2
121
121
  Ac(1)-F-K(AEEA-ARRA-W-Ac)-L-mF-V-Y-mNle-D-W-N-mF-C(1)-G-NH2
122
122
  Ac(1)-F-K(AEEA-AEEA-Ac)-L-mF-V-Y-mNle-D-W-N-mF-C(1)-G-NH2`;
123
123
  [csvTests.fastaMsaDna1]: string = `seq
124
- AC-GT-CT
125
- CAC-T-GT
126
- ACCGTACT`;
124
+ AC-GT-CTAC-GT-CT
125
+ CAC-T-GTCAC-T-GT
126
+ ACCGTACTACCGTACT`;
127
127
  [csvTests.fastaMsaPt1]: string = `seq
128
- FWR-WYV-KHP
129
- YNR-WYV-KHP
130
- MWRSWY-CKHP`;
128
+ FWR-WYV-KHPFWR-WYV-KHP
129
+ YNR-WYV-KHPYNR-WYV-KHP
130
+ MWRSWY-CKHPMWRSWY-CKHP`;
131
131
  }();
132
132
 
133
133
  const enum Samples {