@datagrok/bio 2.22.12 → 2.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ import wu from 'wu';
6
6
 
7
7
  /* eslint-disable max-len */
8
8
  import {ALIGNMENT, ALPHABET, candidateAlphabets, getSplitterWithSeparator, NOTATION, positionSeparator, splitterAsFasta, splitterAsHelm, TAGS} from '@datagrok-libraries/bio/src/utils/macromolecule/index';
9
- import {INotationProvider, ISeqSplitted, SeqColStats, SplitterFunc,} from '@datagrok-libraries/bio/src/utils/macromolecule/types';
9
+ import {INotationProvider, ISeqConnection, ISeqSplitted, SeqColStats, SplitterFunc,} from '@datagrok-libraries/bio/src/utils/macromolecule/types';
10
10
  import {detectAlphabet, detectHelmAlphabet, splitterAsFastaSimple, StringListSeqSplitted} from '@datagrok-libraries/bio/src/utils/macromolecule/utils';
11
11
  import {mmDistanceFunctions, MmDistanceFunctionsNames} from '@datagrok-libraries/ml/src/macromolecule-distance-functions';
12
12
  import {mmDistanceFunctionType} from '@datagrok-libraries/ml/src/macromolecule-distance-functions/types';
@@ -28,10 +28,22 @@ export class SeqHandler implements ISeqHandler {
28
28
  protected readonly _units: string; // units, of the form fasta, separator
29
29
  protected readonly _notation: NOTATION; // current notation (without :SEQ:NT, etc.)
30
30
  protected readonly _defaultGapOriginal: string;
31
- protected readonly notationProvider!: INotationProvider;
31
+ private _notationProvider: INotationProvider | null = null;
32
+ private _tempReadForNotProvider = false;
33
+ protected get notationProvider(): INotationProvider | null {
34
+ if (!this._tempReadForNotProvider) {
35
+ this._tempReadForNotProvider = true;
36
+ this._notationProvider = this._notationProvider ?? this._column.temp[SeqTemps.notationProvider] ?? null;
37
+ }
38
+ return this._notationProvider;
39
+ };
40
+ protected set notationProvider(v: INotationProvider | null) { this._notationProvider = v; };
32
41
 
33
42
  private _splitter: SplitterFunc | null = null;
34
43
 
44
+ private _refinerPromise: Promise<void> = Promise.resolve();
45
+ public get refinerPromise(): Promise<void> { return this._refinerPromise; }
46
+
35
47
  protected constructor(col: DG.Column<string>,
36
48
  private readonly seqHelper: SeqHelper,
37
49
  ) {
@@ -44,7 +56,7 @@ export class SeqHandler implements ISeqHandler {
44
56
  this._units = units!;
45
57
 
46
58
  this._notation = this.getNotation();
47
- if (this.isCustom()) {
59
+ if (this.isCustom() || this.isBiln()) {
48
60
  // this.column.temp[SeqTemps.notationProvider] must be set at detector stage
49
61
  this.notationProvider = this.column.temp[SeqTemps.notationProvider] ?? null;
50
62
  }
@@ -52,7 +64,7 @@ export class SeqHandler implements ISeqHandler {
52
64
  const defaultGapOriginal = this.isFasta() ? GapOriginals[NOTATION.FASTA] :
53
65
  this.isSeparator() ? GapOriginals[NOTATION.SEPARATOR] :
54
66
  this.isHelm() ? GapOriginals[NOTATION.HELM] :
55
- this.isCustom() ? (this.notationProvider?.defaultGapOriginal ?? GapOriginals[NOTATION.SEPARATOR]) :
67
+ this.isCustom() || this.isBiln() ? (this.notationProvider?.defaultGapOriginal ?? GapOriginals[NOTATION.SEPARATOR]) :
56
68
  undefined;
57
69
  if (defaultGapOriginal == undefined)
58
70
  throw new Error(`Unexpected defaultGapOriginal for notation '${this.notation}'`);
@@ -70,8 +82,8 @@ export class SeqHandler implements ISeqHandler {
70
82
  this.seqHelper.setUnitsToSeparatorColumn(this, separator);
71
83
  } else if (this.isHelm())
72
84
  this.seqHelper.setUnitsToHelmColumn(this);
73
- else if (this.isCustom())
74
- this.notationProvider!.setUnits(this);
85
+ else if (this.isCustom() || this.isBiln())
86
+ this.notationProvider?.setUnits(this);
75
87
  else
76
88
  throw new Error(`Unexpected units '${this.column.meta.units}'.`);
77
89
  }
@@ -96,8 +108,8 @@ export class SeqHandler implements ISeqHandler {
96
108
 
97
109
  this.columnVersion = this.column.version;
98
110
  // refine separator only at this stage
99
- if (this.isSeparator() && (!this.isCustom() || !this.notationProvider) && !col.temp['seqHandlerRefined']) {
100
- this.refineSeparatorNotation();
111
+ if (this.isSeparator() && ((!this.isCustom() && !this.isBiln()) || !this.notationProvider) && !col.temp['seqHandlerRefined']) {
112
+ this._refinerPromise = this.refineSeparatorNotation();
101
113
  col.temp['seqHandlerRefined'] = true;
102
114
  }
103
115
  }
@@ -138,16 +150,11 @@ export class SeqHandler implements ISeqHandler {
138
150
  const stats = getStats(categoriesSample, 3, (s) => s.split(this.separator!));
139
151
  let invalidateRequired = false;
140
152
 
141
- const refinerList = [
142
- {package: 'SequenceTranslator', name: 'refineNotationProviderForHarmonizedSequence'},
143
- ];
153
+ const refinerList = DG.Func.find({tags: ['notationRefiner']});
144
154
 
145
155
  for (const refineFuncFind of refinerList) {
146
156
  try {
147
- const funcList = DG.Func.find(refineFuncFind);
148
- if (funcList.length === 0) continue;
149
-
150
- const funcFc = funcList[0].prepare({col: this.column, stats: stats, separator: this.separator});
157
+ const funcFc = refineFuncFind.prepare({col: this.column, stats: stats, separator: this.separator});
151
158
  const refineRes = (await funcFc.call()).getOutputParamValue();
152
159
  invalidateRequired ||= refineRes;
153
160
  } catch (err) {
@@ -157,6 +164,8 @@ export class SeqHandler implements ISeqHandler {
157
164
 
158
165
  if (invalidateRequired) {
159
166
  // Applying custom notation provider MUST invalidate SeqHandler
167
+ // Some consumers may still hold this (old) SeqHandler instance; reset the flag so its notationProvider getter looks in column.temp again on next access (when no provider is cached yet)
168
+ this._tempReadForNotProvider = false;
160
169
  delete this.column.temp[SeqTemps.seqHandler];
161
170
 
162
171
  this.column.fireValuesChanged();
@@ -371,7 +380,7 @@ export class SeqHandler implements ISeqHandler {
371
380
  const seq = this.column.get(rowIdx);
372
381
  if (this.notation === NOTATION.HELM)
373
382
  resHelm = seq;
374
- else if (this.notation === NOTATION.CUSTOM)
383
+ else if (this.notation === NOTATION.CUSTOM || this.notation === NOTATION.BILN)
375
384
  resHelm = this.notationProvider!.getHelm(seq, {});
376
385
  else
377
386
  resHelm = this.getConverter(NOTATION.HELM)(seq);
@@ -440,6 +449,8 @@ export class SeqHandler implements ISeqHandler {
440
449
 
441
450
  public isCustom(): boolean { return this.notation === NOTATION.CUSTOM; }
442
451
 
452
+ public isBiln(): boolean { return this.notation === NOTATION.BILN; }
453
+
443
454
  public isRna(): boolean { return this.alphabet === ALPHABET.RNA; }
444
455
 
445
456
  public isDna(): boolean { return this.alphabet === ALPHABET.DNA; }
@@ -471,6 +482,8 @@ export class SeqHandler implements ISeqHandler {
471
482
  return NOTATION.HELM;
472
483
  else if (this.units.toLowerCase().startsWith(NOTATION.CUSTOM))
473
484
  return NOTATION.CUSTOM;
485
+ else if (this.units.toLowerCase().startsWith(NOTATION.BILN))
486
+ return NOTATION.BILN;
474
487
  else
475
488
  throw new Error(`Column '${this.column.name}' has unexpected notation '${this.units}'.`);
476
489
  }
@@ -538,6 +551,12 @@ export class SeqHandler implements ISeqHandler {
538
551
  newColumn.setTag(TAGS.alphabetSize, srcAlphabetSize);
539
552
  }
540
553
 
554
+ // If the target notation is BILN, mark the new column as a separator column with '-' as the separator; it is refined later
555
+ if (tgtNotation === NOTATION.BILN) {
556
+ newColumn.setTag(TAGS.separator, '-');
557
+ newColumn.meta.units = NOTATION.SEPARATOR;
558
+ }
559
+
541
560
  return newColumn;
542
561
  }
543
562
 
@@ -828,6 +847,10 @@ export class SeqHandler implements ISeqHandler {
828
847
  res = function(srcSS: ISeqSplitted): string { return joinToHelm(srcSS, wrappers, isDnaOrRna); };
829
848
  break;
830
849
  }
850
+ case NOTATION.BILN: {
851
+ res = function(srcSS: ISeqSplitted): string { return joinToBiln(srcSS); };
852
+ break;
853
+ }
831
854
  default:
832
855
  throw new Error(`Unexpected notation '${notation}'.`);
833
856
  }
@@ -846,8 +869,10 @@ export class SeqHandler implements ISeqHandler {
846
869
  return function(srcSeq: string) { return srcSh.convertToHelm(srcSeq); };
847
870
  else if (tgtUnits === NOTATION.SEPARATOR)
848
871
  return function(srcSeq: string) { return srcSh.convertToSeparator(srcSeq, tgtSeparator!); };
872
+ else if (tgtUnits === NOTATION.BILN)
873
+ return function(srcSeq: string) { return srcSh.convertToBiln(srcSeq); };
849
874
  else
850
- throw new Error();
875
+ throw new Error('Unexpected target units \'' + tgtUnits + '\'.');
851
876
  }
852
877
 
853
878
  /** Gets a column's UnitsHandler object from temp slot or creates a new and stores it to the temp slot. */
@@ -897,11 +922,17 @@ export class SeqHandler implements ISeqHandler {
897
922
 
898
923
  const wrappers = this.getHelmWrappers();
899
924
 
900
- const isDnaOrRna = src.startsWith('DNA') || src.startsWith('RNA');
925
+ const isDnaOrRna = this.isDna() || this.isRna();
901
926
  const srcSS = this.splitter(src);
902
927
  return joinToHelm(srcSS, wrappers, isDnaOrRna);
903
928
  }
904
929
 
930
+ private convertToBiln(src: string): string {
931
+ if (this.notation == NOTATION.BILN) return src;
932
+ const srcSS = this.splitter(src);
933
+ return joinToBiln(srcSS);
934
+ }
935
+
905
936
  /** Splits Helm sequence adjusting nucleotides to single char symbols. (!) Removes lone phosphorus. */
906
937
  private splitterAsHelmNucl(src: string): ISeqSplitted {
907
938
  const srcMList: ISeqSplitted = this.splitter(src);
@@ -921,11 +952,11 @@ export class SeqHandler implements ISeqHandler {
921
952
 
922
953
  // Custom notation provider
923
954
 
924
- getRendererBack(gridCol: DG.GridColumn | null, tableCol: DG.Column<string>): CellRendererBackBase<string> {
955
+ getRendererBack(gridCol: DG.GridColumn | null, tableCol: DG.Column<string>): CellRendererBackBase<string> | null {
925
956
  const temp = this.column.temp as GridCellRendererTemp<any>;
926
957
  let res = temp.rendererBack;
927
958
  if (!res)
928
- res = temp.rendererBack = this.notationProvider!.createCellRendererBack(gridCol, tableCol);
959
+ res = temp.rendererBack = this.notationProvider?.createCellRendererBack(gridCol, tableCol);
929
960
  return res;
930
961
  }
931
962
  }
@@ -950,20 +981,95 @@ function joinToSeparator(seqS: ISeqSplitted, tgtSeparator: string, isHelm: boole
950
981
  }
951
982
 
952
983
  function joinToHelm(srcSS: ISeqSplitted, wrappers: string[], isDnaOrRna: boolean): string {
953
- const [prefix, leftWrapper, rightWrapper, postfix] = wrappers;
954
- const resOMList: string[] = new Array<string>(srcSS.length);
955
- for (let posIdx: number = 0; posIdx < srcSS.length; ++posIdx) {
956
- const cm = srcSS.getCanonical(posIdx);
957
- let om: string = srcSS.getOriginal(posIdx);
958
- if (cm === GAP_SYMBOL)
959
- om = GapOriginals[NOTATION.HELM];
960
- else {
961
- if (isDnaOrRna)
962
- om = om.replace(HELM_WRAPPERS_REGEXP, '$1');
963
- om = om.length === 1 ? `${leftWrapper}${om}${rightWrapper}` : `${leftWrapper}[${om}]${rightWrapper}`;
984
+ if (!srcSS.graphInfo || !((srcSS.graphInfo.connections?.length ?? 0) > 0)) {
985
+ // no graph info, or graph info without any connections - treat as a linear sequence
986
+ const [prefix, leftWrapper, rightWrapper, postfix] = wrappers;
987
+ const resOMList: string[] = new Array<string>(srcSS.length);
988
+ for (let posIdx: number = 0; posIdx < srcSS.length; ++posIdx) {
989
+ const cm = srcSS.getCanonical(posIdx);
990
+ let om: string = cm;
991
+ if (cm === GAP_SYMBOL)
992
+ om = GapOriginals[NOTATION.HELM];
993
+ else {
994
+ if (isDnaOrRna)
995
+ om = om.replace(HELM_WRAPPERS_REGEXP, '$1');
996
+ om = om.length === 1 ? `${leftWrapper}${om}${rightWrapper}` : `${leftWrapper}[${om}]${rightWrapper}`;
997
+ }
998
+ resOMList[posIdx] = om;
999
+ }
1000
+ return `${prefix}${resOMList.join('.')}${postfix}`;
1001
+ } else {
1002
+ // graph info with connections is present: emit every disjoint sequence as its own HELM polymer, followed by the connections section
1003
+ const seqType = isDnaOrRna ? 'RNA' : 'PEPTIDE';
1004
+ const postFix = '$$$'; // three dollar signs - one is placed at the end of sequences
1005
+ const disjointSequenceIdxs = srcSS.graphInfo.disjointSeqStarts;
1006
+ const leftWrapper = wrappers[1];
1007
+ const rightWrapper = wrappers[2];
1008
+ const disjointSequences: string[] = [];
1009
+ for (let i = 0; i < disjointSequenceIdxs.length; i++) {
1010
+ const startIdx = disjointSequenceIdxs[i];
1011
+ const endIdx = i + 1 < disjointSequenceIdxs.length ? disjointSequenceIdxs[i + 1] : srcSS.length;
1012
+ const resOMList: string[] = new Array<string>(endIdx - startIdx);
1013
+ for (let posIdx = startIdx; posIdx < endIdx; ++posIdx) {
1014
+ const cm = srcSS.getCanonical(posIdx);
1015
+ let om: string = cm;
1016
+ if (cm === GAP_SYMBOL)
1017
+ om = GapOriginals[NOTATION.HELM];
1018
+ else {
1019
+ if (isDnaOrRna)
1020
+ om = om.replace(HELM_WRAPPERS_REGEXP, '$1');
1021
+ om = om.length === 1 ? `${leftWrapper}${om}${rightWrapper}` : `${leftWrapper}[${om}]${rightWrapper}`;
1022
+ }
1023
+ resOMList[posIdx - startIdx] = om;
1024
+ }
1025
+ disjointSequences.push(`${seqType}${i + 1}{${resOMList.join('.')}}`);
1026
+ }
1027
+ // PEPTIDE2,PEPTIDE2,16:R2-1:R1|PEPTIDE3,PEPTIDE3,16:R2-1:R1|PEPTIDE3,PEPTIDE2,10:R3-1:R3|PEPTIDE1,PEPTIDE2,1:R2-9:R3$$$V2.0
1028
+ const sequencePart = disjointSequences.join('|');
1029
+
1030
+ const sequenceConnections = srcSS.graphInfo.connections.map((conn) => {
1031
+ return `${seqType}${conn.seqIndex1 + 1},${seqType}${conn.seqIndex2 + 1},${conn.monomerIndex1 + 1}:R${conn.rGroup1}-${conn.monomerIndex2 + 1}:R${conn.rGroup2}`;
1032
+ }).join('|');
1033
+
1034
+ return `${sequencePart}$${sequenceConnections}${postFix}V2.0`;
1035
+ }
1036
+ }
1037
+
1038
+ function joinToBiln(srcSS: ISeqSplitted): string {
1039
+ if (!srcSS.graphInfo || !((srcSS.graphInfo.connections?.length ?? 0) > 0)) {
1040
+ const resOMList: string[] = new Array<string>(srcSS.length);
1041
+ for (let posIdx: number = 0; posIdx < srcSS.length; ++posIdx) {
1042
+ resOMList[posIdx] = srcSS.getCanonical(posIdx);
1043
+ if (resOMList[posIdx]?.includes('-')) // Biln uses '-' as a separator, need to enclose in []
1044
+ resOMList[posIdx] = `[${resOMList[posIdx]}]`;
1045
+ }
1046
+ return resOMList.join('-'); // Biln uses '-' as a separator
1047
+ } else { // conversion happens only if there is a graph info
1048
+ const disjointSequenceIdxs = srcSS.graphInfo.disjointSeqStarts;
1049
+ const allSeqParts = new Array<string>(srcSS.length);
1050
+ for (let posIdx = 0; posIdx < srcSS.length; ++posIdx) {
1051
+ allSeqParts[posIdx] = srcSS.getCanonical(posIdx);
1052
+ if (allSeqParts[posIdx]?.includes('-')) // Biln uses '-' as a separator, need to enclose in []
1053
+ allSeqParts[posIdx] = `[${allSeqParts[posIdx]}]`;
964
1054
  }
965
- resOMList[posIdx] = om;
1055
+ for (let i = 0; i < srcSS.graphInfo.connections.length; i++) {
1056
+ const conn: ISeqConnection = srcSS.graphInfo.connections[i];
1057
+ const conId = `${i + 1}`;
1058
+ const seq1Idx = conn.seqIndex1;
1059
+ const seq2Idx = conn.seqIndex2;
1060
+ const monomer1Idx = disjointSequenceIdxs[seq1Idx] + conn.monomerIndex1;
1061
+ const monomer2Idx = disjointSequenceIdxs[seq2Idx] + conn.monomerIndex2;
1062
+ const seqPart1 = `${allSeqParts[monomer1Idx]}(${conId},${conn.rGroup1})`;
1063
+ const seqPart2 = `${allSeqParts[monomer2Idx]}(${conId},${conn.rGroup2})`;
1064
+ allSeqParts[monomer1Idx] = seqPart1;
1065
+ allSeqParts[monomer2Idx] = seqPart2;
1066
+ }
1067
+
1068
+ const disjointParts = disjointSequenceIdxs.map((startIdx, i) => {
1069
+ const endIdx = i + 1 < disjointSequenceIdxs.length ? disjointSequenceIdxs[i + 1] : srcSS.length;
1070
+ return allSeqParts.slice(startIdx, endIdx).join('-');
1071
+ });
1072
+ return disjointParts.join('.'); // Biln uses '-' as a separator and '.' between disjoint sequences
966
1073
  }
967
- return `${prefix}${resOMList.join('.')}${postfix}`;
968
1074
  }
969
1075
 
@@ -136,7 +136,7 @@ export class SeqHelper implements ISeqHelper {
136
136
  }
137
137
 
138
138
  public setUnitsToSeparatorColumn(uh: SeqHandler, separator?: string) {
139
- if (uh.column.semType !== DG.SEMTYPE.MACROMOLECULE || uh.column.meta.units !== NOTATION.SEPARATOR)
139
+ if (uh.column.semType !== DG.SEMTYPE.MACROMOLECULE)
140
140
  throw new Error(`The column of notation '${NOTATION.SEPARATOR}' must be '${DG.SEMTYPE.MACROMOLECULE}'.`);
141
141
  if (!separator)
142
142
  throw new Error(`The column of notation '${NOTATION.SEPARATOR}' must have the separator tag.`);
@@ -109,7 +109,7 @@ export function getMacromoleculeColumnPropertyPanel(col: DG.Column): DG.Widget {
109
109
  const units = col.meta.units;
110
110
 
111
111
  // Don't show for formats that have their own complex renderers (like Helm).
112
- if (units === NOTATION.HELM || units === NOTATION.CUSTOM)
112
+ if (units === NOTATION.HELM)
113
113
  return false;
114
114
 
115
115
  // For all other cases, including 'UN' (non-canonical), 'fasta', and 'separator' show the multiline toggle.
@@ -4,6 +4,7 @@ import * as DG from 'datagrok-api/dg';
4
4
  import * as OCL from 'openchemlib/full';
5
5
  import {NOTATION} from '@datagrok-libraries/bio/src/utils/macromolecule';
6
6
  import {_package, PackageFunctions} from '../package';
7
+ import {SeqTemps} from '@datagrok-libraries/bio/src/utils/macromolecule/seq-handler';
7
8
 
8
9
 
9
10
  export async function toAtomicLevelSingle(sequence: DG.SemanticValue): Promise<{mol: string, errorText: string}> {
@@ -17,7 +18,7 @@ export async function toAtomicLevelSingle(sequence: DG.SemanticValue): Promise<{
17
18
  errorText = 'Atomic level conversion requeires a sequence column';
18
19
  return {errorText, mol: ''};
19
20
  }
20
- const supportedUnits: string[] = [NOTATION.FASTA, NOTATION.SEPARATOR, NOTATION.HELM];
21
+ const supportedUnits: string[] = [NOTATION.FASTA, NOTATION.SEPARATOR, NOTATION.HELM, NOTATION.BILN];
21
22
  //todo: add support for custom notations
22
23
  if (!supportedUnits.includes(sequence.cell.column.meta.units?.toLowerCase() ?? '')) {
23
24
  errorText = 'Unsupported sequence notation. please use Bio | Polytool | Convert';
@@ -29,8 +30,8 @@ export async function toAtomicLevelSingle(sequence: DG.SemanticValue): Promise<{
29
30
  errorText = 'No sequence handler found';
30
31
  return {errorText, mol: ''};
31
32
  }
32
- if ((seqSh.getSplitted(sequence.cell.rowIndex, 50)?.length ?? 100) > 40) {
33
- errorText = 'Maximum number of monomers is 40';
33
+ if ((seqSh.getSplitted(sequence.cell.rowIndex, 60)?.length ?? 100) > 50) {
34
+ errorText = 'Maximum number of monomers is 50';
34
35
  return {errorText, mol: ''};
35
36
  }
36
37
  const singleValCol = DG.Column.fromStrings('singleVal', [sequence.value]);
@@ -39,7 +40,14 @@ export async function toAtomicLevelSingle(sequence: DG.SemanticValue): Promise<{
39
40
  Object.entries(sequence.cell.column.tags).forEach(([key, value]) => {
40
41
  singleValCol.setTag(key, value as string);
41
42
  });
42
- await PackageFunctions.toAtomicLevel(sDf, singleValCol, sequence.cell.column.meta.units === NOTATION.HELM, false);
43
+
44
+ // if column has notation provider, we need to copy it over
45
+ if (sequence.cell.column.temp[SeqTemps.notationProvider])
46
+ singleValCol.temp[SeqTemps.notationProvider] = sequence.cell.column.temp[SeqTemps.notationProvider];
47
+ // helm and biln will have cyclization marks, so we need to use POM to convert them
48
+ const shouldUsePOM = (seqSh.getSplitted(sequence.cell.rowIndex).graphInfo?.connections?.length ?? 0) > 0;
49
+ await PackageFunctions.toAtomicLevel(sDf, singleValCol,
50
+ shouldUsePOM, false);
43
51
  if (sDf.columns.length < 2) {
44
52
  errorText = 'No structure generated';
45
53
  return {errorText, mol: ''};