@teselagen/bio-parsers 0.4.15 → 0.4.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,3 @@
1
- export function parseFeatureLocation(locStr: any, isProtein: any, inclusive1BasedStart: any, inclusive1BasedEnd: any, isCircular: any, sequenceLength: any): any[];
1
+ export function parseFeatureLocation(locStr: any, isProtein: any, inclusive1BasedStart: any, inclusive1BasedEnd: any, isCircular: any, sequenceLength: any): any;
2
2
  export default genbankToJson;
3
3
  declare function genbankToJson(string: any, options?: {}): any;
package/index.js CHANGED
@@ -11506,6 +11506,26 @@ function getDegenerateDnaStringFromAAString(aaString) {
11506
11506
  return aaString.split("").map((char) => aminoAcidToDegenerateDnaMap[char.toLowerCase()] || "nnn").join("");
11507
11507
  }
11508
11508
  __name(getDegenerateDnaStringFromAAString, "getDegenerateDnaStringFromAAString");
11509
/**
 * Build an amino-acid string from a sequence string.
 *
 * Per-base data is obtained from getAminoAcidDataForEachBaseOfDna (called with
 * `true` as its second argument). Each entry's `aminoAcid.value` is stored at
 * that entry's `aminoAcidIndex`, and the stored values are joined into a string.
 *
 * @param {string} sequenceString sequence passed to getAminoAcidDataForEachBaseOfDna
 * @param {{doNotExcludeAsterisk?: boolean}} [options] when `doNotExcludeAsterisk`
 *   is falsy, "*" entries found in the last three per-base positions are left out
 *   (presumably to drop a terminal stop codon - confirm against callers)
 * @returns {string} the joined amino-acid values
 */
function getAminoAcidStringFromSequenceString(sequenceString, { doNotExcludeAsterisk } = {}) {
  const perBaseData = getAminoAcidDataForEachBaseOfDna(sequenceString, true);
  const tailStart = perBaseData.length - 3;
  const residues = [];
  perBaseData.forEach((entry, position) => {
    // Entries that are not part of a complete codon contribute nothing.
    if (entry.fullCodon) {
      const dropAsterisk =
        !doNotExcludeAsterisk &&
        position >= tailStart &&
        entry.aminoAcid.value === "*";
      if (!dropAsterisk) {
        // Every base of a codon writes the same slot, so each codon yields one value.
        residues[entry.aminoAcidIndex] = entry.aminoAcid.value;
      }
    }
  });
  return residues.join("");
}
11528
+ __name(getAminoAcidStringFromSequenceString, "getAminoAcidStringFromSequenceString");
11509
11529
  function tidyUpSequenceData(pSeqData, options = {}) {
11510
11530
  const {
11511
11531
  annotationsAsObjects,
@@ -11513,6 +11533,7 @@ function tidyUpSequenceData(pSeqData, options = {}) {
11513
11533
  doNotRemoveInvalidChars,
11514
11534
  additionalValidChars,
11515
11535
  noTranslationData,
11536
+ includeProteinSequence,
11516
11537
  doNotProvideIdsForAnnotations,
11517
11538
  noCdsTranslations,
11518
11539
  convertAnnotationsFromAAIndices,
@@ -11546,7 +11567,9 @@ function tidyUpSequenceData(pSeqData, options = {}) {
11546
11567
  }
11547
11568
  if (!doNotRemoveInvalidChars) {
11548
11569
  if (seqData.isProtein) {
11549
- const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadValues({}, topLevelSeqData || seqData));
11570
+ const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadProps(__spreadValues({}, topLevelSeqData || seqData), {
11571
+ isProtein: true
11572
+ }));
11550
11573
  seqData.proteinSequence = newSeq;
11551
11574
  } else {
11552
11575
  const [newSeq] = filterSequenceString(seqData.sequence, __spreadValues({
@@ -11567,6 +11590,10 @@ function tidyUpSequenceData(pSeqData, options = {}) {
11567
11590
  null,
11568
11591
  true
11569
11592
  );
11593
+ } else if (includeProteinSequence) {
11594
+ seqData.proteinSequence = getAminoAcidStringFromSequenceString(
11595
+ seqData.sequence
11596
+ );
11570
11597
  }
11571
11598
  seqData.size = seqData.noSequence ? seqData.size : seqData.sequence.length;
11572
11599
  seqData.proteinSize = seqData.noSequence ? seqData.proteinSize : seqData.proteinSequence.length;
@@ -19787,19 +19814,38 @@ function flattenSequenceArray(parsingResultArray, opts) {
19787
19814
  return parsingResultArray;
19788
19815
  }
19789
19816
  __name(flattenSequenceArray, "flattenSequenceArray");
19817
/**
 * Merge consecutive locations that together span the origin of a circular sequence.
 *
 * When a location ends on the last base of the sequence and the next location
 * starts on the first base, the pair is collapsed into a single location that
 * keeps the first location's start and takes the second location's end.
 *
 * @param {Array<{start: number, end: number}>} locArrayInput locations in order; not modified
 * @param {number} sequenceLength length of the sequence
 * @param {boolean} inclusive1BasedStart when truthy the first base is expected at start 1, otherwise at 0
 * @param {boolean} inclusive1BasedEnd when truthy the last base is expected at end sequenceLength, otherwise sequenceLength - 1
 * @returns {Array<{start: number, end: number}>} a new array of copied locations with wrapping pairs merged
 */
function wrapOriginSpanningFeatures(locArrayInput, sequenceLength, inclusive1BasedStart, inclusive1BasedEnd) {
  // Boundary coordinates are loop-invariant, so compute them once.
  const lastBaseEnd = sequenceLength - (inclusive1BasedEnd ? 0 : 1);
  const firstBaseStart = 1 - (inclusive1BasedStart ? 0 : 1);
  // Copy each location (one level deep) so the caller's objects are left untouched.
  const locArrayOutput = locArrayInput.map((loc) => __spreadValues({}, loc));
  for (let i = 0; i < locArrayOutput.length - 1; i++) {
    const firstFeature = locArrayOutput[i];
    const secondFeature = locArrayOutput[i + 1];
    if (firstFeature.end === lastBaseEnd && secondFeature.start === firstBaseStart) {
      // Keep any additional fields carried by the first location instead of
      // discarding them; only the end coordinate changes.
      locArrayOutput[i] = __spreadProps(__spreadValues({}, firstFeature), {
        end: secondFeature.end
      });
      // Remove the absorbed location.
      locArrayOutput.splice(i + 1, 1);
    }
  }
  return locArrayOutput;
}
19832
+ __name(wrapOriginSpanningFeatures, "wrapOriginSpanningFeatures");
19790
19833
  function parseFeatureLocation(locStr, isProtein, inclusive1BasedStart, inclusive1BasedEnd, isCircular, sequenceLength) {
19791
19834
  locStr = locStr.trim();
19792
- const locArr = [];
19793
- locStr.replace(/(\d+)/g, function(string, match) {
19794
- locArr.push(match);
19835
+ const positionsArray = [];
19836
+ const locationParts = locStr.split(",");
19837
+ locationParts.forEach((locPart) => {
19838
+ const extractedPositions = locPart.match(/(\d+)/g);
19839
+ if (extractedPositions === null) {
19840
+ return;
19841
+ }
19842
+ positionsArray.push(extractedPositions[0]);
19843
+ positionsArray.push(extractedPositions[1] || extractedPositions[0]);
19795
19844
  });
19796
19845
  const locArray = [];
19797
- for (let i = 0; i < locArr.length; i += 2) {
19798
- const start = parseInt(locArr[i], 10) - (inclusive1BasedStart ? 0 : 1);
19799
- let end = parseInt(locArr[i + 1], 10) - (inclusive1BasedEnd ? 0 : 1);
19800
- if (isNaN(end)) {
19801
- end = start;
19802
- }
19846
+ for (let i = 0; i < positionsArray.length; i += 2) {
19847
+ const start = parseInt(positionsArray[i], 10) - (inclusive1BasedStart ? 0 : 1);
19848
+ const end = parseInt(positionsArray[i + 1], 10) - (inclusive1BasedEnd ? 0 : 1);
19803
19849
  const location = {
19804
19850
  start,
19805
19851
  end
@@ -19808,20 +19854,16 @@ function parseFeatureLocation(locStr, isProtein, inclusive1BasedStart, inclusive
19808
19854
  isProtein ? convertAACaretPositionOrRangeToDna(location) : location
19809
19855
  );
19810
19856
  }
19811
- if (isCircular) {
19812
- for (let i = 0; i < locArray.length; i += 2) {
19813
- const firstFeature = locArray[i];
19814
- const secondFeature = locArray[i + 1];
19815
- if (firstFeature.end === sequenceLength - (inclusive1BasedEnd ? 0 : 1) && secondFeature.start === 1 - (inclusive1BasedStart ? 0 : 1)) {
19816
- locArray[i] = {
19817
- start: firstFeature.start,
19818
- end: secondFeature.end
19819
- };
19820
- locArray.splice(i + 1, 1);
19821
- }
19822
- }
19857
+ if (isCircular && sequenceLength) {
19858
+ return wrapOriginSpanningFeatures(
19859
+ locArray,
19860
+ sequenceLength,
19861
+ inclusive1BasedStart,
19862
+ inclusive1BasedEnd
19863
+ );
19864
+ } else {
19865
+ return locArray;
19823
19866
  }
19824
- return locArray;
19825
19867
  }
19826
19868
  __name(parseFeatureLocation, "parseFeatureLocation");
19827
19869
  function genbankToJson(string, options = {}) {
@@ -19918,7 +19960,7 @@ function genbankToJson(string, options = {}) {
19918
19960
  parseOrigin(line, key);
19919
19961
  break;
19920
19962
  case genbankAnnotationKey.END_SEQUENCE_TAG:
19921
- endSeq();
19963
+ endSeq(options);
19922
19964
  break;
19923
19965
  case genbankAnnotationKey.DEFINITION_TAG:
19924
19966
  line = line.replace(/DEFINITION/, "");
@@ -20023,9 +20065,9 @@ function genbankToJson(string, options = {}) {
20023
20065
  }
20024
20066
  });
20025
20067
  return results;
20026
- function endSeq() {
20068
+ function endSeq(options2) {
20027
20069
  hasFoundLocus = false;
20028
- postProcessCurSeq();
20070
+ postProcessCurSeq(options2);
20029
20071
  resultsArray.push(result || { success: false });
20030
20072
  }
20031
20073
  __name(endSeq, "endSeq");
@@ -20039,11 +20081,13 @@ function genbankToJson(string, options = {}) {
20039
20081
  }
20040
20082
  }
20041
20083
  __name(addMessage, "addMessage");
20042
- function postProcessCurSeq() {
20084
+ function postProcessCurSeq(options2) {
20043
20085
  if (result && result.parsedSequence && result.parsedSequence.features) {
20044
20086
  for (let i = 0; i < result.parsedSequence.features.length; i++) {
20045
20087
  result.parsedSequence.features[i] = postProcessGenbankFeature(
20046
- result.parsedSequence.features[i]
20088
+ result.parsedSequence.features[i],
20089
+ result.parsedSequence,
20090
+ options2
20047
20091
  );
20048
20092
  }
20049
20093
  }
@@ -20289,7 +20333,7 @@ function genbankToJson(string, options = {}) {
20289
20333
  return runon;
20290
20334
  }
20291
20335
  __name(isKeywordRunon, "isKeywordRunon");
20292
- function postProcessGenbankFeature(feat) {
20336
+ function postProcessGenbankFeature(feat, parsedSequence, options2) {
20293
20337
  if (feat.notes.label) {
20294
20338
  feat.name = feat.notes.label[0];
20295
20339
  } else if (feat.notes.gene) {
@@ -20322,6 +20366,15 @@ function genbankToJson(string, options = {}) {
20322
20366
  feat.arrowheadType = feat.notes.direction[0].toUpperCase() === "BOTH" ? "BOTH" : feat.notes.direction[0].toUpperCase() === "NONE" ? "NONE" : void 0;
20323
20367
  delete feat.notes.direction;
20324
20368
  }
20369
+ if (parsedSequence.circular) {
20370
+ const { inclusive1BasedStart: inclusive1BasedStart2, inclusive1BasedEnd: inclusive1BasedEnd2 } = options2;
20371
+ feat.locations = wrapOriginSpanningFeatures(
20372
+ feat.locations,
20373
+ parsedSequence.sequence.length,
20374
+ inclusive1BasedStart2,
20375
+ inclusive1BasedEnd2
20376
+ );
20377
+ }
20325
20378
  return feat;
20326
20379
  }
20327
20380
  __name(postProcessGenbankFeature, "postProcessGenbankFeature");
package/index.mjs CHANGED
@@ -11504,6 +11504,26 @@ function getDegenerateDnaStringFromAAString(aaString) {
11504
11504
  return aaString.split("").map((char) => aminoAcidToDegenerateDnaMap[char.toLowerCase()] || "nnn").join("");
11505
11505
  }
11506
11506
  __name(getDegenerateDnaStringFromAAString, "getDegenerateDnaStringFromAAString");
11507
/**
 * Build an amino-acid string from a sequence string.
 *
 * Per-base data is obtained from getAminoAcidDataForEachBaseOfDna (called with
 * `true` as its second argument). Each entry's `aminoAcid.value` is stored at
 * that entry's `aminoAcidIndex`, and the stored values are joined into a string.
 *
 * @param {string} sequenceString sequence passed to getAminoAcidDataForEachBaseOfDna
 * @param {{doNotExcludeAsterisk?: boolean}} [options] when `doNotExcludeAsterisk`
 *   is falsy, "*" entries found in the last three per-base positions are left out
 *   (presumably to drop a terminal stop codon - confirm against callers)
 * @returns {string} the joined amino-acid values
 */
function getAminoAcidStringFromSequenceString(sequenceString, { doNotExcludeAsterisk } = {}) {
  const perBaseData = getAminoAcidDataForEachBaseOfDna(sequenceString, true);
  const tailStart = perBaseData.length - 3;
  const residues = [];
  perBaseData.forEach((entry, position) => {
    // Entries that are not part of a complete codon contribute nothing.
    if (entry.fullCodon) {
      const dropAsterisk =
        !doNotExcludeAsterisk &&
        position >= tailStart &&
        entry.aminoAcid.value === "*";
      if (!dropAsterisk) {
        // Every base of a codon writes the same slot, so each codon yields one value.
        residues[entry.aminoAcidIndex] = entry.aminoAcid.value;
      }
    }
  });
  return residues.join("");
}
11526
+ __name(getAminoAcidStringFromSequenceString, "getAminoAcidStringFromSequenceString");
11507
11527
  function tidyUpSequenceData(pSeqData, options = {}) {
11508
11528
  const {
11509
11529
  annotationsAsObjects,
@@ -11511,6 +11531,7 @@ function tidyUpSequenceData(pSeqData, options = {}) {
11511
11531
  doNotRemoveInvalidChars,
11512
11532
  additionalValidChars,
11513
11533
  noTranslationData,
11534
+ includeProteinSequence,
11514
11535
  doNotProvideIdsForAnnotations,
11515
11536
  noCdsTranslations,
11516
11537
  convertAnnotationsFromAAIndices,
@@ -11544,7 +11565,9 @@ function tidyUpSequenceData(pSeqData, options = {}) {
11544
11565
  }
11545
11566
  if (!doNotRemoveInvalidChars) {
11546
11567
  if (seqData.isProtein) {
11547
- const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadValues({}, topLevelSeqData || seqData));
11568
+ const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadProps(__spreadValues({}, topLevelSeqData || seqData), {
11569
+ isProtein: true
11570
+ }));
11548
11571
  seqData.proteinSequence = newSeq;
11549
11572
  } else {
11550
11573
  const [newSeq] = filterSequenceString(seqData.sequence, __spreadValues({
@@ -11565,6 +11588,10 @@ function tidyUpSequenceData(pSeqData, options = {}) {
11565
11588
  null,
11566
11589
  true
11567
11590
  );
11591
+ } else if (includeProteinSequence) {
11592
+ seqData.proteinSequence = getAminoAcidStringFromSequenceString(
11593
+ seqData.sequence
11594
+ );
11568
11595
  }
11569
11596
  seqData.size = seqData.noSequence ? seqData.size : seqData.sequence.length;
11570
11597
  seqData.proteinSize = seqData.noSequence ? seqData.proteinSize : seqData.proteinSequence.length;
@@ -19785,19 +19812,38 @@ function flattenSequenceArray(parsingResultArray, opts) {
19785
19812
  return parsingResultArray;
19786
19813
  }
19787
19814
  __name(flattenSequenceArray, "flattenSequenceArray");
19815
/**
 * Merge consecutive locations that together span the origin of a circular sequence.
 *
 * When a location ends on the last base of the sequence and the next location
 * starts on the first base, the pair is collapsed into a single location that
 * keeps the first location's start and takes the second location's end.
 *
 * @param {Array<{start: number, end: number}>} locArrayInput locations in order; not modified
 * @param {number} sequenceLength length of the sequence
 * @param {boolean} inclusive1BasedStart when truthy the first base is expected at start 1, otherwise at 0
 * @param {boolean} inclusive1BasedEnd when truthy the last base is expected at end sequenceLength, otherwise sequenceLength - 1
 * @returns {Array<{start: number, end: number}>} a new array of copied locations with wrapping pairs merged
 */
function wrapOriginSpanningFeatures(locArrayInput, sequenceLength, inclusive1BasedStart, inclusive1BasedEnd) {
  // Boundary coordinates are loop-invariant, so compute them once.
  const lastBaseEnd = sequenceLength - (inclusive1BasedEnd ? 0 : 1);
  const firstBaseStart = 1 - (inclusive1BasedStart ? 0 : 1);
  // Copy each location (one level deep) so the caller's objects are left untouched.
  const locArrayOutput = locArrayInput.map((loc) => __spreadValues({}, loc));
  for (let i = 0; i < locArrayOutput.length - 1; i++) {
    const firstFeature = locArrayOutput[i];
    const secondFeature = locArrayOutput[i + 1];
    if (firstFeature.end === lastBaseEnd && secondFeature.start === firstBaseStart) {
      // Keep any additional fields carried by the first location instead of
      // discarding them; only the end coordinate changes.
      locArrayOutput[i] = __spreadProps(__spreadValues({}, firstFeature), {
        end: secondFeature.end
      });
      // Remove the absorbed location.
      locArrayOutput.splice(i + 1, 1);
    }
  }
  return locArrayOutput;
}
19830
+ __name(wrapOriginSpanningFeatures, "wrapOriginSpanningFeatures");
19788
19831
  function parseFeatureLocation(locStr, isProtein, inclusive1BasedStart, inclusive1BasedEnd, isCircular, sequenceLength) {
19789
19832
  locStr = locStr.trim();
19790
- const locArr = [];
19791
- locStr.replace(/(\d+)/g, function(string, match) {
19792
- locArr.push(match);
19833
+ const positionsArray = [];
19834
+ const locationParts = locStr.split(",");
19835
+ locationParts.forEach((locPart) => {
19836
+ const extractedPositions = locPart.match(/(\d+)/g);
19837
+ if (extractedPositions === null) {
19838
+ return;
19839
+ }
19840
+ positionsArray.push(extractedPositions[0]);
19841
+ positionsArray.push(extractedPositions[1] || extractedPositions[0]);
19793
19842
  });
19794
19843
  const locArray = [];
19795
- for (let i = 0; i < locArr.length; i += 2) {
19796
- const start = parseInt(locArr[i], 10) - (inclusive1BasedStart ? 0 : 1);
19797
- let end = parseInt(locArr[i + 1], 10) - (inclusive1BasedEnd ? 0 : 1);
19798
- if (isNaN(end)) {
19799
- end = start;
19800
- }
19844
+ for (let i = 0; i < positionsArray.length; i += 2) {
19845
+ const start = parseInt(positionsArray[i], 10) - (inclusive1BasedStart ? 0 : 1);
19846
+ const end = parseInt(positionsArray[i + 1], 10) - (inclusive1BasedEnd ? 0 : 1);
19801
19847
  const location = {
19802
19848
  start,
19803
19849
  end
@@ -19806,20 +19852,16 @@ function parseFeatureLocation(locStr, isProtein, inclusive1BasedStart, inclusive
19806
19852
  isProtein ? convertAACaretPositionOrRangeToDna(location) : location
19807
19853
  );
19808
19854
  }
19809
- if (isCircular) {
19810
- for (let i = 0; i < locArray.length; i += 2) {
19811
- const firstFeature = locArray[i];
19812
- const secondFeature = locArray[i + 1];
19813
- if (firstFeature.end === sequenceLength - (inclusive1BasedEnd ? 0 : 1) && secondFeature.start === 1 - (inclusive1BasedStart ? 0 : 1)) {
19814
- locArray[i] = {
19815
- start: firstFeature.start,
19816
- end: secondFeature.end
19817
- };
19818
- locArray.splice(i + 1, 1);
19819
- }
19820
- }
19855
+ if (isCircular && sequenceLength) {
19856
+ return wrapOriginSpanningFeatures(
19857
+ locArray,
19858
+ sequenceLength,
19859
+ inclusive1BasedStart,
19860
+ inclusive1BasedEnd
19861
+ );
19862
+ } else {
19863
+ return locArray;
19821
19864
  }
19822
- return locArray;
19823
19865
  }
19824
19866
  __name(parseFeatureLocation, "parseFeatureLocation");
19825
19867
  function genbankToJson(string, options = {}) {
@@ -19916,7 +19958,7 @@ function genbankToJson(string, options = {}) {
19916
19958
  parseOrigin(line, key);
19917
19959
  break;
19918
19960
  case genbankAnnotationKey.END_SEQUENCE_TAG:
19919
- endSeq();
19961
+ endSeq(options);
19920
19962
  break;
19921
19963
  case genbankAnnotationKey.DEFINITION_TAG:
19922
19964
  line = line.replace(/DEFINITION/, "");
@@ -20021,9 +20063,9 @@ function genbankToJson(string, options = {}) {
20021
20063
  }
20022
20064
  });
20023
20065
  return results;
20024
- function endSeq() {
20066
+ function endSeq(options2) {
20025
20067
  hasFoundLocus = false;
20026
- postProcessCurSeq();
20068
+ postProcessCurSeq(options2);
20027
20069
  resultsArray.push(result || { success: false });
20028
20070
  }
20029
20071
  __name(endSeq, "endSeq");
@@ -20037,11 +20079,13 @@ function genbankToJson(string, options = {}) {
20037
20079
  }
20038
20080
  }
20039
20081
  __name(addMessage, "addMessage");
20040
- function postProcessCurSeq() {
20082
+ function postProcessCurSeq(options2) {
20041
20083
  if (result && result.parsedSequence && result.parsedSequence.features) {
20042
20084
  for (let i = 0; i < result.parsedSequence.features.length; i++) {
20043
20085
  result.parsedSequence.features[i] = postProcessGenbankFeature(
20044
- result.parsedSequence.features[i]
20086
+ result.parsedSequence.features[i],
20087
+ result.parsedSequence,
20088
+ options2
20045
20089
  );
20046
20090
  }
20047
20091
  }
@@ -20287,7 +20331,7 @@ function genbankToJson(string, options = {}) {
20287
20331
  return runon;
20288
20332
  }
20289
20333
  __name(isKeywordRunon, "isKeywordRunon");
20290
- function postProcessGenbankFeature(feat) {
20334
+ function postProcessGenbankFeature(feat, parsedSequence, options2) {
20291
20335
  if (feat.notes.label) {
20292
20336
  feat.name = feat.notes.label[0];
20293
20337
  } else if (feat.notes.gene) {
@@ -20320,6 +20364,15 @@ function genbankToJson(string, options = {}) {
20320
20364
  feat.arrowheadType = feat.notes.direction[0].toUpperCase() === "BOTH" ? "BOTH" : feat.notes.direction[0].toUpperCase() === "NONE" ? "NONE" : void 0;
20321
20365
  delete feat.notes.direction;
20322
20366
  }
20367
+ if (parsedSequence.circular) {
20368
+ const { inclusive1BasedStart: inclusive1BasedStart2, inclusive1BasedEnd: inclusive1BasedEnd2 } = options2;
20369
+ feat.locations = wrapOriginSpanningFeatures(
20370
+ feat.locations,
20371
+ parsedSequence.sequence.length,
20372
+ inclusive1BasedStart2,
20373
+ inclusive1BasedEnd2
20374
+ );
20375
+ }
20323
20376
  return feat;
20324
20377
  }
20325
20378
  __name(postProcessGenbankFeature, "postProcessGenbankFeature");
package/index.umd.js CHANGED
@@ -11508,6 +11508,26 @@ var __async = (__this, __arguments, generator) => {
11508
11508
  return aaString.split("").map((char) => aminoAcidToDegenerateDnaMap[char.toLowerCase()] || "nnn").join("");
11509
11509
  }
11510
11510
  __name(getDegenerateDnaStringFromAAString, "getDegenerateDnaStringFromAAString");
11511
/**
 * Build an amino-acid string from a sequence string.
 *
 * Per-base data is obtained from getAminoAcidDataForEachBaseOfDna (called with
 * `true` as its second argument). Each entry's `aminoAcid.value` is stored at
 * that entry's `aminoAcidIndex`, and the stored values are joined into a string.
 *
 * @param {string} sequenceString sequence passed to getAminoAcidDataForEachBaseOfDna
 * @param {{doNotExcludeAsterisk?: boolean}} [options] when `doNotExcludeAsterisk`
 *   is falsy, "*" entries found in the last three per-base positions are left out
 *   (presumably to drop a terminal stop codon - confirm against callers)
 * @returns {string} the joined amino-acid values
 */
function getAminoAcidStringFromSequenceString(sequenceString, { doNotExcludeAsterisk } = {}) {
  const perBaseData = getAminoAcidDataForEachBaseOfDna(sequenceString, true);
  const tailStart = perBaseData.length - 3;
  const residues = [];
  perBaseData.forEach((entry, position) => {
    // Entries that are not part of a complete codon contribute nothing.
    if (entry.fullCodon) {
      const dropAsterisk =
        !doNotExcludeAsterisk &&
        position >= tailStart &&
        entry.aminoAcid.value === "*";
      if (!dropAsterisk) {
        // Every base of a codon writes the same slot, so each codon yields one value.
        residues[entry.aminoAcidIndex] = entry.aminoAcid.value;
      }
    }
  });
  return residues.join("");
}
11530
+ __name(getAminoAcidStringFromSequenceString, "getAminoAcidStringFromSequenceString");
11511
11531
  function tidyUpSequenceData(pSeqData, options = {}) {
11512
11532
  const {
11513
11533
  annotationsAsObjects,
@@ -11515,6 +11535,7 @@ var __async = (__this, __arguments, generator) => {
11515
11535
  doNotRemoveInvalidChars,
11516
11536
  additionalValidChars,
11517
11537
  noTranslationData,
11538
+ includeProteinSequence,
11518
11539
  doNotProvideIdsForAnnotations,
11519
11540
  noCdsTranslations,
11520
11541
  convertAnnotationsFromAAIndices,
@@ -11548,7 +11569,9 @@ var __async = (__this, __arguments, generator) => {
11548
11569
  }
11549
11570
  if (!doNotRemoveInvalidChars) {
11550
11571
  if (seqData.isProtein) {
11551
- const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadValues({}, topLevelSeqData || seqData));
11572
+ const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadProps(__spreadValues({}, topLevelSeqData || seqData), {
11573
+ isProtein: true
11574
+ }));
11552
11575
  seqData.proteinSequence = newSeq;
11553
11576
  } else {
11554
11577
  const [newSeq] = filterSequenceString(seqData.sequence, __spreadValues({
@@ -11569,6 +11592,10 @@ var __async = (__this, __arguments, generator) => {
11569
11592
  null,
11570
11593
  true
11571
11594
  );
11595
+ } else if (includeProteinSequence) {
11596
+ seqData.proteinSequence = getAminoAcidStringFromSequenceString(
11597
+ seqData.sequence
11598
+ );
11572
11599
  }
11573
11600
  seqData.size = seqData.noSequence ? seqData.size : seqData.sequence.length;
11574
11601
  seqData.proteinSize = seqData.noSequence ? seqData.proteinSize : seqData.proteinSequence.length;
@@ -19789,19 +19816,38 @@ var __async = (__this, __arguments, generator) => {
19789
19816
  return parsingResultArray;
19790
19817
  }
19791
19818
  __name(flattenSequenceArray, "flattenSequenceArray");
19819
/**
 * Merge consecutive locations that together span the origin of a circular sequence.
 *
 * When a location ends on the last base of the sequence and the next location
 * starts on the first base, the pair is collapsed into a single location that
 * keeps the first location's start and takes the second location's end.
 *
 * @param {Array<{start: number, end: number}>} locArrayInput locations in order; not modified
 * @param {number} sequenceLength length of the sequence
 * @param {boolean} inclusive1BasedStart when truthy the first base is expected at start 1, otherwise at 0
 * @param {boolean} inclusive1BasedEnd when truthy the last base is expected at end sequenceLength, otherwise sequenceLength - 1
 * @returns {Array<{start: number, end: number}>} a new array of copied locations with wrapping pairs merged
 */
function wrapOriginSpanningFeatures(locArrayInput, sequenceLength, inclusive1BasedStart, inclusive1BasedEnd) {
  // Boundary coordinates are loop-invariant, so compute them once.
  const lastBaseEnd = sequenceLength - (inclusive1BasedEnd ? 0 : 1);
  const firstBaseStart = 1 - (inclusive1BasedStart ? 0 : 1);
  // Copy each location (one level deep) so the caller's objects are left untouched.
  const locArrayOutput = locArrayInput.map((loc) => __spreadValues({}, loc));
  for (let i2 = 0; i2 < locArrayOutput.length - 1; i2++) {
    const firstFeature = locArrayOutput[i2];
    const secondFeature = locArrayOutput[i2 + 1];
    if (firstFeature.end === lastBaseEnd && secondFeature.start === firstBaseStart) {
      // Keep any additional fields carried by the first location instead of
      // discarding them; only the end coordinate changes.
      locArrayOutput[i2] = __spreadProps(__spreadValues({}, firstFeature), {
        end: secondFeature.end
      });
      // Remove the absorbed location.
      locArrayOutput.splice(i2 + 1, 1);
    }
  }
  return locArrayOutput;
}
19834
+ __name(wrapOriginSpanningFeatures, "wrapOriginSpanningFeatures");
19792
19835
  function parseFeatureLocation(locStr, isProtein, inclusive1BasedStart, inclusive1BasedEnd, isCircular, sequenceLength) {
19793
19836
  locStr = locStr.trim();
19794
- const locArr = [];
19795
- locStr.replace(/(\d+)/g, function(string, match) {
19796
- locArr.push(match);
19837
+ const positionsArray = [];
19838
+ const locationParts = locStr.split(",");
19839
+ locationParts.forEach((locPart) => {
19840
+ const extractedPositions = locPart.match(/(\d+)/g);
19841
+ if (extractedPositions === null) {
19842
+ return;
19843
+ }
19844
+ positionsArray.push(extractedPositions[0]);
19845
+ positionsArray.push(extractedPositions[1] || extractedPositions[0]);
19797
19846
  });
19798
19847
  const locArray = [];
19799
- for (let i2 = 0; i2 < locArr.length; i2 += 2) {
19800
- const start = parseInt(locArr[i2], 10) - (inclusive1BasedStart ? 0 : 1);
19801
- let end = parseInt(locArr[i2 + 1], 10) - (inclusive1BasedEnd ? 0 : 1);
19802
- if (isNaN(end)) {
19803
- end = start;
19804
- }
19848
+ for (let i2 = 0; i2 < positionsArray.length; i2 += 2) {
19849
+ const start = parseInt(positionsArray[i2], 10) - (inclusive1BasedStart ? 0 : 1);
19850
+ const end = parseInt(positionsArray[i2 + 1], 10) - (inclusive1BasedEnd ? 0 : 1);
19805
19851
  const location = {
19806
19852
  start,
19807
19853
  end
@@ -19810,20 +19856,16 @@ var __async = (__this, __arguments, generator) => {
19810
19856
  isProtein ? convertAACaretPositionOrRangeToDna(location) : location
19811
19857
  );
19812
19858
  }
19813
- if (isCircular) {
19814
- for (let i2 = 0; i2 < locArray.length; i2 += 2) {
19815
- const firstFeature = locArray[i2];
19816
- const secondFeature = locArray[i2 + 1];
19817
- if (firstFeature.end === sequenceLength - (inclusive1BasedEnd ? 0 : 1) && secondFeature.start === 1 - (inclusive1BasedStart ? 0 : 1)) {
19818
- locArray[i2] = {
19819
- start: firstFeature.start,
19820
- end: secondFeature.end
19821
- };
19822
- locArray.splice(i2 + 1, 1);
19823
- }
19824
- }
19859
+ if (isCircular && sequenceLength) {
19860
+ return wrapOriginSpanningFeatures(
19861
+ locArray,
19862
+ sequenceLength,
19863
+ inclusive1BasedStart,
19864
+ inclusive1BasedEnd
19865
+ );
19866
+ } else {
19867
+ return locArray;
19825
19868
  }
19826
- return locArray;
19827
19869
  }
19828
19870
  __name(parseFeatureLocation, "parseFeatureLocation");
19829
19871
  function genbankToJson(string, options = {}) {
@@ -19920,7 +19962,7 @@ var __async = (__this, __arguments, generator) => {
19920
19962
  parseOrigin(line, key);
19921
19963
  break;
19922
19964
  case genbankAnnotationKey.END_SEQUENCE_TAG:
19923
- endSeq();
19965
+ endSeq(options);
19924
19966
  break;
19925
19967
  case genbankAnnotationKey.DEFINITION_TAG:
19926
19968
  line = line.replace(/DEFINITION/, "");
@@ -20025,9 +20067,9 @@ var __async = (__this, __arguments, generator) => {
20025
20067
  }
20026
20068
  });
20027
20069
  return results;
20028
- function endSeq() {
20070
+ function endSeq(options2) {
20029
20071
  hasFoundLocus = false;
20030
- postProcessCurSeq();
20072
+ postProcessCurSeq(options2);
20031
20073
  resultsArray.push(result || { success: false });
20032
20074
  }
20033
20075
  __name(endSeq, "endSeq");
@@ -20041,11 +20083,13 @@ var __async = (__this, __arguments, generator) => {
20041
20083
  }
20042
20084
  }
20043
20085
  __name(addMessage, "addMessage");
20044
- function postProcessCurSeq() {
20086
+ function postProcessCurSeq(options2) {
20045
20087
  if (result && result.parsedSequence && result.parsedSequence.features) {
20046
20088
  for (let i2 = 0; i2 < result.parsedSequence.features.length; i2++) {
20047
20089
  result.parsedSequence.features[i2] = postProcessGenbankFeature(
20048
- result.parsedSequence.features[i2]
20090
+ result.parsedSequence.features[i2],
20091
+ result.parsedSequence,
20092
+ options2
20049
20093
  );
20050
20094
  }
20051
20095
  }
@@ -20291,7 +20335,7 @@ var __async = (__this, __arguments, generator) => {
20291
20335
  return runon;
20292
20336
  }
20293
20337
  __name(isKeywordRunon, "isKeywordRunon");
20294
- function postProcessGenbankFeature(feat) {
20338
+ function postProcessGenbankFeature(feat, parsedSequence, options2) {
20295
20339
  if (feat.notes.label) {
20296
20340
  feat.name = feat.notes.label[0];
20297
20341
  } else if (feat.notes.gene) {
@@ -20324,6 +20368,15 @@ var __async = (__this, __arguments, generator) => {
20324
20368
  feat.arrowheadType = feat.notes.direction[0].toUpperCase() === "BOTH" ? "BOTH" : feat.notes.direction[0].toUpperCase() === "NONE" ? "NONE" : void 0;
20325
20369
  delete feat.notes.direction;
20326
20370
  }
20371
+ if (parsedSequence.circular) {
20372
+ const { inclusive1BasedStart: inclusive1BasedStart2, inclusive1BasedEnd: inclusive1BasedEnd2 } = options2;
20373
+ feat.locations = wrapOriginSpanningFeatures(
20374
+ feat.locations,
20375
+ parsedSequence.sequence.length,
20376
+ inclusive1BasedStart2,
20377
+ inclusive1BasedEnd2
20378
+ );
20379
+ }
20327
20380
  return feat;
20328
20381
  }
20329
20382
  __name(postProcessGenbankFeature, "postProcessGenbankFeature");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@teselagen/bio-parsers",
3
- "version": "0.4.15",
3
+ "version": "0.4.16",
4
4
  "dependencies": {
5
5
  "@teselagen/sequence-utils": "0.3.24",
6
6
  "@teselagen/range-utils": "0.3.7",
@@ -8,6 +8,40 @@ import splitStringIntoLines from "./utils/splitStringIntoLines.js";
8
8
 
9
9
  import createInitialSequence from "./utils/createInitialSequence";
10
10
 
11
/**
 * Merge consecutive locations that together span the origin of a circular sequence.
 *
 * In genbank files, locations of origin-spanning features are represented as follows:
 *   complement(join(490883..490885,1..879))  (for a circular sequence of length 490885)
 * For each pair of consecutive locations we check whether the first ends at the
 * last base of the sequence and the second starts at the first base. If so, the
 * two are merged into a single location whose start comes from the first and
 * whose end comes from the second.
 * (see https://github.com/TeselaGen/tg-oss/issues/35)
 *
 * @param {Array<{start: number, end: number}>} locArrayInput locations in order; not modified
 * @param {number} sequenceLength length of the sequence
 * @param {boolean} inclusive1BasedStart when truthy the first base is expected at start 1, otherwise at 0
 * @param {boolean} inclusive1BasedEnd when truthy the last base is expected at end sequenceLength, otherwise sequenceLength - 1
 * @returns {Array<{start: number, end: number}>} a new array of copied locations with wrapping pairs merged
 */
function wrapOriginSpanningFeatures(
  locArrayInput,
  sequenceLength,
  inclusive1BasedStart,
  inclusive1BasedEnd
) {
  // Boundary coordinates are loop-invariant, so compute them once.
  const lastBaseEnd = sequenceLength - (inclusive1BasedEnd ? 0 : 1);
  const firstBaseStart = 1 - (inclusive1BasedStart ? 0 : 1);

  // Copy each location (one level deep) to avoid modifying the caller's objects
  const locArrayOutput = locArrayInput.map(loc => ({ ...loc }));

  // Iterate over consecutive pairs of locations
  for (let i = 0; i < locArrayOutput.length - 1; i++) {
    const firstFeature = locArrayOutput[i];
    const secondFeature = locArrayOutput[i + 1];
    if (
      firstFeature.end === lastBaseEnd &&
      secondFeature.start === firstBaseStart
    ) {
      // Merge the pair. Any additional fields carried by the first location are
      // kept instead of being discarded; only the end coordinate changes.
      locArrayOutput[i] = {
        ...firstFeature,
        end: secondFeature.end
      };
      // Remove the absorbed location
      locArrayOutput.splice(i + 1, 1);
    }
  }
  return locArrayOutput;
}
44
+
11
45
  export function parseFeatureLocation(
12
46
  locStr,
13
47
  isProtein,
@@ -17,22 +51,37 @@ export function parseFeatureLocation(
17
51
  sequenceLength
18
52
  ) {
19
53
  locStr = locStr.trim();
20
- const locArr = [];
21
- locStr.replace(/(\d+)/g, function (string, match) {
22
- locArr.push(match);
54
+ const positionsArray = [];
55
+ // Genbank feature locations can be:
56
+ // 4 -> single base (equivalent to 4..4)
57
+ // 4..8 -> range
58
+ // complement(4..8) -> complement of range
59
+ // join(4..8, 10..12) -> join of ranges
60
+ // complement(join(4..8, 10..12)) -> complement of join of ranges
61
+ // but also
62
+ // join(4, 10..12) -> join of single base and range
63
+ // First we split with commas to obtain pairs of positions
64
+ const locationParts = locStr.split(",");
65
+ locationParts.forEach(locPart => {
66
+ // Extract two integers from loc, if only one is present
67
+ // we push the same one as the end (e.g. if location is `4` we push
68
+ // 4 twice).
69
+ const extractedPositions = locPart.match(/(\d+)/g);
70
+ // Sometimes the location is split between two lines and has a trailing comma
71
+ if (extractedPositions === null) {
72
+ return;
73
+ }
74
+ positionsArray.push(extractedPositions[0]);
75
+ positionsArray.push(extractedPositions[1] || extractedPositions[0]);
23
76
  });
77
+
24
78
  const locArray = [];
25
- for (let i = 0; i < locArr.length; i += 2) {
26
- const start = parseInt(locArr[i], 10) - (inclusive1BasedStart ? 0 : 1);
27
- let end = parseInt(locArr[i + 1], 10) - (inclusive1BasedEnd ? 0 : 1);
28
- if (isNaN(end)) {
29
- //if no end is supplied, assume that the end should be set to whatever the start is
30
- //this makes a feature location passed as:
31
- //147
32
- //function like:
33
- //147..147
34
- end = start;
35
- }
79
+ for (let i = 0; i < positionsArray.length; i += 2) {
80
+ const start =
81
+ parseInt(positionsArray[i], 10) - (inclusive1BasedStart ? 0 : 1);
82
+ const end =
83
+ parseInt(positionsArray[i + 1], 10) - (inclusive1BasedEnd ? 0 : 1);
84
+
36
85
  const location = {
37
86
  start: start,
38
87
  end: end
@@ -41,31 +90,21 @@ export function parseFeatureLocation(
41
90
  isProtein ? convertAACaretPositionOrRangeToDna(location) : location
42
91
  );
43
92
  }
44
- // In genbank files, origin-spanning features are represented as follows:
45
- // complement(join(490883..490885,1..879)) (for a circular sequence of length 490885)
46
- // Then, for locations in locArray we check if there is a feature that ends at sequenceLength
47
- // joined with a feature that starts at 1. If so, we merge them into a single feature.
48
- // (see https://github.com/TeselaGen/tg-oss/issues/35)
49
93
 
50
- if (isCircular) {
51
- // Iterate by pairs of features
52
- for (let i = 0; i < locArray.length; i += 2) {
53
- const firstFeature = locArray[i];
54
- const secondFeature = locArray[i + 1];
55
- if (
56
- firstFeature.end === sequenceLength - (inclusive1BasedEnd ? 0 : 1) &&
57
- secondFeature.start === 1 - (inclusive1BasedStart ? 0 : 1)
58
- ) {
59
- // Merge the two features
60
- locArray[i] = {
61
- start: firstFeature.start,
62
- end: secondFeature.end
63
- };
64
- locArray.splice(i + 1, 1);
65
- }
66
- }
94
+ // sequenceLength will not be set when this is called during genbank parsing,
95
+ // and the wrapping is done in endSeq > postProcessCurSeq > postProcessGenbankFeature
96
+ // However, this is useful if the function is used as a standalone parser of
97
+ // feature locations.
98
+ if (isCircular && sequenceLength) {
99
+ return wrapOriginSpanningFeatures(
100
+ locArray,
101
+ sequenceLength,
102
+ inclusive1BasedStart,
103
+ inclusive1BasedEnd
104
+ );
105
+ } else {
106
+ return locArray;
67
107
  }
68
- return locArray;
69
108
  }
70
109
 
71
110
  function genbankToJson(string, options = {}) {
@@ -182,7 +221,7 @@ function genbankToJson(string, options = {}) {
182
221
  parseOrigin(line, key);
183
222
  break;
184
223
  case genbankAnnotationKey.END_SEQUENCE_TAG:
185
- endSeq();
224
+ endSeq(options);
186
225
  break;
187
226
  case genbankAnnotationKey.DEFINITION_TAG:
188
227
  line = line.replace(/DEFINITION/, "");
@@ -321,10 +360,10 @@ function genbankToJson(string, options = {}) {
321
360
 
322
361
  return results;
323
362
 
324
- function endSeq() {
363
+ function endSeq(options) {
325
364
  //do some post processing clean-up
326
365
  hasFoundLocus = false;
327
- postProcessCurSeq();
366
+ postProcessCurSeq(options);
328
367
  //push the result into the resultsArray
329
368
  resultsArray.push(result || { success: false });
330
369
  }
@@ -341,11 +380,13 @@ function genbankToJson(string, options = {}) {
341
380
  }
342
381
  }
343
382
 
344
- function postProcessCurSeq() {
383
+ function postProcessCurSeq(options) {
345
384
  if (result && result.parsedSequence && result.parsedSequence.features) {
346
385
  for (let i = 0; i < result.parsedSequence.features.length; i++) {
347
386
  result.parsedSequence.features[i] = postProcessGenbankFeature(
348
- result.parsedSequence.features[i]
387
+ result.parsedSequence.features[i],
388
+ result.parsedSequence,
389
+ options
349
390
  );
350
391
  }
351
392
  }
@@ -656,7 +697,7 @@ function genbankToJson(string, options = {}) {
656
697
  return runon;
657
698
  }
658
699
 
659
- function postProcessGenbankFeature(feat) {
700
+ function postProcessGenbankFeature(feat, parsedSequence, options) {
660
701
  if (feat.notes.label) {
661
702
  feat.name = feat.notes.label[0];
662
703
  } else if (feat.notes.gene) {
@@ -697,6 +738,16 @@ function genbankToJson(string, options = {}) {
697
738
  : undefined;
698
739
  delete feat.notes.direction;
699
740
  }
741
+ if (parsedSequence.circular) {
742
+ const { inclusive1BasedStart, inclusive1BasedEnd } = options;
743
+ feat.locations = wrapOriginSpanningFeatures(
744
+ feat.locations,
745
+ parsedSequence.sequence.length,
746
+ inclusive1BasedStart,
747
+ inclusive1BasedEnd
748
+ );
749
+ }
750
+
700
751
  return feat;
701
752
  }
702
753
  }