@teselagen/sequence-utils 0.3.8 → 0.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bioData.d.ts CHANGED
@@ -1,4 +1,5 @@
1
1
  export const protein_letters: "ACDEFGHIKLMNPQRSTVWY";
2
+ export const protein_letters_withUandX: "ACDEFGHIKLMNPQRSTVWYUX";
2
3
  export const extended_protein_letters: "ACDEFGHIKLMNPQRSTVWYBXZJUO.*-";
3
4
  export const ambiguous_dna_letters: "GATCRYWSMKHBVDN";
4
5
  export const unambiguous_dna_letters: "GATC";
package/filterSequenceString.d.ts CHANGED
@@ -1 +1,26 @@
1
- export default function filterSequenceString(sequenceString: any, additionalValidChars: string | undefined, charOverrides: any): any;
1
+ export default function filterSequenceString(sequenceString: any, { additionalValidChars, isOligo, name, isProtein, isRna, isMixedRnaAndDna, includeStopCodon }?: {
2
+ additionalValidChars?: string | undefined;
3
+ isOligo: any;
4
+ name: any;
5
+ isProtein: any;
6
+ isRna: any;
7
+ isMixedRnaAndDna: any;
8
+ includeStopCodon: any;
9
+ }): (string | string[])[];
10
+ export function getAcceptedChars({ isOligo, isProtein, isRna, isMixedRnaAndDna, includeStopCodon }?: {
11
+ isOligo: any;
12
+ isProtein: any;
13
+ isRna: any;
14
+ isMixedRnaAndDna: any;
15
+ includeStopCodon: any;
16
+ }): string;
17
+ export function getReplaceChars({ isOligo, isProtein, isRna, isMixedRnaAndDna }?: {
18
+ isOligo: any;
19
+ isProtein: any;
20
+ isRna: any;
21
+ isMixedRnaAndDna: any;
22
+ }): {
23
+ t?: undefined;
24
+ } | {
25
+ t: string;
26
+ };
package/index.d.ts CHANGED
@@ -20,7 +20,6 @@ export { default as aliasedEnzymesByName } from "./aliasedEnzymesByName";
20
20
  export { default as defaultEnzymesByName } from "./defaultEnzymesByName";
21
21
  export { default as generateSequenceData } from "./generateSequenceData";
22
22
  export { default as generateAnnotations } from "./generateAnnotations";
23
- export { default as filterAminoAcidSequenceString } from "./filterAminoAcidSequenceString";
24
23
  export { default as filterSequenceString } from "./filterSequenceString";
25
24
  export { default as findNearestRangeOfSequenceOverlapToPosition } from "./findNearestRangeOfSequenceOverlapToPosition";
26
25
  export { default as findOrfsInPlasmid } from "./findOrfsInPlasmid";
package/index.js CHANGED
@@ -5999,6 +5999,7 @@ lodash.exports;
5999
5999
  })(lodash, lodash.exports);
6000
6000
  var lodashExports = lodash.exports;
6001
6001
  const protein_letters = "ACDEFGHIKLMNPQRSTVWY";
6002
+ const protein_letters_withUandX = "ACDEFGHIKLMNPQRSTVWYUX";
6002
6003
  const extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO.*-";
6003
6004
  const ambiguous_dna_letters = "GATCRYWSMKHBVDN";
6004
6005
  const unambiguous_dna_letters = "GATC";
@@ -6067,6 +6068,7 @@ const bioData = /* @__PURE__ */ Object.freeze(/* @__PURE__ */ Object.definePrope
6067
6068
  extended_protein_letters,
6068
6069
  extended_protein_values,
6069
6070
  protein_letters,
6071
+ protein_letters_withUandX,
6070
6072
  unambiguous_dna_letters,
6071
6073
  unambiguous_rna_letters
6072
6074
  }, Symbol.toStringTag, { value: "Module" }));
@@ -12321,20 +12323,91 @@ const modifiableTypes = [
12321
12323
  "primers",
12322
12324
  "guides"
12323
12325
  ];
12324
- function filterSequenceString(sequenceString, additionalValidChars = "", charOverrides) {
12325
- if (sequenceString) {
12326
- return sequenceString.replace(
12327
- new RegExp(
12328
- `[^${charOverrides || `atgcyrswkmbvdhnu${additionalValidChars.split("").join("\\")}`}]`,
12329
- "gi"
12330
- ),
12331
- ""
12326
+ function filterSequenceString(sequenceString, {
12327
+ additionalValidChars = "",
12328
+ isOligo,
12329
+ name,
12330
+ isProtein,
12331
+ isRna,
12332
+ isMixedRnaAndDna,
12333
+ includeStopCodon
12334
+ } = {}) {
12335
+ const acceptedChars = getAcceptedChars({
12336
+ isOligo,
12337
+ isProtein,
12338
+ isRna,
12339
+ isMixedRnaAndDna,
12340
+ includeStopCodon
12341
+ });
12342
+ const replaceChars = getReplaceChars({
12343
+ isOligo,
12344
+ isProtein,
12345
+ isRna,
12346
+ isMixedRnaAndDna
12347
+ });
12348
+ let sanitizedVal = "";
12349
+ const invalidChars = [];
12350
+ const chars = `${acceptedChars}${additionalValidChars.split("").join("\\")}`;
12351
+ const warnings = [];
12352
+ const replaceCount = {};
12353
+ sequenceString.split("").forEach((letter) => {
12354
+ const lowerLetter = letter.toLowerCase();
12355
+ if (replaceChars && replaceChars[lowerLetter]) {
12356
+ if (!replaceCount[lowerLetter]) {
12357
+ replaceCount[lowerLetter] = 0;
12358
+ }
12359
+ replaceCount[lowerLetter]++;
12360
+ const isUpper = lowerLetter !== letter;
12361
+ sanitizedVal += isUpper ? replaceChars[lowerLetter].toUpperCase() : replaceChars[lowerLetter];
12362
+ } else if (chars.includes(lowerLetter)) {
12363
+ sanitizedVal += letter;
12364
+ } else {
12365
+ invalidChars.push(letter);
12366
+ }
12367
+ });
12368
+ Object.keys(replaceCount).forEach((letter) => {
12369
+ warnings.push(
12370
+ `Replaced "${letter}" with "${replaceChars[letter]}"${replaceCount[letter] > 1 ? ` ${replaceCount[letter]} times` : ""}`
12332
12371
  );
12333
- } else {
12334
- return sequenceString;
12372
+ });
12373
+ if (sequenceString.length !== sanitizedVal.length) {
12374
+ warnings.push(
12375
+ `${name ? `Sequence ${name}: ` : ""}Invalid character(s) detected and removed: ${invalidChars.slice(0, 100).join(", ")} `
12376
+ );
12377
+ }
12378
+ if (typeof window !== "undefined" && window.toastr && warnings.length) {
12379
+ warnings.forEach((warning) => {
12380
+ window.toastr.warning(warning);
12381
+ });
12335
12382
  }
12383
+ return [sanitizedVal, warnings];
12336
12384
  }
12337
12385
  __name(filterSequenceString, "filterSequenceString");
12386
+ function getAcceptedChars({
12387
+ isOligo,
12388
+ isProtein,
12389
+ isRna,
12390
+ isMixedRnaAndDna,
12391
+ includeStopCodon
12392
+ } = {}) {
12393
+ return isProtein ? `${protein_letters_withUandX.toLowerCase()}${includeStopCodon ? "*." : ""}}` : isOligo ? ambiguous_rna_letters.toLowerCase() + "t" : isRna ? ambiguous_rna_letters.toLowerCase() + "t" : isMixedRnaAndDna ? ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase() : (
12394
+ //just plain old dna
12395
+ ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase()
12396
+ );
12397
+ }
12398
+ __name(getAcceptedChars, "getAcceptedChars");
12399
+ function getReplaceChars({
12400
+ isOligo,
12401
+ isProtein,
12402
+ isRna,
12403
+ isMixedRnaAndDna
12404
+ } = {}) {
12405
+ return isProtein ? {} : isOligo ? {} : isRna ? { t: "u" } : isMixedRnaAndDna ? {} : (
12406
+ //just plain old dna
12407
+ {}
12408
+ );
12409
+ }
12410
+ __name(getReplaceChars, "getReplaceChars");
12338
12411
  function tidyUpAnnotation(_annotation, {
12339
12412
  sequenceData = {},
12340
12413
  convertAnnotationsFromAAIndices,
@@ -12463,14 +12536,6 @@ function coerceLocation({
12463
12536
  }
12464
12537
  }
12465
12538
  __name(coerceLocation, "coerceLocation");
12466
- function filterAminoAcidSequenceString(sequenceString, options) {
12467
- options = options || {};
12468
- if (options.includeStopCodon) {
12469
- return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu.*]/gi, "");
12470
- }
12471
- return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu]/gi, "");
12472
- }
12473
- __name(filterAminoAcidSequenceString, "filterAminoAcidSequenceString");
12474
12539
  function getDegenerateDnaStringFromAAString(aaString) {
12475
12540
  return aaString.split("").map((char) => aminoAcidToDegenerateDnaMap[char.toLowerCase()] || "nnn").join("");
12476
12541
  }
@@ -12482,11 +12547,10 @@ function tidyUpSequenceData(pSeqData, options = {}) {
12482
12547
  removeUnwantedChars,
12483
12548
  additionalValidChars,
12484
12549
  noTranslationData,
12485
- charOverrides,
12486
12550
  doNotProvideIdsForAnnotations,
12487
- proteinFilterOptions,
12488
12551
  noCdsTranslations,
12489
- convertAnnotationsFromAAIndices
12552
+ convertAnnotationsFromAAIndices,
12553
+ topLevelSeqData
12490
12554
  } = options;
12491
12555
  let seqData = lodashExports.cloneDeep(pSeqData);
12492
12556
  const response = {
@@ -12516,16 +12580,15 @@ function tidyUpSequenceData(pSeqData, options = {}) {
12516
12580
  }
12517
12581
  if (removeUnwantedChars) {
12518
12582
  if (seqData.isProtein) {
12519
- seqData.proteinSequence = filterAminoAcidSequenceString(
12520
- seqData.proteinSequence,
12521
- __spreadValues({ includeStopCodon: true }, proteinFilterOptions)
12522
- );
12583
+ const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadValues({
12584
+ includeStopCodon: true
12585
+ }, topLevelSeqData || seqData));
12586
+ seqData.proteinSequence = newSeq;
12523
12587
  } else {
12524
- seqData.sequence = filterSequenceString(
12525
- seqData.sequence,
12526
- `${additionalValidChars || ""}${seqData.isRna || seqData.isMixedRnaAndDna ? "u" : ""}`,
12527
- charOverrides
12528
- );
12588
+ const [newSeq] = filterSequenceString(seqData.sequence, __spreadValues({
12589
+ additionalValidChars
12590
+ }, topLevelSeqData || seqData));
12591
+ seqData.sequence = newSeq;
12529
12592
  }
12530
12593
  }
12531
12594
  if (seqData.isProtein) {
@@ -22665,7 +22728,6 @@ exports.degenerateRnaToAminoAcidMap = degenerateRnaToAminoAcidMap;
22665
22728
  exports.deleteSequenceDataAtRange = deleteSequenceDataAtRange;
22666
22729
  exports.doesEnzymeChopOutsideOfRecognitionSite = doesEnzymeChopOutsideOfRecognitionSite;
22667
22730
  exports.featureColors = featureColors;
22668
- exports.filterAminoAcidSequenceString = filterAminoAcidSequenceString;
22669
22731
  exports.filterSequenceString = filterSequenceString;
22670
22732
  exports.findNearestRangeOfSequenceOverlapToPosition = findNearestRangeOfSequenceOverlapToPosition;
22671
22733
  exports.findOrfsInPlasmid = findOrfsInPlasmid;
package/index.mjs CHANGED
@@ -5997,6 +5997,7 @@ lodash.exports;
5997
5997
  })(lodash, lodash.exports);
5998
5998
  var lodashExports = lodash.exports;
5999
5999
  const protein_letters = "ACDEFGHIKLMNPQRSTVWY";
6000
+ const protein_letters_withUandX = "ACDEFGHIKLMNPQRSTVWYUX";
6000
6001
  const extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO.*-";
6001
6002
  const ambiguous_dna_letters = "GATCRYWSMKHBVDN";
6002
6003
  const unambiguous_dna_letters = "GATC";
@@ -6065,6 +6066,7 @@ const bioData = /* @__PURE__ */ Object.freeze(/* @__PURE__ */ Object.definePrope
6065
6066
  extended_protein_letters,
6066
6067
  extended_protein_values,
6067
6068
  protein_letters,
6069
+ protein_letters_withUandX,
6068
6070
  unambiguous_dna_letters,
6069
6071
  unambiguous_rna_letters
6070
6072
  }, Symbol.toStringTag, { value: "Module" }));
@@ -12319,20 +12321,91 @@ const modifiableTypes = [
12319
12321
  "primers",
12320
12322
  "guides"
12321
12323
  ];
12322
- function filterSequenceString(sequenceString, additionalValidChars = "", charOverrides) {
12323
- if (sequenceString) {
12324
- return sequenceString.replace(
12325
- new RegExp(
12326
- `[^${charOverrides || `atgcyrswkmbvdhnu${additionalValidChars.split("").join("\\")}`}]`,
12327
- "gi"
12328
- ),
12329
- ""
12324
+ function filterSequenceString(sequenceString, {
12325
+ additionalValidChars = "",
12326
+ isOligo,
12327
+ name,
12328
+ isProtein,
12329
+ isRna,
12330
+ isMixedRnaAndDna,
12331
+ includeStopCodon
12332
+ } = {}) {
12333
+ const acceptedChars = getAcceptedChars({
12334
+ isOligo,
12335
+ isProtein,
12336
+ isRna,
12337
+ isMixedRnaAndDna,
12338
+ includeStopCodon
12339
+ });
12340
+ const replaceChars = getReplaceChars({
12341
+ isOligo,
12342
+ isProtein,
12343
+ isRna,
12344
+ isMixedRnaAndDna
12345
+ });
12346
+ let sanitizedVal = "";
12347
+ const invalidChars = [];
12348
+ const chars = `${acceptedChars}${additionalValidChars.split("").join("\\")}`;
12349
+ const warnings = [];
12350
+ const replaceCount = {};
12351
+ sequenceString.split("").forEach((letter) => {
12352
+ const lowerLetter = letter.toLowerCase();
12353
+ if (replaceChars && replaceChars[lowerLetter]) {
12354
+ if (!replaceCount[lowerLetter]) {
12355
+ replaceCount[lowerLetter] = 0;
12356
+ }
12357
+ replaceCount[lowerLetter]++;
12358
+ const isUpper = lowerLetter !== letter;
12359
+ sanitizedVal += isUpper ? replaceChars[lowerLetter].toUpperCase() : replaceChars[lowerLetter];
12360
+ } else if (chars.includes(lowerLetter)) {
12361
+ sanitizedVal += letter;
12362
+ } else {
12363
+ invalidChars.push(letter);
12364
+ }
12365
+ });
12366
+ Object.keys(replaceCount).forEach((letter) => {
12367
+ warnings.push(
12368
+ `Replaced "${letter}" with "${replaceChars[letter]}"${replaceCount[letter] > 1 ? ` ${replaceCount[letter]} times` : ""}`
12330
12369
  );
12331
- } else {
12332
- return sequenceString;
12370
+ });
12371
+ if (sequenceString.length !== sanitizedVal.length) {
12372
+ warnings.push(
12373
+ `${name ? `Sequence ${name}: ` : ""}Invalid character(s) detected and removed: ${invalidChars.slice(0, 100).join(", ")} `
12374
+ );
12375
+ }
12376
+ if (typeof window !== "undefined" && window.toastr && warnings.length) {
12377
+ warnings.forEach((warning) => {
12378
+ window.toastr.warning(warning);
12379
+ });
12333
12380
  }
12381
+ return [sanitizedVal, warnings];
12334
12382
  }
12335
12383
  __name(filterSequenceString, "filterSequenceString");
12384
+ function getAcceptedChars({
12385
+ isOligo,
12386
+ isProtein,
12387
+ isRna,
12388
+ isMixedRnaAndDna,
12389
+ includeStopCodon
12390
+ } = {}) {
12391
+ return isProtein ? `${protein_letters_withUandX.toLowerCase()}${includeStopCodon ? "*." : ""}}` : isOligo ? ambiguous_rna_letters.toLowerCase() + "t" : isRna ? ambiguous_rna_letters.toLowerCase() + "t" : isMixedRnaAndDna ? ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase() : (
12392
+ //just plain old dna
12393
+ ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase()
12394
+ );
12395
+ }
12396
+ __name(getAcceptedChars, "getAcceptedChars");
12397
+ function getReplaceChars({
12398
+ isOligo,
12399
+ isProtein,
12400
+ isRna,
12401
+ isMixedRnaAndDna
12402
+ } = {}) {
12403
+ return isProtein ? {} : isOligo ? {} : isRna ? { t: "u" } : isMixedRnaAndDna ? {} : (
12404
+ //just plain old dna
12405
+ {}
12406
+ );
12407
+ }
12408
+ __name(getReplaceChars, "getReplaceChars");
12336
12409
  function tidyUpAnnotation(_annotation, {
12337
12410
  sequenceData = {},
12338
12411
  convertAnnotationsFromAAIndices,
@@ -12461,14 +12534,6 @@ function coerceLocation({
12461
12534
  }
12462
12535
  }
12463
12536
  __name(coerceLocation, "coerceLocation");
12464
- function filterAminoAcidSequenceString(sequenceString, options) {
12465
- options = options || {};
12466
- if (options.includeStopCodon) {
12467
- return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu.*]/gi, "");
12468
- }
12469
- return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu]/gi, "");
12470
- }
12471
- __name(filterAminoAcidSequenceString, "filterAminoAcidSequenceString");
12472
12537
  function getDegenerateDnaStringFromAAString(aaString) {
12473
12538
  return aaString.split("").map((char) => aminoAcidToDegenerateDnaMap[char.toLowerCase()] || "nnn").join("");
12474
12539
  }
@@ -12480,11 +12545,10 @@ function tidyUpSequenceData(pSeqData, options = {}) {
12480
12545
  removeUnwantedChars,
12481
12546
  additionalValidChars,
12482
12547
  noTranslationData,
12483
- charOverrides,
12484
12548
  doNotProvideIdsForAnnotations,
12485
- proteinFilterOptions,
12486
12549
  noCdsTranslations,
12487
- convertAnnotationsFromAAIndices
12550
+ convertAnnotationsFromAAIndices,
12551
+ topLevelSeqData
12488
12552
  } = options;
12489
12553
  let seqData = lodashExports.cloneDeep(pSeqData);
12490
12554
  const response = {
@@ -12514,16 +12578,15 @@ function tidyUpSequenceData(pSeqData, options = {}) {
12514
12578
  }
12515
12579
  if (removeUnwantedChars) {
12516
12580
  if (seqData.isProtein) {
12517
- seqData.proteinSequence = filterAminoAcidSequenceString(
12518
- seqData.proteinSequence,
12519
- __spreadValues({ includeStopCodon: true }, proteinFilterOptions)
12520
- );
12581
+ const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadValues({
12582
+ includeStopCodon: true
12583
+ }, topLevelSeqData || seqData));
12584
+ seqData.proteinSequence = newSeq;
12521
12585
  } else {
12522
- seqData.sequence = filterSequenceString(
12523
- seqData.sequence,
12524
- `${additionalValidChars || ""}${seqData.isRna || seqData.isMixedRnaAndDna ? "u" : ""}`,
12525
- charOverrides
12526
- );
12586
+ const [newSeq] = filterSequenceString(seqData.sequence, __spreadValues({
12587
+ additionalValidChars
12588
+ }, topLevelSeqData || seqData));
12589
+ seqData.sequence = newSeq;
12527
12590
  }
12528
12591
  }
12529
12592
  if (seqData.isProtein) {
@@ -22664,7 +22727,6 @@ export {
22664
22727
  deleteSequenceDataAtRange,
22665
22728
  doesEnzymeChopOutsideOfRecognitionSite,
22666
22729
  featureColors,
22667
- filterAminoAcidSequenceString,
22668
22730
  filterSequenceString,
22669
22731
  findNearestRangeOfSequenceOverlapToPosition,
22670
22732
  findOrfsInPlasmid,
package/index.umd.js CHANGED
@@ -6001,6 +6001,7 @@ var __name = (target, value) => __defProp(target, "name", { value, configurable:
6001
6001
  })(lodash, lodash.exports);
6002
6002
  var lodashExports = lodash.exports;
6003
6003
  const protein_letters = "ACDEFGHIKLMNPQRSTVWY";
6004
+ const protein_letters_withUandX = "ACDEFGHIKLMNPQRSTVWYUX";
6004
6005
  const extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO.*-";
6005
6006
  const ambiguous_dna_letters = "GATCRYWSMKHBVDN";
6006
6007
  const unambiguous_dna_letters = "GATC";
@@ -6069,6 +6070,7 @@ var __name = (target, value) => __defProp(target, "name", { value, configurable:
6069
6070
  extended_protein_letters,
6070
6071
  extended_protein_values,
6071
6072
  protein_letters,
6073
+ protein_letters_withUandX,
6072
6074
  unambiguous_dna_letters,
6073
6075
  unambiguous_rna_letters
6074
6076
  }, Symbol.toStringTag, { value: "Module" }));
@@ -12323,20 +12325,91 @@ var __name = (target, value) => __defProp(target, "name", { value, configurable:
12323
12325
  "primers",
12324
12326
  "guides"
12325
12327
  ];
12326
- function filterSequenceString(sequenceString, additionalValidChars = "", charOverrides) {
12327
- if (sequenceString) {
12328
- return sequenceString.replace(
12329
- new RegExp(
12330
- `[^${charOverrides || `atgcyrswkmbvdhnu${additionalValidChars.split("").join("\\")}`}]`,
12331
- "gi"
12332
- ),
12333
- ""
12328
+ function filterSequenceString(sequenceString, {
12329
+ additionalValidChars = "",
12330
+ isOligo,
12331
+ name,
12332
+ isProtein,
12333
+ isRna,
12334
+ isMixedRnaAndDna,
12335
+ includeStopCodon
12336
+ } = {}) {
12337
+ const acceptedChars = getAcceptedChars({
12338
+ isOligo,
12339
+ isProtein,
12340
+ isRna,
12341
+ isMixedRnaAndDna,
12342
+ includeStopCodon
12343
+ });
12344
+ const replaceChars = getReplaceChars({
12345
+ isOligo,
12346
+ isProtein,
12347
+ isRna,
12348
+ isMixedRnaAndDna
12349
+ });
12350
+ let sanitizedVal = "";
12351
+ const invalidChars = [];
12352
+ const chars = `${acceptedChars}${additionalValidChars.split("").join("\\")}`;
12353
+ const warnings = [];
12354
+ const replaceCount = {};
12355
+ sequenceString.split("").forEach((letter) => {
12356
+ const lowerLetter = letter.toLowerCase();
12357
+ if (replaceChars && replaceChars[lowerLetter]) {
12358
+ if (!replaceCount[lowerLetter]) {
12359
+ replaceCount[lowerLetter] = 0;
12360
+ }
12361
+ replaceCount[lowerLetter]++;
12362
+ const isUpper = lowerLetter !== letter;
12363
+ sanitizedVal += isUpper ? replaceChars[lowerLetter].toUpperCase() : replaceChars[lowerLetter];
12364
+ } else if (chars.includes(lowerLetter)) {
12365
+ sanitizedVal += letter;
12366
+ } else {
12367
+ invalidChars.push(letter);
12368
+ }
12369
+ });
12370
+ Object.keys(replaceCount).forEach((letter) => {
12371
+ warnings.push(
12372
+ `Replaced "${letter}" with "${replaceChars[letter]}"${replaceCount[letter] > 1 ? ` ${replaceCount[letter]} times` : ""}`
12334
12373
  );
12335
- } else {
12336
- return sequenceString;
12374
+ });
12375
+ if (sequenceString.length !== sanitizedVal.length) {
12376
+ warnings.push(
12377
+ `${name ? `Sequence ${name}: ` : ""}Invalid character(s) detected and removed: ${invalidChars.slice(0, 100).join(", ")} `
12378
+ );
12379
+ }
12380
+ if (typeof window !== "undefined" && window.toastr && warnings.length) {
12381
+ warnings.forEach((warning) => {
12382
+ window.toastr.warning(warning);
12383
+ });
12337
12384
  }
12385
+ return [sanitizedVal, warnings];
12338
12386
  }
12339
12387
  __name(filterSequenceString, "filterSequenceString");
12388
+ function getAcceptedChars({
12389
+ isOligo,
12390
+ isProtein,
12391
+ isRna,
12392
+ isMixedRnaAndDna,
12393
+ includeStopCodon
12394
+ } = {}) {
12395
+ return isProtein ? `${protein_letters_withUandX.toLowerCase()}${includeStopCodon ? "*." : ""}}` : isOligo ? ambiguous_rna_letters.toLowerCase() + "t" : isRna ? ambiguous_rna_letters.toLowerCase() + "t" : isMixedRnaAndDna ? ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase() : (
12396
+ //just plain old dna
12397
+ ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase()
12398
+ );
12399
+ }
12400
+ __name(getAcceptedChars, "getAcceptedChars");
12401
+ function getReplaceChars({
12402
+ isOligo,
12403
+ isProtein,
12404
+ isRna,
12405
+ isMixedRnaAndDna
12406
+ } = {}) {
12407
+ return isProtein ? {} : isOligo ? {} : isRna ? { t: "u" } : isMixedRnaAndDna ? {} : (
12408
+ //just plain old dna
12409
+ {}
12410
+ );
12411
+ }
12412
+ __name(getReplaceChars, "getReplaceChars");
12340
12413
  function tidyUpAnnotation(_annotation, {
12341
12414
  sequenceData = {},
12342
12415
  convertAnnotationsFromAAIndices,
@@ -12465,14 +12538,6 @@ var __name = (target, value) => __defProp(target, "name", { value, configurable:
12465
12538
  }
12466
12539
  }
12467
12540
  __name(coerceLocation, "coerceLocation");
12468
- function filterAminoAcidSequenceString(sequenceString, options) {
12469
- options = options || {};
12470
- if (options.includeStopCodon) {
12471
- return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu.*]/gi, "");
12472
- }
12473
- return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu]/gi, "");
12474
- }
12475
- __name(filterAminoAcidSequenceString, "filterAminoAcidSequenceString");
12476
12541
  function getDegenerateDnaStringFromAAString(aaString) {
12477
12542
  return aaString.split("").map((char) => aminoAcidToDegenerateDnaMap[char.toLowerCase()] || "nnn").join("");
12478
12543
  }
@@ -12484,11 +12549,10 @@ var __name = (target, value) => __defProp(target, "name", { value, configurable:
12484
12549
  removeUnwantedChars,
12485
12550
  additionalValidChars,
12486
12551
  noTranslationData,
12487
- charOverrides,
12488
12552
  doNotProvideIdsForAnnotations,
12489
- proteinFilterOptions,
12490
12553
  noCdsTranslations,
12491
- convertAnnotationsFromAAIndices
12554
+ convertAnnotationsFromAAIndices,
12555
+ topLevelSeqData
12492
12556
  } = options;
12493
12557
  let seqData = lodashExports.cloneDeep(pSeqData);
12494
12558
  const response = {
@@ -12518,16 +12582,15 @@ var __name = (target, value) => __defProp(target, "name", { value, configurable:
12518
12582
  }
12519
12583
  if (removeUnwantedChars) {
12520
12584
  if (seqData.isProtein) {
12521
- seqData.proteinSequence = filterAminoAcidSequenceString(
12522
- seqData.proteinSequence,
12523
- __spreadValues({ includeStopCodon: true }, proteinFilterOptions)
12524
- );
12585
+ const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadValues({
12586
+ includeStopCodon: true
12587
+ }, topLevelSeqData || seqData));
12588
+ seqData.proteinSequence = newSeq;
12525
12589
  } else {
12526
- seqData.sequence = filterSequenceString(
12527
- seqData.sequence,
12528
- `${additionalValidChars || ""}${seqData.isRna || seqData.isMixedRnaAndDna ? "u" : ""}`,
12529
- charOverrides
12530
- );
12590
+ const [newSeq] = filterSequenceString(seqData.sequence, __spreadValues({
12591
+ additionalValidChars
12592
+ }, topLevelSeqData || seqData));
12593
+ seqData.sequence = newSeq;
12531
12594
  }
12532
12595
  }
12533
12596
  if (seqData.isProtein) {
@@ -22667,7 +22730,6 @@ var __name = (target, value) => __defProp(target, "name", { value, configurable:
22667
22730
  exports2.deleteSequenceDataAtRange = deleteSequenceDataAtRange;
22668
22731
  exports2.doesEnzymeChopOutsideOfRecognitionSite = doesEnzymeChopOutsideOfRecognitionSite;
22669
22732
  exports2.featureColors = featureColors;
22670
- exports2.filterAminoAcidSequenceString = filterAminoAcidSequenceString;
22671
22733
  exports2.filterSequenceString = filterSequenceString;
22672
22734
  exports2.findNearestRangeOfSequenceOverlapToPosition = findNearestRangeOfSequenceOverlapToPosition;
22673
22735
  exports2.findOrfsInPlasmid = findOrfsInPlasmid;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@teselagen/sequence-utils",
3
- "version": "0.3.8",
3
+ "version": "0.3.9",
4
4
  "dependencies": {
5
5
  "@teselagen/range-utils": "0.3.7",
6
6
  "bson-objectid": "^2.0.4",
package/src/bioData.js CHANGED
@@ -1,6 +1,7 @@
1
1
  //Adapted from biopython. Check the BIOPYTHON_LICENSE for licensing info
2
2
 
3
3
  export const protein_letters = "ACDEFGHIKLMNPQRSTVWY";
4
+ export const protein_letters_withUandX = "ACDEFGHIKLMNPQRSTVWYUX";
4
5
 
5
6
  export const extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO.*-";
6
7
  export const ambiguous_dna_letters = "GATCRYWSMKHBVDN";
@@ -29,7 +30,6 @@ export const ambiguous_dna_values = {
29
30
  N: "GATC"
30
31
  };
31
32
 
32
-
33
33
  export const extended_protein_values = {
34
34
  A: "A",
35
35
  B: "ND",
package/src/filterSequenceString.js CHANGED
@@ -1,24 +1,117 @@
1
- // this is throwing a weird eslint error
1
+ import {
2
+ ambiguous_dna_letters,
3
+ ambiguous_rna_letters,
4
+ protein_letters_withUandX
5
+ } from "./bioData";
2
6
 
3
- //
4
7
  export default function filterSequenceString(
5
8
  sequenceString,
6
- additionalValidChars = "",
7
- charOverrides
9
+ {
10
+ additionalValidChars = "",
11
+ isOligo,
12
+ name,
13
+ isProtein,
14
+ isRna,
15
+ isMixedRnaAndDna,
16
+ includeStopCodon
17
+ } = {}
8
18
  ) {
9
- // ac.throw(ac.string,sequenceString);
10
- if (sequenceString) {
11
- return sequenceString.replace(
12
- new RegExp(
13
- `[^${
14
- charOverrides ||
15
- `atgcyrswkmbvdhnu${additionalValidChars.split("").join("\\")}`
16
- }]`,
17
- "gi"
18
- ),
19
- ""
19
+ const acceptedChars = getAcceptedChars({
20
+ isOligo,
21
+ isProtein,
22
+ isRna,
23
+ isMixedRnaAndDna,
24
+ includeStopCodon
25
+ });
26
+ const replaceChars = getReplaceChars({
27
+ isOligo,
28
+ isProtein,
29
+ isRna,
30
+ isMixedRnaAndDna
31
+ });
32
+
33
+ let sanitizedVal = "";
34
+ const invalidChars = [];
35
+ const chars = `${acceptedChars}${additionalValidChars.split("").join("\\")}`;
36
+ const warnings = [];
37
+ const replaceCount = {};
38
+ sequenceString.split("").forEach(letter => {
39
+ const lowerLetter = letter.toLowerCase();
40
+ if (replaceChars && replaceChars[lowerLetter]) {
41
+ if (!replaceCount[lowerLetter]) {
42
+ replaceCount[lowerLetter] = 0;
43
+ }
44
+ replaceCount[lowerLetter]++;
45
+ const isUpper = lowerLetter !== letter;
46
+ sanitizedVal += isUpper
47
+ ? replaceChars[lowerLetter].toUpperCase()
48
+ : replaceChars[lowerLetter];
49
+ } else if (chars.includes(lowerLetter)) {
50
+ sanitizedVal += letter;
51
+ } else {
52
+ invalidChars.push(letter);
53
+ }
54
+ });
55
+ //add replace count warnings
56
+ Object.keys(replaceCount).forEach(letter => {
57
+ warnings.push(
58
+ `Replaced "${letter}" with "${replaceChars[letter]}"${
59
+ replaceCount[letter] > 1 ? ` ${replaceCount[letter]} times` : ""
60
+ }`
61
+ );
62
+ });
63
+ if (sequenceString.length !== sanitizedVal.length) {
64
+ warnings.push(
65
+ `${
66
+ name ? `Sequence ${name}: ` : ""
67
+ }Invalid character(s) detected and removed: ${invalidChars
68
+ .slice(0, 100)
69
+ .join(", ")} `
20
70
  );
21
- } else {
22
- return sequenceString;
23
71
  }
72
+ if (typeof window !== "undefined" && window.toastr && warnings.length) {
73
+ warnings.forEach(warning => {
74
+ window.toastr.warning(warning);
75
+ });
76
+ }
77
+
78
+ return [sanitizedVal, warnings];
79
+ }
80
+
81
+ export function getAcceptedChars({
82
+ isOligo,
83
+ isProtein,
84
+ isRna,
85
+ isMixedRnaAndDna,
86
+ includeStopCodon
87
+ } = {}) {
88
+ return isProtein
89
+ ? `${protein_letters_withUandX.toLowerCase()}${
90
+ includeStopCodon ? "*." : ""
91
+ }}`
92
+ : isOligo
93
+ ? ambiguous_rna_letters.toLowerCase() + "t"
94
+ : isRna
95
+ ? ambiguous_rna_letters.toLowerCase() + "t"
96
+ : isMixedRnaAndDna
97
+ ? ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase()
98
+ : //just plain old dna
99
+ ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase();
100
+ }
101
+ export function getReplaceChars({
102
+ isOligo,
103
+ isProtein,
104
+ isRna,
105
+ isMixedRnaAndDna
106
+ } = {}) {
107
+ return isProtein
108
+ ? {}
109
+ : isOligo
110
+ ? {}
111
+ : isRna
112
+ ? { t: "u" }
113
+ : isMixedRnaAndDna
114
+ ? {}
115
+ : //just plain old dna
116
+ {};
24
117
  }
@@ -1,13 +1,72 @@
1
1
  import filterSequenceString from "./filterSequenceString";
2
+ import { expect } from "vitest";
2
3
 
3
4
  describe("filterSequenceString", () => {
5
+ it("should not filter u's and should convert t's to u's from isOligo=true seqs", () => {
6
+ const [str, warnings] = filterSequenceString("tatuuag--a", {
7
+ isOligo: true
8
+ });
9
+ expect(str).toBe("tatuuaga");
10
+ // expect(warnings[0]).toBe('Replaced "t" with "u" 2 times');
11
+ expect(warnings[0]).toBe(
12
+ "Invalid character(s) detected and removed: -, - "
13
+ );
14
+ });
15
+ it("should not convert u's to t's for isDna (default isDna=true) seqs", () => {
16
+ const [str, warnings] = filterSequenceString("tatuuag--a", {});
17
+ // expect(warnings[0]).toBe('Replaced "u" with "t" 2 times');
18
+ expect(warnings[0]).toBe(
19
+ "Invalid character(s) detected and removed: -, - "
20
+ );
21
+ expect(str).toBe("tatuuaga");
22
+ });
4
23
  it("should filter out unwanted chars", () => {
5
- expect(filterSequenceString("tatag--a")).toBe("tataga");
24
+ const [str, warnings] = filterSequenceString("tatag--a");
25
+ expect(warnings[0]).toBe(
26
+ "Invalid character(s) detected and removed: -, - "
27
+ );
28
+ expect(str).toBe("tataga");
6
29
  });
7
30
  it("should handle additional chars option", () => {
8
- expect(filterSequenceString("tatag--a", "-")).toBe("tatag--a");
31
+ const [str, warnings] = filterSequenceString("tatag--a", {
32
+ additionalValidChars: "-"
33
+ });
34
+ expect(warnings.length).toBe(0);
35
+ expect(str).toBe("tatag--a");
9
36
  });
10
37
  it("should handle additional chars option", () => {
11
- expect(filterSequenceString("tatag--a", "f-q")).toBe("tatag--a");
38
+ const [str, warnings] = filterSequenceString("tatag--a", {
39
+ additionalValidChars: "f-q"
40
+ });
41
+ expect(warnings.length).toBe(0);
42
+ expect(str).toBe("tatag--a");
43
+ });
44
+
45
+ it("when isProtein: true, should filter only valid amino acids by default", () => {
46
+ const [str, warnings] = filterSequenceString(
47
+ 'bbb342"""xtgalmfwkqespvicyhrnd,,../',
48
+ {
49
+ isProtein: true
50
+ }
51
+ );
52
+ expect(warnings[0]).toBe(
53
+ 'Invalid character(s) detected and removed: b, b, b, 3, 4, 2, ", ", ", ,, ,, ., ., / '
54
+ );
55
+ expect(str).toBe("xtgalmfwkqespvicyhrnd");
56
+ });
57
+ it("when isProtein: true, should handle upper case letters", () => {
58
+ const [str, warnings] = filterSequenceString("xtgalmfWKQEspvicyhrnd", {
59
+ isProtein: true
60
+ });
61
+ expect(warnings.length).toBe(0);
62
+ expect(str).toBe("xtgalmfWKQEspvicyhrnd");
63
+ });
64
+ it("when isProtein: true, should handle the option to includeStopCodon by allowing periods", () => {
65
+ const [str] = filterSequenceString('bbb342"""xtgalmfwkqespvicyhrnd,,../', {
66
+ isProtein: true,
67
+ includeStopCodon: true
68
+ });
69
+
70
+ expect(str).toBe("xtgalmfwkqespvicyhrnd..");
12
71
  });
13
72
  });
package/src/index.js CHANGED
@@ -51,7 +51,6 @@ export { default as aliasedEnzymesByName } from "./aliasedEnzymesByName";
51
51
  export { default as defaultEnzymesByName } from "./defaultEnzymesByName";
52
52
  export { default as generateSequenceData } from "./generateSequenceData";
53
53
  export { default as generateAnnotations } from "./generateAnnotations";
54
- export { default as filterAminoAcidSequenceString } from "./filterAminoAcidSequenceString";
55
54
  export { default as filterSequenceString } from "./filterSequenceString";
56
55
  export { default as findNearestRangeOfSequenceOverlapToPosition } from "./findNearestRangeOfSequenceOverlapToPosition";
57
56
  export { default as findOrfsInPlasmid } from "./findOrfsInPlasmid";
@@ -1,4 +1,3 @@
1
-
2
1
  const proteinAlphabet = {
3
2
  A: {
4
3
  value: "A",
@@ -18,7 +17,7 @@ const proteinAlphabet = {
18
17
  color: "hsl(258.1, 100%, 69%)",
19
18
  mass: 156.18568
20
19
  },
21
-
20
+
22
21
  N: {
23
22
  value: "N",
24
23
  name: "Asparagine",
@@ -46,7 +45,7 @@ const proteinAlphabet = {
46
45
  color: "hsl(335.1, 100%, 69%)",
47
46
  mass: 103.1429
48
47
  },
49
-
48
+
50
49
  E: {
51
50
  value: "E",
52
51
  name: "Glutamic acid",
@@ -75,7 +74,6 @@ const proteinAlphabet = {
75
74
  mass: 57.05132
76
75
  },
77
76
 
78
-
79
77
  H: {
80
78
  value: "H",
81
79
  name: "Histidine",
@@ -114,7 +112,6 @@ const proteinAlphabet = {
114
112
  mass: 128.17228
115
113
  },
116
114
 
117
-
118
115
  M: {
119
116
  value: "M",
120
117
  name: "Methionine",
@@ -6,7 +6,6 @@ import { cloneDeep, flatMap } from "lodash";
6
6
  import { annotationTypes } from "./annotationTypes";
7
7
  import filterSequenceString from "./filterSequenceString";
8
8
  import tidyUpAnnotation from "./tidyUpAnnotation";
9
- import filterAminoAcidSequenceString from "./filterAminoAcidSequenceString";
10
9
  import getDegenerateDnaStringFromAaString from "./getDegenerateDnaStringFromAAString";
11
10
  import { getFeatureTypes } from "./featureTypesAndColors";
12
11
 
@@ -17,11 +16,10 @@ export default function tidyUpSequenceData(pSeqData, options = {}) {
17
16
  removeUnwantedChars,
18
17
  additionalValidChars,
19
18
  noTranslationData,
20
- charOverrides,
21
19
  doNotProvideIdsForAnnotations,
22
- proteinFilterOptions,
23
20
  noCdsTranslations,
24
- convertAnnotationsFromAAIndices
21
+ convertAnnotationsFromAAIndices,
22
+ topLevelSeqData
25
23
  } = options;
26
24
  let seqData = cloneDeep(pSeqData); //sequence is usually immutable, so we clone it and return it
27
25
  const response = {
@@ -56,18 +54,17 @@ export default function tidyUpSequenceData(pSeqData, options = {}) {
56
54
  }
57
55
  if (removeUnwantedChars) {
58
56
  if (seqData.isProtein) {
59
- seqData.proteinSequence = filterAminoAcidSequenceString(
60
- seqData.proteinSequence,
61
- { includeStopCodon: true, ...proteinFilterOptions }
62
- );
57
+ const [newSeq] = filterSequenceString(seqData.proteinSequence, {
58
+ includeStopCodon: true,
59
+ ...(topLevelSeqData || seqData)
60
+ });
61
+ seqData.proteinSequence = newSeq;
63
62
  } else {
64
- seqData.sequence = filterSequenceString(
65
- seqData.sequence,
66
- `${additionalValidChars || ""}${
67
- seqData.isRna || seqData.isMixedRnaAndDna ? "u" : "" //if it is rna or mixed, allow u's
68
- }`,
69
- charOverrides
70
- );
63
+ const [newSeq] = filterSequenceString(seqData.sequence, {
64
+ additionalValidChars,
65
+ ...(topLevelSeqData || seqData)
66
+ });
67
+ seqData.sequence = newSeq;
71
68
  }
72
69
  }
73
70
  if (seqData.isProtein) {
@@ -1 +0,0 @@
1
- export default function filterAminoAcidSequenceString(sequenceString: any, options: any): any;
@@ -1 +0,0 @@
1
- export {};
@@ -1,10 +0,0 @@
1
- //
2
- export default function filterAminoAcidSequenceString(sequenceString, options) {
3
- options = options || {};
4
- if (options.includeStopCodon) {
5
- //tnrtodo this maybe needs the stop codon char in it?
6
- return sequenceString?.replace(/[^xtgalmfwkqespvicyhrndu.*]/gi, "");
7
- }
8
- // ac.throw(ac.string, sequenceString);
9
- return sequenceString?.replace(/[^xtgalmfwkqespvicyhrndu]/gi, "");
10
- }
@@ -1,24 +0,0 @@
1
- import assert from "assert";
2
- import filterAminoAcidSequenceString from "./filterAminoAcidSequenceString";
3
- describe("filterAminoAcidSequenceString", () => {
4
- it("should filter only valid amino acids by default", () => {
5
- const filteredString = filterAminoAcidSequenceString(
6
- 'bbb342"""xtgalmfwkqespvicyhrnd,,../'
7
- );
8
- assert.equal(filteredString, "xtgalmfwkqespvicyhrnd");
9
- });
10
- it("should handle upper case letters", () => {
11
- const filteredString = filterAminoAcidSequenceString(
12
- "xtgalmfWKQEspvicyhrnd"
13
- );
14
- assert.equal(filteredString, "xtgalmfWKQEspvicyhrnd");
15
- });
16
- it("should handle the option to includeStopCodon by allowing periods", () => {
17
- const options = { includeStopCodon: true };
18
- const filteredString = filterAminoAcidSequenceString(
19
- 'bbb342"""xtgalmfwkqespvicyhrnd,,../',
20
- options
21
- );
22
- assert.equal(filteredString, "xtgalmfwkqespvicyhrnd..");
23
- });
24
- });