@teselagen/bio-parsers 0.3.10 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.js CHANGED
@@ -6170,7 +6170,9 @@ lodash.exports;
6170
6170
  })(lodash, lodash.exports);
6171
6171
  var lodashExports = lodash.exports;
6172
6172
  const _ = /* @__PURE__ */ getDefaultExportFromCjs(lodashExports);
6173
+ const protein_letters_withUandX = "ACDEFGHIKLMNPQRSTVWYUX";
6173
6174
  const ambiguous_dna_letters = "GATCRYWSMKHBVDN";
6175
+ const ambiguous_rna_letters = "GAUCRYWSMKHBVDN";
6174
6176
  const aminoAcidToDegenerateDnaMap = {
6175
6177
  "-": "---",
6176
6178
  ".": "...",
@@ -11273,20 +11275,91 @@ const annotationTypes = [
11273
11275
  "primers",
11274
11276
  "guides"
11275
11277
  ];
11276
- function filterSequenceString(sequenceString, additionalValidChars = "", charOverrides) {
11277
- if (sequenceString) {
11278
- return sequenceString.replace(
11279
- new RegExp(
11280
- `[^${charOverrides || `atgcyrswkmbvdhnu${additionalValidChars.split("").join("\\")}`}]`,
11281
- "gi"
11282
- ),
11283
- ""
11278
/**
 * Strips (and, where a replacement is defined, substitutes) characters that
 * are not valid for the given kind of sequence.
 *
 * The accepted alphabet comes from `getAcceptedChars` and the substitution
 * map from `getReplaceChars`; both are selected by the same type flags.
 * Matching is case-insensitive and the original letter case is preserved in
 * the output (a replaced upper-case letter yields an upper-case replacement).
 *
 * Side effect: when running in a browser that exposes `window.toastr`, every
 * warning is also shown as a toast.
 *
 * @param {string} sequenceString - Raw sequence text. Falsy input
 *   (null/undefined/"") yields `["", []]` instead of throwing.
 * @param {object} [options]
 * @param {string} [options.additionalValidChars=""] - Extra characters to
 *   keep, matched case-insensitively.
 * @param {boolean} [options.isOligo]
 * @param {string} [options.name] - Only used to prefix the invalid-character warning.
 * @param {boolean} [options.isProtein]
 * @param {boolean} [options.isRna]
 * @param {boolean} [options.isMixedRnaAndDna]
 * @param {boolean} [options.includeStopCodon] - Forwarded to `getAcceptedChars`.
 * @returns {[string, string[]]} Tuple of the sanitized sequence and the
 *   list of human-readable warnings.
 */
function filterSequenceString(sequenceString, {
  additionalValidChars = "",
  isOligo,
  name,
  isProtein,
  isRna,
  isMixedRnaAndDna,
  includeStopCodon
} = {}) {
  // Nothing to filter; avoids a TypeError on null/undefined input.
  if (!sequenceString) {
    return ["", []];
  }
  const acceptedChars = getAcceptedChars({
    isOligo,
    isProtein,
    isRna,
    isMixedRnaAndDna,
    includeStopCodon
  });
  const replaceChars = getReplaceChars({
    isOligo,
    isProtein,
    isRna,
    isMixedRnaAndDna
  }) || {};
  // Lower-cased lookup set. The additional characters are added verbatim:
  // no regex is built here, so they must not be backslash-escaped (doing so
  // would silently whitelist "\" itself).
  const validChars = new Set(
    `${acceptedChars}${additionalValidChars || ""}`.toLowerCase()
  );
  const warnings = [];
  const invalidChars = [];
  const replaceCount = {};
  let sanitizedVal = "";
  for (const letter of sequenceString) {
    const lowerLetter = letter.toLowerCase();
    const replacement = replaceChars[lowerLetter];
    if (replacement) {
      replaceCount[lowerLetter] = (replaceCount[lowerLetter] || 0) + 1;
      const isUpper = lowerLetter !== letter;
      sanitizedVal += isUpper ? replacement.toUpperCase() : replacement;
    } else if (validChars.has(lowerLetter)) {
      sanitizedVal += letter;
    } else {
      invalidChars.push(letter);
    }
  }
  Object.keys(replaceCount).forEach((letter) => {
    warnings.push(
      `Replaced "${letter}" with "${replaceChars[letter]}"${replaceCount[letter] > 1 ? ` ${replaceCount[letter]} times` : ""}`
    );
  });
  if (invalidChars.length > 0) {
    // Only the first 100 offenders are listed to keep the message bounded.
    warnings.push(
      `${name ? `Sequence ${name}: ` : ""}Invalid character(s) detected and removed: ${invalidChars.slice(0, 100).join(", ")} `
    );
  }
  if (typeof window !== "undefined" && window.toastr && warnings.length) {
    warnings.forEach((warning) => {
      window.toastr.warning(warning);
    });
  }
  return [sanitizedVal, warnings];
}
11289
11337
  __name(filterSequenceString, "filterSequenceString");
11338
/**
 * Returns the lower-case alphabet accepted for a sequence of the given kind.
 *
 * Precedence of the flags is: protein, then oligo / RNA, then everything
 * else (mixed RNA+DNA and plain DNA share the same alphabet).
 *
 * @param {object} [options]
 * @param {boolean} [options.isOligo]
 * @param {boolean} [options.isProtein]
 * @param {boolean} [options.isRna]
 * @param {boolean} [options.isMixedRnaAndDna] - Accepted for interface
 *   symmetry; mixed sequences use the same alphabet as plain DNA.
 * @param {boolean} [options.includeStopCodon] - For proteins, also accept
 *   the stop symbols "*" and ".".
 * @returns {string} Concatenation of every accepted (lower-case) character.
 */
function getAcceptedChars({
  isOligo,
  isProtein,
  isRna,
  isMixedRnaAndDna,
  includeStopCodon
} = {}) {
  if (isProtein) {
    // No trailing "}" here: a stray brace in the template literal would
    // make "}" an accepted amino-acid character.
    return `${protein_letters_withUandX.toLowerCase()}${includeStopCodon ? "*." : ""}`;
  }
  const rnaLetters = ambiguous_rna_letters.toLowerCase();
  if (isOligo || isRna) {
    // "t" is accepted on input as well as "u".
    return rnaLetters + "t";
  }
  // Mixed RNA/DNA and plain old DNA: union of both nucleotide alphabets.
  return rnaLetters + ambiguous_dna_letters.toLowerCase();
}
11350
+ __name(getAcceptedChars, "getAcceptedChars");
11351
/**
 * Returns the map of lower-case characters that should be substituted
 * (rather than kept or dropped) for a sequence of the given kind.
 *
 * Only RNA sequences that are neither protein nor oligo have a
 * substitution: "t" becomes "u". Every other kind gets an empty map.
 *
 * @param {object} [options]
 * @param {boolean} [options.isOligo]
 * @param {boolean} [options.isProtein]
 * @param {boolean} [options.isRna]
 * @param {boolean} [options.isMixedRnaAndDna]
 * @returns {Object<string, string>} Lower-case source char -> replacement.
 */
function getReplaceChars({
  isOligo,
  isProtein,
  isRna,
  isMixedRnaAndDna
} = {}) {
  // Protein and oligo take precedence over the RNA flag.
  if (isProtein || isOligo) {
    return {};
  }
  if (isRna) {
    return { t: "u" };
  }
  // Mixed RNA/DNA and plain old DNA: nothing to replace.
  return {};
}
11362
+ __name(getReplaceChars, "getReplaceChars");
11290
11363
  function tidyUpAnnotation(_annotation, {
11291
11364
  sequenceData = {},
11292
11365
  convertAnnotationsFromAAIndices,
@@ -11415,14 +11488,6 @@ function coerceLocation({
11415
11488
  }
11416
11489
  }
11417
11490
  __name(coerceLocation, "coerceLocation");
11418
- function filterAminoAcidSequenceString(sequenceString, options) {
11419
- options = options || {};
11420
- if (options.includeStopCodon) {
11421
- return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu.*]/gi, "");
11422
- }
11423
- return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu]/gi, "");
11424
- }
11425
- __name(filterAminoAcidSequenceString, "filterAminoAcidSequenceString");
11426
11491
  function getDegenerateDnaStringFromAAString(aaString) {
11427
11492
  return aaString.split("").map((char) => aminoAcidToDegenerateDnaMap[char.toLowerCase()] || "nnn").join("");
11428
11493
  }
@@ -11434,11 +11499,10 @@ function tidyUpSequenceData(pSeqData, options = {}) {
11434
11499
  removeUnwantedChars,
11435
11500
  additionalValidChars,
11436
11501
  noTranslationData,
11437
- charOverrides,
11438
11502
  doNotProvideIdsForAnnotations,
11439
- proteinFilterOptions,
11440
11503
  noCdsTranslations,
11441
- convertAnnotationsFromAAIndices
11504
+ convertAnnotationsFromAAIndices,
11505
+ topLevelSeqData
11442
11506
  } = options;
11443
11507
  let seqData = lodashExports.cloneDeep(pSeqData);
11444
11508
  const response = {
@@ -11468,16 +11532,15 @@ function tidyUpSequenceData(pSeqData, options = {}) {
11468
11532
  }
11469
11533
  if (removeUnwantedChars) {
11470
11534
  if (seqData.isProtein) {
11471
- seqData.proteinSequence = filterAminoAcidSequenceString(
11472
- seqData.proteinSequence,
11473
- __spreadValues({ includeStopCodon: true }, proteinFilterOptions)
11474
- );
11535
+ const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadValues({
11536
+ includeStopCodon: true
11537
+ }, topLevelSeqData || seqData));
11538
+ seqData.proteinSequence = newSeq;
11475
11539
  } else {
11476
- seqData.sequence = filterSequenceString(
11477
- seqData.sequence,
11478
- `${additionalValidChars || ""}${seqData.isRna || seqData.isMixedRnaAndDna ? "u" : ""}`,
11479
- charOverrides
11480
- );
11540
+ const [newSeq] = filterSequenceString(seqData.sequence, __spreadValues({
11541
+ additionalValidChars
11542
+ }, topLevelSeqData || seqData));
11543
+ seqData.sequence = newSeq;
11481
11544
  }
11482
11545
  }
11483
11546
  if (seqData.isProtein) {
@@ -19312,7 +19375,8 @@ function validateSequence(sequence, options = {}) {
19312
19375
  inclusive1BasedEnd,
19313
19376
  additionalValidChars,
19314
19377
  allowOverflowAnnotations,
19315
- coerceFeatureTypes
19378
+ coerceFeatureTypes,
19379
+ includeStopCodon
19316
19380
  } = options;
19317
19381
  [
19318
19382
  "isDNA",
@@ -19362,7 +19426,6 @@ function validateSequence(sequence, options = {}) {
19362
19426
  response.messages.push("No sequence detected");
19363
19427
  sequence.sequence = "";
19364
19428
  }
19365
- let validChars;
19366
19429
  if (sequence.isProtein === void 0 && guessIfProtein) {
19367
19430
  sequence.isProtein = !guessIfSequenceIsDnaAndNotProtein(
19368
19431
  sequence.sequence,
@@ -19370,12 +19433,15 @@ function validateSequence(sequence, options = {}) {
19370
19433
  );
19371
19434
  }
19372
19435
  if (sequence.isProtein) {
19373
- validChars = filterAminoAcidSequenceString(sequence.sequence);
19436
+ const [validChars, warnings] = filterSequenceString(sequence.sequence, {
19437
+ name: sequence.name,
19438
+ isProtein: true,
19439
+ additionalValidChars,
19440
+ includeStopCodon
19441
+ });
19374
19442
  if (validChars !== sequence.sequence) {
19375
19443
  sequence.sequence = validChars;
19376
- response.messages.push(
19377
- "Import Error: Illegal character(s) detected and removed from amino acid sequence. Allowed characters are: xtgalmfwkqespvicyhrndu"
19378
- );
19444
+ response.messages.push(...warnings);
19379
19445
  }
19380
19446
  sequence.type = "PROTEIN";
19381
19447
  sequence.isProtein = true;
@@ -19397,12 +19463,12 @@ function validateSequence(sequence, options = {}) {
19397
19463
  } else {
19398
19464
  sequence.type = "DNA";
19399
19465
  }
19400
- validChars = filterSequenceString(sequence.sequence, additionalValidChars);
19466
+ const [validChars, warnings] = filterSequenceString(sequence.sequence, __spreadValues({
19467
+ additionalValidChars
19468
+ }, sequence));
19401
19469
  if (validChars !== sequence.sequence) {
19402
19470
  sequence.sequence = validChars;
19403
- response.messages.push(
19404
- "Import Error: Illegal character(s) detected and removed from sequence. Allowed characters are: atgcyrswkmbvdhn"
19405
- );
19471
+ response.messages.push(...warnings);
19406
19472
  }
19407
19473
  }
19408
19474
  if (!sequence.size) {
package/index.mjs CHANGED
@@ -6168,7 +6168,9 @@ lodash.exports;
6168
6168
  })(lodash, lodash.exports);
6169
6169
  var lodashExports = lodash.exports;
6170
6170
  const _ = /* @__PURE__ */ getDefaultExportFromCjs(lodashExports);
6171
+ const protein_letters_withUandX = "ACDEFGHIKLMNPQRSTVWYUX";
6171
6172
  const ambiguous_dna_letters = "GATCRYWSMKHBVDN";
6173
+ const ambiguous_rna_letters = "GAUCRYWSMKHBVDN";
6172
6174
  const aminoAcidToDegenerateDnaMap = {
6173
6175
  "-": "---",
6174
6176
  ".": "...",
@@ -11271,20 +11273,91 @@ const annotationTypes = [
11271
11273
  "primers",
11272
11274
  "guides"
11273
11275
  ];
11274
- function filterSequenceString(sequenceString, additionalValidChars = "", charOverrides) {
11275
- if (sequenceString) {
11276
- return sequenceString.replace(
11277
- new RegExp(
11278
- `[^${charOverrides || `atgcyrswkmbvdhnu${additionalValidChars.split("").join("\\")}`}]`,
11279
- "gi"
11280
- ),
11281
- ""
11276
/**
 * Strips (and, where a replacement is defined, substitutes) characters that
 * are not valid for the given kind of sequence.
 *
 * The accepted alphabet comes from `getAcceptedChars` and the substitution
 * map from `getReplaceChars`; both are selected by the same type flags.
 * Matching is case-insensitive and the original letter case is preserved in
 * the output (a replaced upper-case letter yields an upper-case replacement).
 *
 * Side effect: when running in a browser that exposes `window.toastr`, every
 * warning is also shown as a toast.
 *
 * @param {string} sequenceString - Raw sequence text. Falsy input
 *   (null/undefined/"") yields `["", []]` instead of throwing.
 * @param {object} [options]
 * @param {string} [options.additionalValidChars=""] - Extra characters to
 *   keep, matched case-insensitively.
 * @param {boolean} [options.isOligo]
 * @param {string} [options.name] - Only used to prefix the invalid-character warning.
 * @param {boolean} [options.isProtein]
 * @param {boolean} [options.isRna]
 * @param {boolean} [options.isMixedRnaAndDna]
 * @param {boolean} [options.includeStopCodon] - Forwarded to `getAcceptedChars`.
 * @returns {[string, string[]]} Tuple of the sanitized sequence and the
 *   list of human-readable warnings.
 */
function filterSequenceString(sequenceString, {
  additionalValidChars = "",
  isOligo,
  name,
  isProtein,
  isRna,
  isMixedRnaAndDna,
  includeStopCodon
} = {}) {
  // Nothing to filter; avoids a TypeError on null/undefined input.
  if (!sequenceString) {
    return ["", []];
  }
  const acceptedChars = getAcceptedChars({
    isOligo,
    isProtein,
    isRna,
    isMixedRnaAndDna,
    includeStopCodon
  });
  const replaceChars = getReplaceChars({
    isOligo,
    isProtein,
    isRna,
    isMixedRnaAndDna
  }) || {};
  // Lower-cased lookup set. The additional characters are added verbatim:
  // no regex is built here, so they must not be backslash-escaped (doing so
  // would silently whitelist "\" itself).
  const validChars = new Set(
    `${acceptedChars}${additionalValidChars || ""}`.toLowerCase()
  );
  const warnings = [];
  const invalidChars = [];
  const replaceCount = {};
  let sanitizedVal = "";
  for (const letter of sequenceString) {
    const lowerLetter = letter.toLowerCase();
    const replacement = replaceChars[lowerLetter];
    if (replacement) {
      replaceCount[lowerLetter] = (replaceCount[lowerLetter] || 0) + 1;
      const isUpper = lowerLetter !== letter;
      sanitizedVal += isUpper ? replacement.toUpperCase() : replacement;
    } else if (validChars.has(lowerLetter)) {
      sanitizedVal += letter;
    } else {
      invalidChars.push(letter);
    }
  }
  Object.keys(replaceCount).forEach((letter) => {
    warnings.push(
      `Replaced "${letter}" with "${replaceChars[letter]}"${replaceCount[letter] > 1 ? ` ${replaceCount[letter]} times` : ""}`
    );
  });
  if (invalidChars.length > 0) {
    // Only the first 100 offenders are listed to keep the message bounded.
    warnings.push(
      `${name ? `Sequence ${name}: ` : ""}Invalid character(s) detected and removed: ${invalidChars.slice(0, 100).join(", ")} `
    );
  }
  if (typeof window !== "undefined" && window.toastr && warnings.length) {
    warnings.forEach((warning) => {
      window.toastr.warning(warning);
    });
  }
  return [sanitizedVal, warnings];
}
11287
11335
  __name(filterSequenceString, "filterSequenceString");
11336
/**
 * Returns the lower-case alphabet accepted for a sequence of the given kind.
 *
 * Precedence of the flags is: protein, then oligo / RNA, then everything
 * else (mixed RNA+DNA and plain DNA share the same alphabet).
 *
 * @param {object} [options]
 * @param {boolean} [options.isOligo]
 * @param {boolean} [options.isProtein]
 * @param {boolean} [options.isRna]
 * @param {boolean} [options.isMixedRnaAndDna] - Accepted for interface
 *   symmetry; mixed sequences use the same alphabet as plain DNA.
 * @param {boolean} [options.includeStopCodon] - For proteins, also accept
 *   the stop symbols "*" and ".".
 * @returns {string} Concatenation of every accepted (lower-case) character.
 */
function getAcceptedChars({
  isOligo,
  isProtein,
  isRna,
  isMixedRnaAndDna,
  includeStopCodon
} = {}) {
  if (isProtein) {
    // No trailing "}" here: a stray brace in the template literal would
    // make "}" an accepted amino-acid character.
    return `${protein_letters_withUandX.toLowerCase()}${includeStopCodon ? "*." : ""}`;
  }
  const rnaLetters = ambiguous_rna_letters.toLowerCase();
  if (isOligo || isRna) {
    // "t" is accepted on input as well as "u".
    return rnaLetters + "t";
  }
  // Mixed RNA/DNA and plain old DNA: union of both nucleotide alphabets.
  return rnaLetters + ambiguous_dna_letters.toLowerCase();
}
11348
+ __name(getAcceptedChars, "getAcceptedChars");
11349
/**
 * Returns the map of lower-case characters that should be substituted
 * (rather than kept or dropped) for a sequence of the given kind.
 *
 * Only RNA sequences that are neither protein nor oligo have a
 * substitution: "t" becomes "u". Every other kind gets an empty map.
 *
 * @param {object} [options]
 * @param {boolean} [options.isOligo]
 * @param {boolean} [options.isProtein]
 * @param {boolean} [options.isRna]
 * @param {boolean} [options.isMixedRnaAndDna]
 * @returns {Object<string, string>} Lower-case source char -> replacement.
 */
function getReplaceChars({
  isOligo,
  isProtein,
  isRna,
  isMixedRnaAndDna
} = {}) {
  // Protein and oligo take precedence over the RNA flag.
  if (isProtein || isOligo) {
    return {};
  }
  if (isRna) {
    return { t: "u" };
  }
  // Mixed RNA/DNA and plain old DNA: nothing to replace.
  return {};
}
11360
+ __name(getReplaceChars, "getReplaceChars");
11288
11361
  function tidyUpAnnotation(_annotation, {
11289
11362
  sequenceData = {},
11290
11363
  convertAnnotationsFromAAIndices,
@@ -11413,14 +11486,6 @@ function coerceLocation({
11413
11486
  }
11414
11487
  }
11415
11488
  __name(coerceLocation, "coerceLocation");
11416
- function filterAminoAcidSequenceString(sequenceString, options) {
11417
- options = options || {};
11418
- if (options.includeStopCodon) {
11419
- return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu.*]/gi, "");
11420
- }
11421
- return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu]/gi, "");
11422
- }
11423
- __name(filterAminoAcidSequenceString, "filterAminoAcidSequenceString");
11424
11489
  function getDegenerateDnaStringFromAAString(aaString) {
11425
11490
  return aaString.split("").map((char) => aminoAcidToDegenerateDnaMap[char.toLowerCase()] || "nnn").join("");
11426
11491
  }
@@ -11432,11 +11497,10 @@ function tidyUpSequenceData(pSeqData, options = {}) {
11432
11497
  removeUnwantedChars,
11433
11498
  additionalValidChars,
11434
11499
  noTranslationData,
11435
- charOverrides,
11436
11500
  doNotProvideIdsForAnnotations,
11437
- proteinFilterOptions,
11438
11501
  noCdsTranslations,
11439
- convertAnnotationsFromAAIndices
11502
+ convertAnnotationsFromAAIndices,
11503
+ topLevelSeqData
11440
11504
  } = options;
11441
11505
  let seqData = lodashExports.cloneDeep(pSeqData);
11442
11506
  const response = {
@@ -11466,16 +11530,15 @@ function tidyUpSequenceData(pSeqData, options = {}) {
11466
11530
  }
11467
11531
  if (removeUnwantedChars) {
11468
11532
  if (seqData.isProtein) {
11469
- seqData.proteinSequence = filterAminoAcidSequenceString(
11470
- seqData.proteinSequence,
11471
- __spreadValues({ includeStopCodon: true }, proteinFilterOptions)
11472
- );
11533
+ const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadValues({
11534
+ includeStopCodon: true
11535
+ }, topLevelSeqData || seqData));
11536
+ seqData.proteinSequence = newSeq;
11473
11537
  } else {
11474
- seqData.sequence = filterSequenceString(
11475
- seqData.sequence,
11476
- `${additionalValidChars || ""}${seqData.isRna || seqData.isMixedRnaAndDna ? "u" : ""}`,
11477
- charOverrides
11478
- );
11538
+ const [newSeq] = filterSequenceString(seqData.sequence, __spreadValues({
11539
+ additionalValidChars
11540
+ }, topLevelSeqData || seqData));
11541
+ seqData.sequence = newSeq;
11479
11542
  }
11480
11543
  }
11481
11544
  if (seqData.isProtein) {
@@ -19310,7 +19373,8 @@ function validateSequence(sequence, options = {}) {
19310
19373
  inclusive1BasedEnd,
19311
19374
  additionalValidChars,
19312
19375
  allowOverflowAnnotations,
19313
- coerceFeatureTypes
19376
+ coerceFeatureTypes,
19377
+ includeStopCodon
19314
19378
  } = options;
19315
19379
  [
19316
19380
  "isDNA",
@@ -19360,7 +19424,6 @@ function validateSequence(sequence, options = {}) {
19360
19424
  response.messages.push("No sequence detected");
19361
19425
  sequence.sequence = "";
19362
19426
  }
19363
- let validChars;
19364
19427
  if (sequence.isProtein === void 0 && guessIfProtein) {
19365
19428
  sequence.isProtein = !guessIfSequenceIsDnaAndNotProtein(
19366
19429
  sequence.sequence,
@@ -19368,12 +19431,15 @@ function validateSequence(sequence, options = {}) {
19368
19431
  );
19369
19432
  }
19370
19433
  if (sequence.isProtein) {
19371
- validChars = filterAminoAcidSequenceString(sequence.sequence);
19434
+ const [validChars, warnings] = filterSequenceString(sequence.sequence, {
19435
+ name: sequence.name,
19436
+ isProtein: true,
19437
+ additionalValidChars,
19438
+ includeStopCodon
19439
+ });
19372
19440
  if (validChars !== sequence.sequence) {
19373
19441
  sequence.sequence = validChars;
19374
- response.messages.push(
19375
- "Import Error: Illegal character(s) detected and removed from amino acid sequence. Allowed characters are: xtgalmfwkqespvicyhrndu"
19376
- );
19442
+ response.messages.push(...warnings);
19377
19443
  }
19378
19444
  sequence.type = "PROTEIN";
19379
19445
  sequence.isProtein = true;
@@ -19395,12 +19461,12 @@ function validateSequence(sequence, options = {}) {
19395
19461
  } else {
19396
19462
  sequence.type = "DNA";
19397
19463
  }
19398
- validChars = filterSequenceString(sequence.sequence, additionalValidChars);
19464
+ const [validChars, warnings] = filterSequenceString(sequence.sequence, __spreadValues({
19465
+ additionalValidChars
19466
+ }, sequence));
19399
19467
  if (validChars !== sequence.sequence) {
19400
19468
  sequence.sequence = validChars;
19401
- response.messages.push(
19402
- "Import Error: Illegal character(s) detected and removed from sequence. Allowed characters are: atgcyrswkmbvdhn"
19403
- );
19469
+ response.messages.push(...warnings);
19404
19470
  }
19405
19471
  }
19406
19472
  if (!sequence.size) {
package/index.umd.js CHANGED
@@ -6172,7 +6172,9 @@ var __async = (__this, __arguments, generator) => {
6172
6172
  })(lodash, lodash.exports);
6173
6173
  var lodashExports = lodash.exports;
6174
6174
  const _ = /* @__PURE__ */ getDefaultExportFromCjs(lodashExports);
6175
+ const protein_letters_withUandX = "ACDEFGHIKLMNPQRSTVWYUX";
6175
6176
  const ambiguous_dna_letters = "GATCRYWSMKHBVDN";
6177
+ const ambiguous_rna_letters = "GAUCRYWSMKHBVDN";
6176
6178
  const aminoAcidToDegenerateDnaMap = {
6177
6179
  "-": "---",
6178
6180
  ".": "...",
@@ -11275,20 +11277,91 @@ var __async = (__this, __arguments, generator) => {
11275
11277
  "primers",
11276
11278
  "guides"
11277
11279
  ];
11278
- function filterSequenceString(sequenceString, additionalValidChars = "", charOverrides) {
11279
- if (sequenceString) {
11280
- return sequenceString.replace(
11281
- new RegExp(
11282
- `[^${charOverrides || `atgcyrswkmbvdhnu${additionalValidChars.split("").join("\\")}`}]`,
11283
- "gi"
11284
- ),
11285
- ""
11280
/**
 * Strips (and, where a replacement is defined, substitutes) characters that
 * are not valid for the given kind of sequence.
 *
 * The accepted alphabet comes from `getAcceptedChars` and the substitution
 * map from `getReplaceChars`; both are selected by the same type flags.
 * Matching is case-insensitive and the original letter case is preserved in
 * the output (a replaced upper-case letter yields an upper-case replacement).
 *
 * Side effect: when running in a browser that exposes `window.toastr`, every
 * warning is also shown as a toast.
 *
 * @param {string} sequenceString - Raw sequence text. Falsy input
 *   (null/undefined/"") yields `["", []]` instead of throwing.
 * @param {object} [options]
 * @param {string} [options.additionalValidChars=""] - Extra characters to
 *   keep, matched case-insensitively.
 * @param {boolean} [options.isOligo]
 * @param {string} [options.name] - Only used to prefix the invalid-character
 *   warning (bound locally as `name2` in this bundle).
 * @param {boolean} [options.isProtein]
 * @param {boolean} [options.isRna]
 * @param {boolean} [options.isMixedRnaAndDna]
 * @param {boolean} [options.includeStopCodon] - Forwarded to `getAcceptedChars`.
 * @returns {[string, string[]]} Tuple of the sanitized sequence and the
 *   list of human-readable warnings.
 */
function filterSequenceString(sequenceString, {
  additionalValidChars = "",
  isOligo,
  name: name2,
  isProtein,
  isRna,
  isMixedRnaAndDna,
  includeStopCodon
} = {}) {
  // Nothing to filter; avoids a TypeError on null/undefined input.
  if (!sequenceString) {
    return ["", []];
  }
  const acceptedChars = getAcceptedChars({
    isOligo,
    isProtein,
    isRna,
    isMixedRnaAndDna,
    includeStopCodon
  });
  const replaceChars = getReplaceChars({
    isOligo,
    isProtein,
    isRna,
    isMixedRnaAndDna
  }) || {};
  // Lower-cased lookup set. The additional characters are added verbatim:
  // no regex is built here, so they must not be backslash-escaped (doing so
  // would silently whitelist "\" itself).
  const validChars = new Set(
    `${acceptedChars}${additionalValidChars || ""}`.toLowerCase()
  );
  const warnings = [];
  const invalidChars = [];
  const replaceCount = {};
  let sanitizedVal = "";
  for (const letter of sequenceString) {
    const lowerLetter = letter.toLowerCase();
    const replacement = replaceChars[lowerLetter];
    if (replacement) {
      replaceCount[lowerLetter] = (replaceCount[lowerLetter] || 0) + 1;
      const isUpper = lowerLetter !== letter;
      sanitizedVal += isUpper ? replacement.toUpperCase() : replacement;
    } else if (validChars.has(lowerLetter)) {
      sanitizedVal += letter;
    } else {
      invalidChars.push(letter);
    }
  }
  Object.keys(replaceCount).forEach((letter) => {
    warnings.push(
      `Replaced "${letter}" with "${replaceChars[letter]}"${replaceCount[letter] > 1 ? ` ${replaceCount[letter]} times` : ""}`
    );
  });
  if (invalidChars.length > 0) {
    // Only the first 100 offenders are listed to keep the message bounded.
    warnings.push(
      `${name2 ? `Sequence ${name2}: ` : ""}Invalid character(s) detected and removed: ${invalidChars.slice(0, 100).join(", ")} `
    );
  }
  if (typeof window !== "undefined" && window.toastr && warnings.length) {
    warnings.forEach((warning) => {
      window.toastr.warning(warning);
    });
  }
  return [sanitizedVal, warnings];
}
11291
11339
  __name(filterSequenceString, "filterSequenceString");
11340
/**
 * Returns the lower-case alphabet accepted for a sequence of the given kind.
 *
 * Precedence of the flags is: protein, then oligo / RNA, then everything
 * else (mixed RNA+DNA and plain DNA share the same alphabet).
 *
 * @param {object} [options]
 * @param {boolean} [options.isOligo]
 * @param {boolean} [options.isProtein]
 * @param {boolean} [options.isRna]
 * @param {boolean} [options.isMixedRnaAndDna] - Accepted for interface
 *   symmetry; mixed sequences use the same alphabet as plain DNA.
 * @param {boolean} [options.includeStopCodon] - For proteins, also accept
 *   the stop symbols "*" and ".".
 * @returns {string} Concatenation of every accepted (lower-case) character.
 */
function getAcceptedChars({
  isOligo,
  isProtein,
  isRna,
  isMixedRnaAndDna,
  includeStopCodon
} = {}) {
  if (isProtein) {
    // No trailing "}" here: a stray brace in the template literal would
    // make "}" an accepted amino-acid character.
    return `${protein_letters_withUandX.toLowerCase()}${includeStopCodon ? "*." : ""}`;
  }
  const rnaLetters = ambiguous_rna_letters.toLowerCase();
  if (isOligo || isRna) {
    // "t" is accepted on input as well as "u".
    return rnaLetters + "t";
  }
  // Mixed RNA/DNA and plain old DNA: union of both nucleotide alphabets.
  return rnaLetters + ambiguous_dna_letters.toLowerCase();
}
11352
+ __name(getAcceptedChars, "getAcceptedChars");
11353
/**
 * Returns the map of lower-case characters that should be substituted
 * (rather than kept or dropped) for a sequence of the given kind.
 *
 * Only RNA sequences that are neither protein nor oligo have a
 * substitution: "t" becomes "u". Every other kind gets an empty map.
 *
 * @param {object} [options]
 * @param {boolean} [options.isOligo]
 * @param {boolean} [options.isProtein]
 * @param {boolean} [options.isRna]
 * @param {boolean} [options.isMixedRnaAndDna]
 * @returns {Object<string, string>} Lower-case source char -> replacement.
 */
function getReplaceChars({
  isOligo,
  isProtein,
  isRna,
  isMixedRnaAndDna
} = {}) {
  // Protein and oligo take precedence over the RNA flag.
  if (isProtein || isOligo) {
    return {};
  }
  if (isRna) {
    return { t: "u" };
  }
  // Mixed RNA/DNA and plain old DNA: nothing to replace.
  return {};
}
11364
+ __name(getReplaceChars, "getReplaceChars");
11292
11365
  function tidyUpAnnotation(_annotation, {
11293
11366
  sequenceData = {},
11294
11367
  convertAnnotationsFromAAIndices,
@@ -11417,14 +11490,6 @@ var __async = (__this, __arguments, generator) => {
11417
11490
  }
11418
11491
  }
11419
11492
  __name(coerceLocation, "coerceLocation");
11420
- function filterAminoAcidSequenceString(sequenceString, options) {
11421
- options = options || {};
11422
- if (options.includeStopCodon) {
11423
- return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu.*]/gi, "");
11424
- }
11425
- return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu]/gi, "");
11426
- }
11427
- __name(filterAminoAcidSequenceString, "filterAminoAcidSequenceString");
11428
11493
  function getDegenerateDnaStringFromAAString(aaString) {
11429
11494
  return aaString.split("").map((char) => aminoAcidToDegenerateDnaMap[char.toLowerCase()] || "nnn").join("");
11430
11495
  }
@@ -11436,11 +11501,10 @@ var __async = (__this, __arguments, generator) => {
11436
11501
  removeUnwantedChars,
11437
11502
  additionalValidChars,
11438
11503
  noTranslationData,
11439
- charOverrides,
11440
11504
  doNotProvideIdsForAnnotations,
11441
- proteinFilterOptions,
11442
11505
  noCdsTranslations,
11443
- convertAnnotationsFromAAIndices
11506
+ convertAnnotationsFromAAIndices,
11507
+ topLevelSeqData
11444
11508
  } = options;
11445
11509
  let seqData = lodashExports.cloneDeep(pSeqData);
11446
11510
  const response = {
@@ -11470,16 +11534,15 @@ var __async = (__this, __arguments, generator) => {
11470
11534
  }
11471
11535
  if (removeUnwantedChars) {
11472
11536
  if (seqData.isProtein) {
11473
- seqData.proteinSequence = filterAminoAcidSequenceString(
11474
- seqData.proteinSequence,
11475
- __spreadValues({ includeStopCodon: true }, proteinFilterOptions)
11476
- );
11537
+ const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadValues({
11538
+ includeStopCodon: true
11539
+ }, topLevelSeqData || seqData));
11540
+ seqData.proteinSequence = newSeq;
11477
11541
  } else {
11478
- seqData.sequence = filterSequenceString(
11479
- seqData.sequence,
11480
- `${additionalValidChars || ""}${seqData.isRna || seqData.isMixedRnaAndDna ? "u" : ""}`,
11481
- charOverrides
11482
- );
11542
+ const [newSeq] = filterSequenceString(seqData.sequence, __spreadValues({
11543
+ additionalValidChars
11544
+ }, topLevelSeqData || seqData));
11545
+ seqData.sequence = newSeq;
11483
11546
  }
11484
11547
  }
11485
11548
  if (seqData.isProtein) {
@@ -19314,7 +19377,8 @@ var __async = (__this, __arguments, generator) => {
19314
19377
  inclusive1BasedEnd,
19315
19378
  additionalValidChars,
19316
19379
  allowOverflowAnnotations,
19317
- coerceFeatureTypes
19380
+ coerceFeatureTypes,
19381
+ includeStopCodon
19318
19382
  } = options;
19319
19383
  [
19320
19384
  "isDNA",
@@ -19364,7 +19428,6 @@ var __async = (__this, __arguments, generator) => {
19364
19428
  response.messages.push("No sequence detected");
19365
19429
  sequence.sequence = "";
19366
19430
  }
19367
- let validChars;
19368
19431
  if (sequence.isProtein === void 0 && guessIfProtein) {
19369
19432
  sequence.isProtein = !guessIfSequenceIsDnaAndNotProtein(
19370
19433
  sequence.sequence,
@@ -19372,12 +19435,15 @@ var __async = (__this, __arguments, generator) => {
19372
19435
  );
19373
19436
  }
19374
19437
  if (sequence.isProtein) {
19375
- validChars = filterAminoAcidSequenceString(sequence.sequence);
19438
+ const [validChars, warnings] = filterSequenceString(sequence.sequence, {
19439
+ name: sequence.name,
19440
+ isProtein: true,
19441
+ additionalValidChars,
19442
+ includeStopCodon
19443
+ });
19376
19444
  if (validChars !== sequence.sequence) {
19377
19445
  sequence.sequence = validChars;
19378
- response.messages.push(
19379
- "Import Error: Illegal character(s) detected and removed from amino acid sequence. Allowed characters are: xtgalmfwkqespvicyhrndu"
19380
- );
19446
+ response.messages.push(...warnings);
19381
19447
  }
19382
19448
  sequence.type = "PROTEIN";
19383
19449
  sequence.isProtein = true;
@@ -19399,12 +19465,12 @@ var __async = (__this, __arguments, generator) => {
19399
19465
  } else {
19400
19466
  sequence.type = "DNA";
19401
19467
  }
19402
- validChars = filterSequenceString(sequence.sequence, additionalValidChars);
19468
+ const [validChars, warnings] = filterSequenceString(sequence.sequence, __spreadValues({
19469
+ additionalValidChars
19470
+ }, sequence));
19403
19471
  if (validChars !== sequence.sequence) {
19404
19472
  sequence.sequence = validChars;
19405
- response.messages.push(
19406
- "Import Error: Illegal character(s) detected and removed from sequence. Allowed characters are: atgcyrswkmbvdhn"
19407
- );
19473
+ response.messages.push(...warnings);
19408
19474
  }
19409
19475
  }
19410
19476
  if (!sequence.size) {
package/package.json CHANGED
@@ -1,8 +1,8 @@
1
1
  {
2
2
  "name": "@teselagen/bio-parsers",
3
- "version": "0.3.10",
3
+ "version": "0.4.1",
4
4
  "dependencies": {
5
- "@teselagen/sequence-utils": "0.3.8",
5
+ "@teselagen/sequence-utils": "0.3.9",
6
6
  "@teselagen/range-utils": "0.3.7",
7
7
  "@gmod/gff": "^1.2.1",
8
8
  "buffer": "^6.0.3",
@@ -1,7 +1,6 @@
1
1
  import areNonNegativeIntegers from "validate.io-nonnegative-integer-array";
2
2
  import { getFeatureTypes } from "@teselagen/sequence-utils";
3
3
  import {
4
- filterAminoAcidSequenceString,
5
4
  filterSequenceString,
6
5
  guessIfSequenceIsDnaAndNotProtein
7
6
  } from "@teselagen/sequence-utils";
@@ -30,7 +29,8 @@ export default function validateSequence(sequence, options = {}) {
30
29
  inclusive1BasedEnd,
31
30
  additionalValidChars,
32
31
  allowOverflowAnnotations,
33
- coerceFeatureTypes
32
+ coerceFeatureTypes,
33
+ includeStopCodon
34
34
  } = options;
35
35
  [
36
36
  "isDNA",
@@ -84,7 +84,7 @@ export default function validateSequence(sequence, options = {}) {
84
84
  response.messages.push("No sequence detected");
85
85
  sequence.sequence = "";
86
86
  }
87
- let validChars;
87
+
88
88
  if (sequence.isProtein === undefined && guessIfProtein) {
89
89
  sequence.isProtein = !guessIfSequenceIsDnaAndNotProtein(
90
90
  sequence.sequence,
@@ -93,12 +93,15 @@ export default function validateSequence(sequence, options = {}) {
93
93
  }
94
94
  if (sequence.isProtein) {
95
95
  //tnr: add code to strip invalid protein data..
96
- validChars = filterAminoAcidSequenceString(sequence.sequence);
96
+ const [validChars, warnings] = filterSequenceString(sequence.sequence, {
97
+ name: sequence.name,
98
+ isProtein: true,
99
+ additionalValidChars,
100
+ includeStopCodon
101
+ });
97
102
  if (validChars !== sequence.sequence) {
98
103
  sequence.sequence = validChars;
99
- response.messages.push(
100
- "Import Error: Illegal character(s) detected and removed from amino acid sequence. Allowed characters are: xtgalmfwkqespvicyhrndu"
101
- );
104
+ response.messages.push(...warnings);
102
105
  }
103
106
  sequence.type = "PROTEIN";
104
107
  sequence.isProtein = true;
@@ -126,12 +129,13 @@ export default function validateSequence(sequence, options = {}) {
126
129
  sequence.type = "DNA";
127
130
  }
128
131
 
129
- validChars = filterSequenceString(sequence.sequence, additionalValidChars);
132
+ const [validChars, warnings] = filterSequenceString(sequence.sequence, {
133
+ additionalValidChars,
134
+ ...sequence
135
+ });
130
136
  if (validChars !== sequence.sequence) {
131
137
  sequence.sequence = validChars;
132
- response.messages.push(
133
- "Import Error: Illegal character(s) detected and removed from sequence. Allowed characters are: atgcyrswkmbvdhn"
134
- );
138
+ response.messages.push(...warnings);
135
139
  }
136
140
  }
137
141