@teselagen/bio-parsers 0.3.10 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +100 -41
- package/index.mjs +100 -41
- package/index.umd.js +100 -41
- package/package.json +2 -2
- package/src/utils/validateSequence.js +13 -11
package/index.js
CHANGED
|
@@ -6170,7 +6170,9 @@ lodash.exports;
|
|
|
6170
6170
|
})(lodash, lodash.exports);
|
|
6171
6171
|
var lodashExports = lodash.exports;
|
|
6172
6172
|
const _ = /* @__PURE__ */ getDefaultExportFromCjs(lodashExports);
|
|
6173
|
+
const extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO*";
|
|
6173
6174
|
const ambiguous_dna_letters = "GATCRYWSMKHBVDN";
|
|
6175
|
+
const ambiguous_rna_letters = "GAUCRYWSMKHBVDN";
|
|
6174
6176
|
const aminoAcidToDegenerateDnaMap = {
|
|
6175
6177
|
"-": "---",
|
|
6176
6178
|
".": "...",
|
|
@@ -11273,20 +11275,88 @@ const annotationTypes = [
|
|
|
11273
11275
|
"primers",
|
|
11274
11276
|
"guides"
|
|
11275
11277
|
];
|
|
11276
|
-
function filterSequenceString(sequenceString,
|
|
11277
|
-
|
|
11278
|
-
|
|
11279
|
-
|
|
11280
|
-
|
|
11281
|
-
|
|
11282
|
-
|
|
11283
|
-
|
|
11278
|
+
function filterSequenceString(sequenceString, {
|
|
11279
|
+
additionalValidChars = "",
|
|
11280
|
+
isOligo,
|
|
11281
|
+
name,
|
|
11282
|
+
isProtein,
|
|
11283
|
+
isRna,
|
|
11284
|
+
isMixedRnaAndDna
|
|
11285
|
+
} = {}) {
|
|
11286
|
+
const acceptedChars = getAcceptedChars({
|
|
11287
|
+
isOligo,
|
|
11288
|
+
isProtein,
|
|
11289
|
+
isRna,
|
|
11290
|
+
isMixedRnaAndDna
|
|
11291
|
+
});
|
|
11292
|
+
const replaceChars = getReplaceChars({
|
|
11293
|
+
isOligo,
|
|
11294
|
+
isProtein,
|
|
11295
|
+
isRna,
|
|
11296
|
+
isMixedRnaAndDna
|
|
11297
|
+
});
|
|
11298
|
+
let sanitizedVal = "";
|
|
11299
|
+
const invalidChars = [];
|
|
11300
|
+
const chars = `${acceptedChars}${additionalValidChars.split("").join("\\")}`;
|
|
11301
|
+
const warnings = [];
|
|
11302
|
+
const replaceCount = {};
|
|
11303
|
+
sequenceString.split("").forEach((letter) => {
|
|
11304
|
+
const lowerLetter = letter.toLowerCase();
|
|
11305
|
+
if (replaceChars && replaceChars[lowerLetter]) {
|
|
11306
|
+
if (!replaceCount[lowerLetter]) {
|
|
11307
|
+
replaceCount[lowerLetter] = 0;
|
|
11308
|
+
}
|
|
11309
|
+
replaceCount[lowerLetter]++;
|
|
11310
|
+
const isUpper = lowerLetter !== letter;
|
|
11311
|
+
sanitizedVal += isUpper ? replaceChars[lowerLetter].toUpperCase() : replaceChars[lowerLetter];
|
|
11312
|
+
} else if (chars.includes(lowerLetter)) {
|
|
11313
|
+
sanitizedVal += letter;
|
|
11314
|
+
} else {
|
|
11315
|
+
invalidChars.push(letter);
|
|
11316
|
+
}
|
|
11317
|
+
});
|
|
11318
|
+
Object.keys(replaceCount).forEach((letter) => {
|
|
11319
|
+
warnings.push(
|
|
11320
|
+
`Replaced "${letter}" with "${replaceChars[letter]}"${replaceCount[letter] > 1 ? ` ${replaceCount[letter]} times` : ""}`
|
|
11321
|
+
);
|
|
11322
|
+
});
|
|
11323
|
+
if (sequenceString.length !== sanitizedVal.length) {
|
|
11324
|
+
warnings.push(
|
|
11325
|
+
`${name ? `Sequence ${name}: ` : ""}Invalid character(s) detected and removed: ${invalidChars.slice(0, 100).join(", ")} `
|
|
11284
11326
|
);
|
|
11285
|
-
} else {
|
|
11286
|
-
return sequenceString;
|
|
11287
11327
|
}
|
|
11328
|
+
if (typeof window !== "undefined" && window.toastr && warnings.length) {
|
|
11329
|
+
warnings.forEach((warning) => {
|
|
11330
|
+
window.toastr.warning(warning);
|
|
11331
|
+
});
|
|
11332
|
+
}
|
|
11333
|
+
return [sanitizedVal, warnings];
|
|
11288
11334
|
}
|
|
11289
11335
|
__name(filterSequenceString, "filterSequenceString");
|
|
11336
|
+
function getAcceptedChars({
|
|
11337
|
+
isOligo,
|
|
11338
|
+
isProtein,
|
|
11339
|
+
isRna,
|
|
11340
|
+
isMixedRnaAndDna
|
|
11341
|
+
} = {}) {
|
|
11342
|
+
return isProtein ? `${extended_protein_letters.toLowerCase()}}` : isOligo ? ambiguous_rna_letters.toLowerCase() + "t" : isRna ? ambiguous_rna_letters.toLowerCase() + "t" : isMixedRnaAndDna ? ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase() : (
|
|
11343
|
+
//just plain old dna
|
|
11344
|
+
ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase()
|
|
11345
|
+
);
|
|
11346
|
+
}
|
|
11347
|
+
__name(getAcceptedChars, "getAcceptedChars");
|
|
11348
|
+
function getReplaceChars({
|
|
11349
|
+
isOligo,
|
|
11350
|
+
isProtein,
|
|
11351
|
+
isRna,
|
|
11352
|
+
isMixedRnaAndDna
|
|
11353
|
+
} = {}) {
|
|
11354
|
+
return isProtein ? {} : isOligo ? {} : isRna ? { t: "u" } : isMixedRnaAndDna ? {} : (
|
|
11355
|
+
//just plain old dna
|
|
11356
|
+
{}
|
|
11357
|
+
);
|
|
11358
|
+
}
|
|
11359
|
+
__name(getReplaceChars, "getReplaceChars");
|
|
11290
11360
|
function tidyUpAnnotation(_annotation, {
|
|
11291
11361
|
sequenceData = {},
|
|
11292
11362
|
convertAnnotationsFromAAIndices,
|
|
@@ -11415,14 +11485,6 @@ function coerceLocation({
|
|
|
11415
11485
|
}
|
|
11416
11486
|
}
|
|
11417
11487
|
__name(coerceLocation, "coerceLocation");
|
|
11418
|
-
function filterAminoAcidSequenceString(sequenceString, options) {
|
|
11419
|
-
options = options || {};
|
|
11420
|
-
if (options.includeStopCodon) {
|
|
11421
|
-
return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu.*]/gi, "");
|
|
11422
|
-
}
|
|
11423
|
-
return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu]/gi, "");
|
|
11424
|
-
}
|
|
11425
|
-
__name(filterAminoAcidSequenceString, "filterAminoAcidSequenceString");
|
|
11426
11488
|
function getDegenerateDnaStringFromAAString(aaString) {
|
|
11427
11489
|
return aaString.split("").map((char) => aminoAcidToDegenerateDnaMap[char.toLowerCase()] || "nnn").join("");
|
|
11428
11490
|
}
|
|
@@ -11431,14 +11493,13 @@ function tidyUpSequenceData(pSeqData, options = {}) {
|
|
|
11431
11493
|
const {
|
|
11432
11494
|
annotationsAsObjects,
|
|
11433
11495
|
logMessages,
|
|
11434
|
-
|
|
11496
|
+
doNotRemoveInvalidChars,
|
|
11435
11497
|
additionalValidChars,
|
|
11436
11498
|
noTranslationData,
|
|
11437
|
-
charOverrides,
|
|
11438
11499
|
doNotProvideIdsForAnnotations,
|
|
11439
|
-
proteinFilterOptions,
|
|
11440
11500
|
noCdsTranslations,
|
|
11441
|
-
convertAnnotationsFromAAIndices
|
|
11501
|
+
convertAnnotationsFromAAIndices,
|
|
11502
|
+
topLevelSeqData
|
|
11442
11503
|
} = options;
|
|
11443
11504
|
let seqData = lodashExports.cloneDeep(pSeqData);
|
|
11444
11505
|
const response = {
|
|
@@ -11466,18 +11527,15 @@ function tidyUpSequenceData(pSeqData, options = {}) {
|
|
|
11466
11527
|
if (seqData.isRna) {
|
|
11467
11528
|
seqData.sequence = seqData.sequence.replace(/t/gi, "u");
|
|
11468
11529
|
}
|
|
11469
|
-
if (
|
|
11530
|
+
if (!doNotRemoveInvalidChars) {
|
|
11470
11531
|
if (seqData.isProtein) {
|
|
11471
|
-
seqData.proteinSequence
|
|
11472
|
-
|
|
11473
|
-
__spreadValues({ includeStopCodon: true }, proteinFilterOptions)
|
|
11474
|
-
);
|
|
11532
|
+
const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadValues({}, topLevelSeqData || seqData));
|
|
11533
|
+
seqData.proteinSequence = newSeq;
|
|
11475
11534
|
} else {
|
|
11476
|
-
|
|
11477
|
-
|
|
11478
|
-
|
|
11479
|
-
|
|
11480
|
-
);
|
|
11535
|
+
const [newSeq] = filterSequenceString(seqData.sequence, __spreadValues({
|
|
11536
|
+
additionalValidChars
|
|
11537
|
+
}, topLevelSeqData || seqData));
|
|
11538
|
+
seqData.sequence = newSeq;
|
|
11481
11539
|
}
|
|
11482
11540
|
}
|
|
11483
11541
|
if (seqData.isProtein) {
|
|
@@ -19362,7 +19420,6 @@ function validateSequence(sequence, options = {}) {
|
|
|
19362
19420
|
response.messages.push("No sequence detected");
|
|
19363
19421
|
sequence.sequence = "";
|
|
19364
19422
|
}
|
|
19365
|
-
let validChars;
|
|
19366
19423
|
if (sequence.isProtein === void 0 && guessIfProtein) {
|
|
19367
19424
|
sequence.isProtein = !guessIfSequenceIsDnaAndNotProtein(
|
|
19368
19425
|
sequence.sequence,
|
|
@@ -19370,12 +19427,14 @@ function validateSequence(sequence, options = {}) {
|
|
|
19370
19427
|
);
|
|
19371
19428
|
}
|
|
19372
19429
|
if (sequence.isProtein) {
|
|
19373
|
-
validChars =
|
|
19430
|
+
const [validChars, warnings] = filterSequenceString(sequence.sequence, {
|
|
19431
|
+
name: sequence.name,
|
|
19432
|
+
isProtein: true,
|
|
19433
|
+
additionalValidChars
|
|
19434
|
+
});
|
|
19374
19435
|
if (validChars !== sequence.sequence) {
|
|
19375
19436
|
sequence.sequence = validChars;
|
|
19376
|
-
response.messages.push(
|
|
19377
|
-
"Import Error: Illegal character(s) detected and removed from amino acid sequence. Allowed characters are: xtgalmfwkqespvicyhrndu"
|
|
19378
|
-
);
|
|
19437
|
+
response.messages.push(...warnings);
|
|
19379
19438
|
}
|
|
19380
19439
|
sequence.type = "PROTEIN";
|
|
19381
19440
|
sequence.isProtein = true;
|
|
@@ -19397,12 +19456,12 @@ function validateSequence(sequence, options = {}) {
|
|
|
19397
19456
|
} else {
|
|
19398
19457
|
sequence.type = "DNA";
|
|
19399
19458
|
}
|
|
19400
|
-
validChars = filterSequenceString(sequence.sequence,
|
|
19459
|
+
const [validChars, warnings] = filterSequenceString(sequence.sequence, __spreadValues({
|
|
19460
|
+
additionalValidChars
|
|
19461
|
+
}, sequence));
|
|
19401
19462
|
if (validChars !== sequence.sequence) {
|
|
19402
19463
|
sequence.sequence = validChars;
|
|
19403
|
-
response.messages.push(
|
|
19404
|
-
"Import Error: Illegal character(s) detected and removed from sequence. Allowed characters are: atgcyrswkmbvdhn"
|
|
19405
|
-
);
|
|
19464
|
+
response.messages.push(...warnings);
|
|
19406
19465
|
}
|
|
19407
19466
|
}
|
|
19408
19467
|
if (!sequence.size) {
|
package/index.mjs
CHANGED
|
@@ -6168,7 +6168,9 @@ lodash.exports;
|
|
|
6168
6168
|
})(lodash, lodash.exports);
|
|
6169
6169
|
var lodashExports = lodash.exports;
|
|
6170
6170
|
const _ = /* @__PURE__ */ getDefaultExportFromCjs(lodashExports);
|
|
6171
|
+
const extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO*";
|
|
6171
6172
|
const ambiguous_dna_letters = "GATCRYWSMKHBVDN";
|
|
6173
|
+
const ambiguous_rna_letters = "GAUCRYWSMKHBVDN";
|
|
6172
6174
|
const aminoAcidToDegenerateDnaMap = {
|
|
6173
6175
|
"-": "---",
|
|
6174
6176
|
".": "...",
|
|
@@ -11271,20 +11273,88 @@ const annotationTypes = [
|
|
|
11271
11273
|
"primers",
|
|
11272
11274
|
"guides"
|
|
11273
11275
|
];
|
|
11274
|
-
function filterSequenceString(sequenceString,
|
|
11275
|
-
|
|
11276
|
-
|
|
11277
|
-
|
|
11278
|
-
|
|
11279
|
-
|
|
11280
|
-
|
|
11281
|
-
|
|
11276
|
+
function filterSequenceString(sequenceString, {
|
|
11277
|
+
additionalValidChars = "",
|
|
11278
|
+
isOligo,
|
|
11279
|
+
name,
|
|
11280
|
+
isProtein,
|
|
11281
|
+
isRna,
|
|
11282
|
+
isMixedRnaAndDna
|
|
11283
|
+
} = {}) {
|
|
11284
|
+
const acceptedChars = getAcceptedChars({
|
|
11285
|
+
isOligo,
|
|
11286
|
+
isProtein,
|
|
11287
|
+
isRna,
|
|
11288
|
+
isMixedRnaAndDna
|
|
11289
|
+
});
|
|
11290
|
+
const replaceChars = getReplaceChars({
|
|
11291
|
+
isOligo,
|
|
11292
|
+
isProtein,
|
|
11293
|
+
isRna,
|
|
11294
|
+
isMixedRnaAndDna
|
|
11295
|
+
});
|
|
11296
|
+
let sanitizedVal = "";
|
|
11297
|
+
const invalidChars = [];
|
|
11298
|
+
const chars = `${acceptedChars}${additionalValidChars.split("").join("\\")}`;
|
|
11299
|
+
const warnings = [];
|
|
11300
|
+
const replaceCount = {};
|
|
11301
|
+
sequenceString.split("").forEach((letter) => {
|
|
11302
|
+
const lowerLetter = letter.toLowerCase();
|
|
11303
|
+
if (replaceChars && replaceChars[lowerLetter]) {
|
|
11304
|
+
if (!replaceCount[lowerLetter]) {
|
|
11305
|
+
replaceCount[lowerLetter] = 0;
|
|
11306
|
+
}
|
|
11307
|
+
replaceCount[lowerLetter]++;
|
|
11308
|
+
const isUpper = lowerLetter !== letter;
|
|
11309
|
+
sanitizedVal += isUpper ? replaceChars[lowerLetter].toUpperCase() : replaceChars[lowerLetter];
|
|
11310
|
+
} else if (chars.includes(lowerLetter)) {
|
|
11311
|
+
sanitizedVal += letter;
|
|
11312
|
+
} else {
|
|
11313
|
+
invalidChars.push(letter);
|
|
11314
|
+
}
|
|
11315
|
+
});
|
|
11316
|
+
Object.keys(replaceCount).forEach((letter) => {
|
|
11317
|
+
warnings.push(
|
|
11318
|
+
`Replaced "${letter}" with "${replaceChars[letter]}"${replaceCount[letter] > 1 ? ` ${replaceCount[letter]} times` : ""}`
|
|
11319
|
+
);
|
|
11320
|
+
});
|
|
11321
|
+
if (sequenceString.length !== sanitizedVal.length) {
|
|
11322
|
+
warnings.push(
|
|
11323
|
+
`${name ? `Sequence ${name}: ` : ""}Invalid character(s) detected and removed: ${invalidChars.slice(0, 100).join(", ")} `
|
|
11282
11324
|
);
|
|
11283
|
-
} else {
|
|
11284
|
-
return sequenceString;
|
|
11285
11325
|
}
|
|
11326
|
+
if (typeof window !== "undefined" && window.toastr && warnings.length) {
|
|
11327
|
+
warnings.forEach((warning) => {
|
|
11328
|
+
window.toastr.warning(warning);
|
|
11329
|
+
});
|
|
11330
|
+
}
|
|
11331
|
+
return [sanitizedVal, warnings];
|
|
11286
11332
|
}
|
|
11287
11333
|
__name(filterSequenceString, "filterSequenceString");
|
|
11334
|
+
function getAcceptedChars({
|
|
11335
|
+
isOligo,
|
|
11336
|
+
isProtein,
|
|
11337
|
+
isRna,
|
|
11338
|
+
isMixedRnaAndDna
|
|
11339
|
+
} = {}) {
|
|
11340
|
+
return isProtein ? `${extended_protein_letters.toLowerCase()}}` : isOligo ? ambiguous_rna_letters.toLowerCase() + "t" : isRna ? ambiguous_rna_letters.toLowerCase() + "t" : isMixedRnaAndDna ? ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase() : (
|
|
11341
|
+
//just plain old dna
|
|
11342
|
+
ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase()
|
|
11343
|
+
);
|
|
11344
|
+
}
|
|
11345
|
+
__name(getAcceptedChars, "getAcceptedChars");
|
|
11346
|
+
function getReplaceChars({
|
|
11347
|
+
isOligo,
|
|
11348
|
+
isProtein,
|
|
11349
|
+
isRna,
|
|
11350
|
+
isMixedRnaAndDna
|
|
11351
|
+
} = {}) {
|
|
11352
|
+
return isProtein ? {} : isOligo ? {} : isRna ? { t: "u" } : isMixedRnaAndDna ? {} : (
|
|
11353
|
+
//just plain old dna
|
|
11354
|
+
{}
|
|
11355
|
+
);
|
|
11356
|
+
}
|
|
11357
|
+
__name(getReplaceChars, "getReplaceChars");
|
|
11288
11358
|
function tidyUpAnnotation(_annotation, {
|
|
11289
11359
|
sequenceData = {},
|
|
11290
11360
|
convertAnnotationsFromAAIndices,
|
|
@@ -11413,14 +11483,6 @@ function coerceLocation({
|
|
|
11413
11483
|
}
|
|
11414
11484
|
}
|
|
11415
11485
|
__name(coerceLocation, "coerceLocation");
|
|
11416
|
-
function filterAminoAcidSequenceString(sequenceString, options) {
|
|
11417
|
-
options = options || {};
|
|
11418
|
-
if (options.includeStopCodon) {
|
|
11419
|
-
return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu.*]/gi, "");
|
|
11420
|
-
}
|
|
11421
|
-
return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu]/gi, "");
|
|
11422
|
-
}
|
|
11423
|
-
__name(filterAminoAcidSequenceString, "filterAminoAcidSequenceString");
|
|
11424
11486
|
function getDegenerateDnaStringFromAAString(aaString) {
|
|
11425
11487
|
return aaString.split("").map((char) => aminoAcidToDegenerateDnaMap[char.toLowerCase()] || "nnn").join("");
|
|
11426
11488
|
}
|
|
@@ -11429,14 +11491,13 @@ function tidyUpSequenceData(pSeqData, options = {}) {
|
|
|
11429
11491
|
const {
|
|
11430
11492
|
annotationsAsObjects,
|
|
11431
11493
|
logMessages,
|
|
11432
|
-
|
|
11494
|
+
doNotRemoveInvalidChars,
|
|
11433
11495
|
additionalValidChars,
|
|
11434
11496
|
noTranslationData,
|
|
11435
|
-
charOverrides,
|
|
11436
11497
|
doNotProvideIdsForAnnotations,
|
|
11437
|
-
proteinFilterOptions,
|
|
11438
11498
|
noCdsTranslations,
|
|
11439
|
-
convertAnnotationsFromAAIndices
|
|
11499
|
+
convertAnnotationsFromAAIndices,
|
|
11500
|
+
topLevelSeqData
|
|
11440
11501
|
} = options;
|
|
11441
11502
|
let seqData = lodashExports.cloneDeep(pSeqData);
|
|
11442
11503
|
const response = {
|
|
@@ -11464,18 +11525,15 @@ function tidyUpSequenceData(pSeqData, options = {}) {
|
|
|
11464
11525
|
if (seqData.isRna) {
|
|
11465
11526
|
seqData.sequence = seqData.sequence.replace(/t/gi, "u");
|
|
11466
11527
|
}
|
|
11467
|
-
if (
|
|
11528
|
+
if (!doNotRemoveInvalidChars) {
|
|
11468
11529
|
if (seqData.isProtein) {
|
|
11469
|
-
seqData.proteinSequence
|
|
11470
|
-
|
|
11471
|
-
__spreadValues({ includeStopCodon: true }, proteinFilterOptions)
|
|
11472
|
-
);
|
|
11530
|
+
const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadValues({}, topLevelSeqData || seqData));
|
|
11531
|
+
seqData.proteinSequence = newSeq;
|
|
11473
11532
|
} else {
|
|
11474
|
-
|
|
11475
|
-
|
|
11476
|
-
|
|
11477
|
-
|
|
11478
|
-
);
|
|
11533
|
+
const [newSeq] = filterSequenceString(seqData.sequence, __spreadValues({
|
|
11534
|
+
additionalValidChars
|
|
11535
|
+
}, topLevelSeqData || seqData));
|
|
11536
|
+
seqData.sequence = newSeq;
|
|
11479
11537
|
}
|
|
11480
11538
|
}
|
|
11481
11539
|
if (seqData.isProtein) {
|
|
@@ -19360,7 +19418,6 @@ function validateSequence(sequence, options = {}) {
|
|
|
19360
19418
|
response.messages.push("No sequence detected");
|
|
19361
19419
|
sequence.sequence = "";
|
|
19362
19420
|
}
|
|
19363
|
-
let validChars;
|
|
19364
19421
|
if (sequence.isProtein === void 0 && guessIfProtein) {
|
|
19365
19422
|
sequence.isProtein = !guessIfSequenceIsDnaAndNotProtein(
|
|
19366
19423
|
sequence.sequence,
|
|
@@ -19368,12 +19425,14 @@ function validateSequence(sequence, options = {}) {
|
|
|
19368
19425
|
);
|
|
19369
19426
|
}
|
|
19370
19427
|
if (sequence.isProtein) {
|
|
19371
|
-
validChars =
|
|
19428
|
+
const [validChars, warnings] = filterSequenceString(sequence.sequence, {
|
|
19429
|
+
name: sequence.name,
|
|
19430
|
+
isProtein: true,
|
|
19431
|
+
additionalValidChars
|
|
19432
|
+
});
|
|
19372
19433
|
if (validChars !== sequence.sequence) {
|
|
19373
19434
|
sequence.sequence = validChars;
|
|
19374
|
-
response.messages.push(
|
|
19375
|
-
"Import Error: Illegal character(s) detected and removed from amino acid sequence. Allowed characters are: xtgalmfwkqespvicyhrndu"
|
|
19376
|
-
);
|
|
19435
|
+
response.messages.push(...warnings);
|
|
19377
19436
|
}
|
|
19378
19437
|
sequence.type = "PROTEIN";
|
|
19379
19438
|
sequence.isProtein = true;
|
|
@@ -19395,12 +19454,12 @@ function validateSequence(sequence, options = {}) {
|
|
|
19395
19454
|
} else {
|
|
19396
19455
|
sequence.type = "DNA";
|
|
19397
19456
|
}
|
|
19398
|
-
validChars = filterSequenceString(sequence.sequence,
|
|
19457
|
+
const [validChars, warnings] = filterSequenceString(sequence.sequence, __spreadValues({
|
|
19458
|
+
additionalValidChars
|
|
19459
|
+
}, sequence));
|
|
19399
19460
|
if (validChars !== sequence.sequence) {
|
|
19400
19461
|
sequence.sequence = validChars;
|
|
19401
|
-
response.messages.push(
|
|
19402
|
-
"Import Error: Illegal character(s) detected and removed from sequence. Allowed characters are: atgcyrswkmbvdhn"
|
|
19403
|
-
);
|
|
19462
|
+
response.messages.push(...warnings);
|
|
19404
19463
|
}
|
|
19405
19464
|
}
|
|
19406
19465
|
if (!sequence.size) {
|
package/index.umd.js
CHANGED
|
@@ -6172,7 +6172,9 @@ var __async = (__this, __arguments, generator) => {
|
|
|
6172
6172
|
})(lodash, lodash.exports);
|
|
6173
6173
|
var lodashExports = lodash.exports;
|
|
6174
6174
|
const _ = /* @__PURE__ */ getDefaultExportFromCjs(lodashExports);
|
|
6175
|
+
const extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO*";
|
|
6175
6176
|
const ambiguous_dna_letters = "GATCRYWSMKHBVDN";
|
|
6177
|
+
const ambiguous_rna_letters = "GAUCRYWSMKHBVDN";
|
|
6176
6178
|
const aminoAcidToDegenerateDnaMap = {
|
|
6177
6179
|
"-": "---",
|
|
6178
6180
|
".": "...",
|
|
@@ -11275,20 +11277,88 @@ var __async = (__this, __arguments, generator) => {
|
|
|
11275
11277
|
"primers",
|
|
11276
11278
|
"guides"
|
|
11277
11279
|
];
|
|
11278
|
-
function filterSequenceString(sequenceString,
|
|
11279
|
-
|
|
11280
|
-
|
|
11281
|
-
|
|
11282
|
-
|
|
11283
|
-
|
|
11284
|
-
|
|
11285
|
-
|
|
11280
|
+
function filterSequenceString(sequenceString, {
|
|
11281
|
+
additionalValidChars = "",
|
|
11282
|
+
isOligo,
|
|
11283
|
+
name: name2,
|
|
11284
|
+
isProtein,
|
|
11285
|
+
isRna,
|
|
11286
|
+
isMixedRnaAndDna
|
|
11287
|
+
} = {}) {
|
|
11288
|
+
const acceptedChars = getAcceptedChars({
|
|
11289
|
+
isOligo,
|
|
11290
|
+
isProtein,
|
|
11291
|
+
isRna,
|
|
11292
|
+
isMixedRnaAndDna
|
|
11293
|
+
});
|
|
11294
|
+
const replaceChars = getReplaceChars({
|
|
11295
|
+
isOligo,
|
|
11296
|
+
isProtein,
|
|
11297
|
+
isRna,
|
|
11298
|
+
isMixedRnaAndDna
|
|
11299
|
+
});
|
|
11300
|
+
let sanitizedVal = "";
|
|
11301
|
+
const invalidChars = [];
|
|
11302
|
+
const chars = `${acceptedChars}${additionalValidChars.split("").join("\\")}`;
|
|
11303
|
+
const warnings = [];
|
|
11304
|
+
const replaceCount = {};
|
|
11305
|
+
sequenceString.split("").forEach((letter) => {
|
|
11306
|
+
const lowerLetter = letter.toLowerCase();
|
|
11307
|
+
if (replaceChars && replaceChars[lowerLetter]) {
|
|
11308
|
+
if (!replaceCount[lowerLetter]) {
|
|
11309
|
+
replaceCount[lowerLetter] = 0;
|
|
11310
|
+
}
|
|
11311
|
+
replaceCount[lowerLetter]++;
|
|
11312
|
+
const isUpper = lowerLetter !== letter;
|
|
11313
|
+
sanitizedVal += isUpper ? replaceChars[lowerLetter].toUpperCase() : replaceChars[lowerLetter];
|
|
11314
|
+
} else if (chars.includes(lowerLetter)) {
|
|
11315
|
+
sanitizedVal += letter;
|
|
11316
|
+
} else {
|
|
11317
|
+
invalidChars.push(letter);
|
|
11318
|
+
}
|
|
11319
|
+
});
|
|
11320
|
+
Object.keys(replaceCount).forEach((letter) => {
|
|
11321
|
+
warnings.push(
|
|
11322
|
+
`Replaced "${letter}" with "${replaceChars[letter]}"${replaceCount[letter] > 1 ? ` ${replaceCount[letter]} times` : ""}`
|
|
11323
|
+
);
|
|
11324
|
+
});
|
|
11325
|
+
if (sequenceString.length !== sanitizedVal.length) {
|
|
11326
|
+
warnings.push(
|
|
11327
|
+
`${name2 ? `Sequence ${name2}: ` : ""}Invalid character(s) detected and removed: ${invalidChars.slice(0, 100).join(", ")} `
|
|
11286
11328
|
);
|
|
11287
|
-
} else {
|
|
11288
|
-
return sequenceString;
|
|
11289
11329
|
}
|
|
11330
|
+
if (typeof window !== "undefined" && window.toastr && warnings.length) {
|
|
11331
|
+
warnings.forEach((warning) => {
|
|
11332
|
+
window.toastr.warning(warning);
|
|
11333
|
+
});
|
|
11334
|
+
}
|
|
11335
|
+
return [sanitizedVal, warnings];
|
|
11290
11336
|
}
|
|
11291
11337
|
__name(filterSequenceString, "filterSequenceString");
|
|
11338
|
+
function getAcceptedChars({
|
|
11339
|
+
isOligo,
|
|
11340
|
+
isProtein,
|
|
11341
|
+
isRna,
|
|
11342
|
+
isMixedRnaAndDna
|
|
11343
|
+
} = {}) {
|
|
11344
|
+
return isProtein ? `${extended_protein_letters.toLowerCase()}}` : isOligo ? ambiguous_rna_letters.toLowerCase() + "t" : isRna ? ambiguous_rna_letters.toLowerCase() + "t" : isMixedRnaAndDna ? ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase() : (
|
|
11345
|
+
//just plain old dna
|
|
11346
|
+
ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase()
|
|
11347
|
+
);
|
|
11348
|
+
}
|
|
11349
|
+
__name(getAcceptedChars, "getAcceptedChars");
|
|
11350
|
+
function getReplaceChars({
|
|
11351
|
+
isOligo,
|
|
11352
|
+
isProtein,
|
|
11353
|
+
isRna,
|
|
11354
|
+
isMixedRnaAndDna
|
|
11355
|
+
} = {}) {
|
|
11356
|
+
return isProtein ? {} : isOligo ? {} : isRna ? { t: "u" } : isMixedRnaAndDna ? {} : (
|
|
11357
|
+
//just plain old dna
|
|
11358
|
+
{}
|
|
11359
|
+
);
|
|
11360
|
+
}
|
|
11361
|
+
__name(getReplaceChars, "getReplaceChars");
|
|
11292
11362
|
function tidyUpAnnotation(_annotation, {
|
|
11293
11363
|
sequenceData = {},
|
|
11294
11364
|
convertAnnotationsFromAAIndices,
|
|
@@ -11417,14 +11487,6 @@ var __async = (__this, __arguments, generator) => {
|
|
|
11417
11487
|
}
|
|
11418
11488
|
}
|
|
11419
11489
|
__name(coerceLocation, "coerceLocation");
|
|
11420
|
-
function filterAminoAcidSequenceString(sequenceString, options) {
|
|
11421
|
-
options = options || {};
|
|
11422
|
-
if (options.includeStopCodon) {
|
|
11423
|
-
return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu.*]/gi, "");
|
|
11424
|
-
}
|
|
11425
|
-
return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu]/gi, "");
|
|
11426
|
-
}
|
|
11427
|
-
__name(filterAminoAcidSequenceString, "filterAminoAcidSequenceString");
|
|
11428
11490
|
function getDegenerateDnaStringFromAAString(aaString) {
|
|
11429
11491
|
return aaString.split("").map((char) => aminoAcidToDegenerateDnaMap[char.toLowerCase()] || "nnn").join("");
|
|
11430
11492
|
}
|
|
@@ -11433,14 +11495,13 @@ var __async = (__this, __arguments, generator) => {
|
|
|
11433
11495
|
const {
|
|
11434
11496
|
annotationsAsObjects,
|
|
11435
11497
|
logMessages,
|
|
11436
|
-
|
|
11498
|
+
doNotRemoveInvalidChars,
|
|
11437
11499
|
additionalValidChars,
|
|
11438
11500
|
noTranslationData,
|
|
11439
|
-
charOverrides,
|
|
11440
11501
|
doNotProvideIdsForAnnotations,
|
|
11441
|
-
proteinFilterOptions,
|
|
11442
11502
|
noCdsTranslations,
|
|
11443
|
-
convertAnnotationsFromAAIndices
|
|
11503
|
+
convertAnnotationsFromAAIndices,
|
|
11504
|
+
topLevelSeqData
|
|
11444
11505
|
} = options;
|
|
11445
11506
|
let seqData = lodashExports.cloneDeep(pSeqData);
|
|
11446
11507
|
const response = {
|
|
@@ -11468,18 +11529,15 @@ var __async = (__this, __arguments, generator) => {
|
|
|
11468
11529
|
if (seqData.isRna) {
|
|
11469
11530
|
seqData.sequence = seqData.sequence.replace(/t/gi, "u");
|
|
11470
11531
|
}
|
|
11471
|
-
if (
|
|
11532
|
+
if (!doNotRemoveInvalidChars) {
|
|
11472
11533
|
if (seqData.isProtein) {
|
|
11473
|
-
seqData.proteinSequence
|
|
11474
|
-
|
|
11475
|
-
__spreadValues({ includeStopCodon: true }, proteinFilterOptions)
|
|
11476
|
-
);
|
|
11534
|
+
const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadValues({}, topLevelSeqData || seqData));
|
|
11535
|
+
seqData.proteinSequence = newSeq;
|
|
11477
11536
|
} else {
|
|
11478
|
-
|
|
11479
|
-
|
|
11480
|
-
|
|
11481
|
-
|
|
11482
|
-
);
|
|
11537
|
+
const [newSeq] = filterSequenceString(seqData.sequence, __spreadValues({
|
|
11538
|
+
additionalValidChars
|
|
11539
|
+
}, topLevelSeqData || seqData));
|
|
11540
|
+
seqData.sequence = newSeq;
|
|
11483
11541
|
}
|
|
11484
11542
|
}
|
|
11485
11543
|
if (seqData.isProtein) {
|
|
@@ -19364,7 +19422,6 @@ var __async = (__this, __arguments, generator) => {
|
|
|
19364
19422
|
response.messages.push("No sequence detected");
|
|
19365
19423
|
sequence.sequence = "";
|
|
19366
19424
|
}
|
|
19367
|
-
let validChars;
|
|
19368
19425
|
if (sequence.isProtein === void 0 && guessIfProtein) {
|
|
19369
19426
|
sequence.isProtein = !guessIfSequenceIsDnaAndNotProtein(
|
|
19370
19427
|
sequence.sequence,
|
|
@@ -19372,12 +19429,14 @@ var __async = (__this, __arguments, generator) => {
|
|
|
19372
19429
|
);
|
|
19373
19430
|
}
|
|
19374
19431
|
if (sequence.isProtein) {
|
|
19375
|
-
validChars =
|
|
19432
|
+
const [validChars, warnings] = filterSequenceString(sequence.sequence, {
|
|
19433
|
+
name: sequence.name,
|
|
19434
|
+
isProtein: true,
|
|
19435
|
+
additionalValidChars
|
|
19436
|
+
});
|
|
19376
19437
|
if (validChars !== sequence.sequence) {
|
|
19377
19438
|
sequence.sequence = validChars;
|
|
19378
|
-
response.messages.push(
|
|
19379
|
-
"Import Error: Illegal character(s) detected and removed from amino acid sequence. Allowed characters are: xtgalmfwkqespvicyhrndu"
|
|
19380
|
-
);
|
|
19439
|
+
response.messages.push(...warnings);
|
|
19381
19440
|
}
|
|
19382
19441
|
sequence.type = "PROTEIN";
|
|
19383
19442
|
sequence.isProtein = true;
|
|
@@ -19399,12 +19458,12 @@ var __async = (__this, __arguments, generator) => {
|
|
|
19399
19458
|
} else {
|
|
19400
19459
|
sequence.type = "DNA";
|
|
19401
19460
|
}
|
|
19402
|
-
validChars = filterSequenceString(sequence.sequence,
|
|
19461
|
+
const [validChars, warnings] = filterSequenceString(sequence.sequence, __spreadValues({
|
|
19462
|
+
additionalValidChars
|
|
19463
|
+
}, sequence));
|
|
19403
19464
|
if (validChars !== sequence.sequence) {
|
|
19404
19465
|
sequence.sequence = validChars;
|
|
19405
|
-
response.messages.push(
|
|
19406
|
-
"Import Error: Illegal character(s) detected and removed from sequence. Allowed characters are: atgcyrswkmbvdhn"
|
|
19407
|
-
);
|
|
19466
|
+
response.messages.push(...warnings);
|
|
19408
19467
|
}
|
|
19409
19468
|
}
|
|
19410
19469
|
if (!sequence.size) {
|
package/package.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@teselagen/bio-parsers",
|
|
3
|
-
"version": "0.3.10",
|
|
3
|
+
"version": "0.4.2",
|
|
4
4
|
"dependencies": {
|
|
5
|
-
"@teselagen/sequence-utils": "0.3.
|
|
5
|
+
"@teselagen/sequence-utils": "0.3.10",
|
|
6
6
|
"@teselagen/range-utils": "0.3.7",
|
|
7
7
|
"@gmod/gff": "^1.2.1",
|
|
8
8
|
"buffer": "^6.0.3",
|
package/src/utils/validateSequence.js
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import areNonNegativeIntegers from "validate.io-nonnegative-integer-array";
|
|
2
2
|
import { getFeatureTypes } from "@teselagen/sequence-utils";
|
|
3
3
|
import {
|
|
4
|
-
filterAminoAcidSequenceString,
|
|
5
4
|
filterSequenceString,
|
|
6
5
|
guessIfSequenceIsDnaAndNotProtein
|
|
7
6
|
} from "@teselagen/sequence-utils";
|
|
@@ -30,7 +29,7 @@ export default function validateSequence(sequence, options = {}) {
|
|
|
30
29
|
inclusive1BasedEnd,
|
|
31
30
|
additionalValidChars,
|
|
32
31
|
allowOverflowAnnotations,
|
|
33
|
-
coerceFeatureTypes
|
|
32
|
+
coerceFeatureTypes,
|
|
34
33
|
} = options;
|
|
35
34
|
[
|
|
36
35
|
"isDNA",
|
|
@@ -84,7 +83,7 @@ export default function validateSequence(sequence, options = {}) {
|
|
|
84
83
|
response.messages.push("No sequence detected");
|
|
85
84
|
sequence.sequence = "";
|
|
86
85
|
}
|
|
87
|
-
|
|
86
|
+
|
|
88
87
|
if (sequence.isProtein === undefined && guessIfProtein) {
|
|
89
88
|
sequence.isProtein = !guessIfSequenceIsDnaAndNotProtein(
|
|
90
89
|
sequence.sequence,
|
|
@@ -93,12 +92,14 @@ export default function validateSequence(sequence, options = {}) {
|
|
|
93
92
|
}
|
|
94
93
|
if (sequence.isProtein) {
|
|
95
94
|
//tnr: add code to strip invalid protein data..
|
|
96
|
-
validChars =
|
|
95
|
+
const [validChars, warnings] = filterSequenceString(sequence.sequence, {
|
|
96
|
+
name: sequence.name,
|
|
97
|
+
isProtein: true,
|
|
98
|
+
additionalValidChars,
|
|
99
|
+
});
|
|
97
100
|
if (validChars !== sequence.sequence) {
|
|
98
101
|
sequence.sequence = validChars;
|
|
99
|
-
response.messages.push(
|
|
100
|
-
"Import Error: Illegal character(s) detected and removed from amino acid sequence. Allowed characters are: xtgalmfwkqespvicyhrndu"
|
|
101
|
-
);
|
|
102
|
+
response.messages.push(...warnings);
|
|
102
103
|
}
|
|
103
104
|
sequence.type = "PROTEIN";
|
|
104
105
|
sequence.isProtein = true;
|
|
@@ -126,12 +127,13 @@ export default function validateSequence(sequence, options = {}) {
|
|
|
126
127
|
sequence.type = "DNA";
|
|
127
128
|
}
|
|
128
129
|
|
|
129
|
-
validChars = filterSequenceString(sequence.sequence,
|
|
130
|
+
const [validChars, warnings] = filterSequenceString(sequence.sequence, {
|
|
131
|
+
additionalValidChars,
|
|
132
|
+
...sequence
|
|
133
|
+
});
|
|
130
134
|
if (validChars !== sequence.sequence) {
|
|
131
135
|
sequence.sequence = validChars;
|
|
132
|
-
response.messages.push(
|
|
133
|
-
"Import Error: Illegal character(s) detected and removed from sequence. Allowed characters are: atgcyrswkmbvdhn"
|
|
134
|
-
);
|
|
136
|
+
response.messages.push(...warnings);
|
|
135
137
|
}
|
|
136
138
|
}
|
|
137
139
|
|