@teselagen/bio-parsers 0.3.10 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +106 -40
- package/index.mjs +106 -40
- package/index.umd.js +106 -40
- package/package.json +2 -2
- package/src/utils/validateSequence.js +15 -11
package/index.js
CHANGED
|
@@ -6170,7 +6170,9 @@ lodash.exports;
|
|
|
6170
6170
|
})(lodash, lodash.exports);
|
|
6171
6171
|
var lodashExports = lodash.exports;
|
|
6172
6172
|
const _ = /* @__PURE__ */ getDefaultExportFromCjs(lodashExports);
|
|
6173
|
+
const protein_letters_withUandX = "ACDEFGHIKLMNPQRSTVWYUX";
|
|
6173
6174
|
const ambiguous_dna_letters = "GATCRYWSMKHBVDN";
|
|
6175
|
+
const ambiguous_rna_letters = "GAUCRYWSMKHBVDN";
|
|
6174
6176
|
const aminoAcidToDegenerateDnaMap = {
|
|
6175
6177
|
"-": "---",
|
|
6176
6178
|
".": "...",
|
|
@@ -11273,20 +11275,91 @@ const annotationTypes = [
|
|
|
11273
11275
|
"primers",
|
|
11274
11276
|
"guides"
|
|
11275
11277
|
];
|
|
11276
|
-
function filterSequenceString(sequenceString,
|
|
11277
|
-
|
|
11278
|
-
|
|
11279
|
-
|
|
11280
|
-
|
|
11281
|
-
|
|
11282
|
-
|
|
11283
|
-
|
|
11278
|
+
/**
 * Sanitizes a sequence string: characters outside the accepted alphabet are
 * dropped and the per-type replacements returned by getReplaceChars are applied.
 *
 * The type flags (isOligo, isProtein, isRna, isMixedRnaAndDna, includeStopCodon)
 * are forwarded unchanged to getAcceptedChars / getReplaceChars.
 *
 * @param {string} sequenceString - Raw sequence. null/undefined is treated as "".
 * @param {Object} [options]
 * @param {string} [options.additionalValidChars=""] - Extra characters to keep (matched case-insensitively).
 * @param {boolean} [options.isOligo]
 * @param {string} [options.name] - When set, prefixes the invalid-character warning.
 * @param {boolean} [options.isProtein]
 * @param {boolean} [options.isRna]
 * @param {boolean} [options.isMixedRnaAndDna]
 * @param {boolean} [options.includeStopCodon]
 * @returns {[string, string[]]} Tuple of the sanitized sequence and the warnings produced.
 */
function filterSequenceString(sequenceString, {
  additionalValidChars = "",
  isOligo,
  name,
  isProtein,
  isRna,
  isMixedRnaAndDna,
  includeStopCodon
} = {}) {
  // A missing sequence is filtered as an empty one instead of throwing.
  const input = sequenceString == null ? "" : sequenceString;
  const acceptedChars = getAcceptedChars({
    isOligo,
    isProtein,
    isRna,
    isMixedRnaAndDna,
    includeStopCodon
  });
  const replaceChars = getReplaceChars({
    isOligo,
    isProtein,
    isRna,
    isMixedRnaAndDna
  });
  // Every letter is lower-cased before the lookup, so the extra characters are
  // lower-cased too. They are appended verbatim: joining them with "\\" (a
  // leftover from regex escaping) would make a backslash itself acceptable.
  const chars = `${acceptedChars}${(additionalValidChars || "").toLowerCase()}`;
  const warnings = [];
  const invalidChars = [];
  const replaceCount = {};
  let sanitizedVal = "";
  for (const letter of input) {
    const lowerLetter = letter.toLowerCase();
    const replacement = replaceChars && replaceChars[lowerLetter];
    if (replacement) {
      replaceCount[lowerLetter] = (replaceCount[lowerLetter] || 0) + 1;
      // Keep the case of the letter that is being replaced.
      sanitizedVal += lowerLetter !== letter ? replacement.toUpperCase() : replacement;
    } else if (chars.includes(lowerLetter)) {
      sanitizedVal += letter;
    } else {
      invalidChars.push(letter);
    }
  }
  Object.keys(replaceCount).forEach((letter) => {
    const count = replaceCount[letter];
    warnings.push(
      `Replaced "${letter}" with "${replaceChars[letter]}"${count > 1 ? ` ${count} times` : ""}`
    );
  });
  if (invalidChars.length > 0) {
    // Only the first 100 offending characters are listed to keep the message bounded.
    warnings.push(
      `${name ? `Sequence ${name}: ` : ""}Invalid character(s) detected and removed: ${invalidChars.slice(0, 100).join(", ")} `
    );
  }
  // When a global window.toastr exists, every warning is also shown through it.
  if (typeof window !== "undefined" && window.toastr && warnings.length) {
    warnings.forEach((warning) => {
      window.toastr.warning(warning);
    });
  }
  return [sanitizedVal, warnings];
}
|
|
11289
11337
|
__name(filterSequenceString, "filterSequenceString");
|
|
11338
|
+
/**
 * Returns the lower-case set of characters accepted for a sequence type, as a string.
 *
 * Precedence mirrors the option order: isProtein wins, then isOligo / isRna;
 * anything else (including isMixedRnaAndDna and plain DNA) gets the combined
 * RNA + DNA alphabet.
 *
 * @param {Object} [options]
 * @param {boolean} [options.isOligo]
 * @param {boolean} [options.isProtein]
 * @param {boolean} [options.isRna]
 * @param {boolean} [options.isMixedRnaAndDna] - Accepted for symmetry; yields the same alphabet as plain DNA.
 * @param {boolean} [options.includeStopCodon] - Protein only: also accept "*" and ".".
 * @returns {string} Accepted characters, all lower-case.
 */
function getAcceptedChars({
  isOligo,
  isProtein,
  isRna,
  isMixedRnaAndDna,
  includeStopCodon
} = {}) {
  if (isProtein) {
    // No trailing "}" here: a stray brace in the template would make "}" a valid residue.
    return `${protein_letters_withUandX.toLowerCase()}${includeStopCodon ? "*." : ""}`;
  }
  const rnaLetters = ambiguous_rna_letters.toLowerCase();
  if (isOligo || isRna) {
    return `${rnaLetters}t`;
  }
  // Mixed RNA/DNA and just plain old DNA share the combined alphabet.
  return `${rnaLetters}${ambiguous_dna_letters.toLowerCase()}`;
}
|
|
11350
|
+
__name(getAcceptedChars, "getAcceptedChars");
|
|
11351
|
+
/**
 * Returns the lower-case character substitutions to apply for a sequence type.
 *
 * Only RNA sequences that are neither protein nor oligo get a substitution
 * ("t" -> "u"); every other combination yields an empty map.
 *
 * @param {Object} [options]
 * @param {boolean} [options.isOligo]
 * @param {boolean} [options.isProtein]
 * @param {boolean} [options.isRna]
 * @param {boolean} [options.isMixedRnaAndDna] - Accepted for symmetry; never adds a substitution.
 * @returns {Object<string, string>} Map from lower-case input letter to its replacement.
 */
function getReplaceChars({
  isOligo,
  isProtein,
  isRna,
  isMixedRnaAndDna
} = {}) {
  // Protein and oligo take precedence over the RNA flag.
  if (isProtein || isOligo) {
    return {};
  }
  if (isRna) {
    return { t: "u" };
  }
  // Mixed RNA/DNA and just plain old DNA: nothing to replace.
  return {};
}
|
|
11362
|
+
__name(getReplaceChars, "getReplaceChars");
|
|
11290
11363
|
function tidyUpAnnotation(_annotation, {
|
|
11291
11364
|
sequenceData = {},
|
|
11292
11365
|
convertAnnotationsFromAAIndices,
|
|
@@ -11415,14 +11488,6 @@ function coerceLocation({
|
|
|
11415
11488
|
}
|
|
11416
11489
|
}
|
|
11417
11490
|
__name(coerceLocation, "coerceLocation");
|
|
11418
|
-
function filterAminoAcidSequenceString(sequenceString, options) {
|
|
11419
|
-
options = options || {};
|
|
11420
|
-
if (options.includeStopCodon) {
|
|
11421
|
-
return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu.*]/gi, "");
|
|
11422
|
-
}
|
|
11423
|
-
return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu]/gi, "");
|
|
11424
|
-
}
|
|
11425
|
-
__name(filterAminoAcidSequenceString, "filterAminoAcidSequenceString");
|
|
11426
11491
|
function getDegenerateDnaStringFromAAString(aaString) {
|
|
11427
11492
|
return aaString.split("").map((char) => aminoAcidToDegenerateDnaMap[char.toLowerCase()] || "nnn").join("");
|
|
11428
11493
|
}
|
|
@@ -11434,11 +11499,10 @@ function tidyUpSequenceData(pSeqData, options = {}) {
|
|
|
11434
11499
|
removeUnwantedChars,
|
|
11435
11500
|
additionalValidChars,
|
|
11436
11501
|
noTranslationData,
|
|
11437
|
-
charOverrides,
|
|
11438
11502
|
doNotProvideIdsForAnnotations,
|
|
11439
|
-
proteinFilterOptions,
|
|
11440
11503
|
noCdsTranslations,
|
|
11441
|
-
convertAnnotationsFromAAIndices
|
|
11504
|
+
convertAnnotationsFromAAIndices,
|
|
11505
|
+
topLevelSeqData
|
|
11442
11506
|
} = options;
|
|
11443
11507
|
let seqData = lodashExports.cloneDeep(pSeqData);
|
|
11444
11508
|
const response = {
|
|
@@ -11468,16 +11532,15 @@ function tidyUpSequenceData(pSeqData, options = {}) {
|
|
|
11468
11532
|
}
|
|
11469
11533
|
if (removeUnwantedChars) {
|
|
11470
11534
|
if (seqData.isProtein) {
|
|
11471
|
-
seqData.proteinSequence
|
|
11472
|
-
|
|
11473
|
-
|
|
11474
|
-
|
|
11535
|
+
const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadValues({
|
|
11536
|
+
includeStopCodon: true
|
|
11537
|
+
}, topLevelSeqData || seqData));
|
|
11538
|
+
seqData.proteinSequence = newSeq;
|
|
11475
11539
|
} else {
|
|
11476
|
-
|
|
11477
|
-
|
|
11478
|
-
|
|
11479
|
-
|
|
11480
|
-
);
|
|
11540
|
+
const [newSeq] = filterSequenceString(seqData.sequence, __spreadValues({
|
|
11541
|
+
additionalValidChars
|
|
11542
|
+
}, topLevelSeqData || seqData));
|
|
11543
|
+
seqData.sequence = newSeq;
|
|
11481
11544
|
}
|
|
11482
11545
|
}
|
|
11483
11546
|
if (seqData.isProtein) {
|
|
@@ -19312,7 +19375,8 @@ function validateSequence(sequence, options = {}) {
|
|
|
19312
19375
|
inclusive1BasedEnd,
|
|
19313
19376
|
additionalValidChars,
|
|
19314
19377
|
allowOverflowAnnotations,
|
|
19315
|
-
coerceFeatureTypes
|
|
19378
|
+
coerceFeatureTypes,
|
|
19379
|
+
includeStopCodon
|
|
19316
19380
|
} = options;
|
|
19317
19381
|
[
|
|
19318
19382
|
"isDNA",
|
|
@@ -19362,7 +19426,6 @@ function validateSequence(sequence, options = {}) {
|
|
|
19362
19426
|
response.messages.push("No sequence detected");
|
|
19363
19427
|
sequence.sequence = "";
|
|
19364
19428
|
}
|
|
19365
|
-
let validChars;
|
|
19366
19429
|
if (sequence.isProtein === void 0 && guessIfProtein) {
|
|
19367
19430
|
sequence.isProtein = !guessIfSequenceIsDnaAndNotProtein(
|
|
19368
19431
|
sequence.sequence,
|
|
@@ -19370,12 +19433,15 @@ function validateSequence(sequence, options = {}) {
|
|
|
19370
19433
|
);
|
|
19371
19434
|
}
|
|
19372
19435
|
if (sequence.isProtein) {
|
|
19373
|
-
validChars =
|
|
19436
|
+
const [validChars, warnings] = filterSequenceString(sequence.sequence, {
|
|
19437
|
+
name: sequence.name,
|
|
19438
|
+
isProtein: true,
|
|
19439
|
+
additionalValidChars,
|
|
19440
|
+
includeStopCodon
|
|
19441
|
+
});
|
|
19374
19442
|
if (validChars !== sequence.sequence) {
|
|
19375
19443
|
sequence.sequence = validChars;
|
|
19376
|
-
response.messages.push(
|
|
19377
|
-
"Import Error: Illegal character(s) detected and removed from amino acid sequence. Allowed characters are: xtgalmfwkqespvicyhrndu"
|
|
19378
|
-
);
|
|
19444
|
+
response.messages.push(...warnings);
|
|
19379
19445
|
}
|
|
19380
19446
|
sequence.type = "PROTEIN";
|
|
19381
19447
|
sequence.isProtein = true;
|
|
@@ -19397,12 +19463,12 @@ function validateSequence(sequence, options = {}) {
|
|
|
19397
19463
|
} else {
|
|
19398
19464
|
sequence.type = "DNA";
|
|
19399
19465
|
}
|
|
19400
|
-
validChars = filterSequenceString(sequence.sequence,
|
|
19466
|
+
const [validChars, warnings] = filterSequenceString(sequence.sequence, __spreadValues({
|
|
19467
|
+
additionalValidChars
|
|
19468
|
+
}, sequence));
|
|
19401
19469
|
if (validChars !== sequence.sequence) {
|
|
19402
19470
|
sequence.sequence = validChars;
|
|
19403
|
-
response.messages.push(
|
|
19404
|
-
"Import Error: Illegal character(s) detected and removed from sequence. Allowed characters are: atgcyrswkmbvdhn"
|
|
19405
|
-
);
|
|
19471
|
+
response.messages.push(...warnings);
|
|
19406
19472
|
}
|
|
19407
19473
|
}
|
|
19408
19474
|
if (!sequence.size) {
|
package/index.mjs
CHANGED
|
@@ -6168,7 +6168,9 @@ lodash.exports;
|
|
|
6168
6168
|
})(lodash, lodash.exports);
|
|
6169
6169
|
var lodashExports = lodash.exports;
|
|
6170
6170
|
const _ = /* @__PURE__ */ getDefaultExportFromCjs(lodashExports);
|
|
6171
|
+
const protein_letters_withUandX = "ACDEFGHIKLMNPQRSTVWYUX";
|
|
6171
6172
|
const ambiguous_dna_letters = "GATCRYWSMKHBVDN";
|
|
6173
|
+
const ambiguous_rna_letters = "GAUCRYWSMKHBVDN";
|
|
6172
6174
|
const aminoAcidToDegenerateDnaMap = {
|
|
6173
6175
|
"-": "---",
|
|
6174
6176
|
".": "...",
|
|
@@ -11271,20 +11273,91 @@ const annotationTypes = [
|
|
|
11271
11273
|
"primers",
|
|
11272
11274
|
"guides"
|
|
11273
11275
|
];
|
|
11274
|
-
function filterSequenceString(sequenceString,
|
|
11275
|
-
|
|
11276
|
-
|
|
11277
|
-
|
|
11278
|
-
|
|
11279
|
-
|
|
11280
|
-
|
|
11281
|
-
|
|
11276
|
+
/**
 * Sanitizes a sequence string: characters outside the accepted alphabet are
 * dropped and the per-type replacements returned by getReplaceChars are applied.
 *
 * The type flags (isOligo, isProtein, isRna, isMixedRnaAndDna, includeStopCodon)
 * are forwarded unchanged to getAcceptedChars / getReplaceChars.
 *
 * @param {string} sequenceString - Raw sequence. null/undefined is treated as "".
 * @param {Object} [options]
 * @param {string} [options.additionalValidChars=""] - Extra characters to keep (matched case-insensitively).
 * @param {boolean} [options.isOligo]
 * @param {string} [options.name] - When set, prefixes the invalid-character warning.
 * @param {boolean} [options.isProtein]
 * @param {boolean} [options.isRna]
 * @param {boolean} [options.isMixedRnaAndDna]
 * @param {boolean} [options.includeStopCodon]
 * @returns {[string, string[]]} Tuple of the sanitized sequence and the warnings produced.
 */
function filterSequenceString(sequenceString, {
  additionalValidChars = "",
  isOligo,
  name,
  isProtein,
  isRna,
  isMixedRnaAndDna,
  includeStopCodon
} = {}) {
  // A missing sequence is filtered as an empty one instead of throwing.
  const input = sequenceString == null ? "" : sequenceString;
  const acceptedChars = getAcceptedChars({
    isOligo,
    isProtein,
    isRna,
    isMixedRnaAndDna,
    includeStopCodon
  });
  const replaceChars = getReplaceChars({
    isOligo,
    isProtein,
    isRna,
    isMixedRnaAndDna
  });
  // Every letter is lower-cased before the lookup, so the extra characters are
  // lower-cased too. They are appended verbatim: joining them with "\\" (a
  // leftover from regex escaping) would make a backslash itself acceptable.
  const chars = `${acceptedChars}${(additionalValidChars || "").toLowerCase()}`;
  const warnings = [];
  const invalidChars = [];
  const replaceCount = {};
  let sanitizedVal = "";
  for (const letter of input) {
    const lowerLetter = letter.toLowerCase();
    const replacement = replaceChars && replaceChars[lowerLetter];
    if (replacement) {
      replaceCount[lowerLetter] = (replaceCount[lowerLetter] || 0) + 1;
      // Keep the case of the letter that is being replaced.
      sanitizedVal += lowerLetter !== letter ? replacement.toUpperCase() : replacement;
    } else if (chars.includes(lowerLetter)) {
      sanitizedVal += letter;
    } else {
      invalidChars.push(letter);
    }
  }
  Object.keys(replaceCount).forEach((letter) => {
    const count = replaceCount[letter];
    warnings.push(
      `Replaced "${letter}" with "${replaceChars[letter]}"${count > 1 ? ` ${count} times` : ""}`
    );
  });
  if (invalidChars.length > 0) {
    // Only the first 100 offending characters are listed to keep the message bounded.
    warnings.push(
      `${name ? `Sequence ${name}: ` : ""}Invalid character(s) detected and removed: ${invalidChars.slice(0, 100).join(", ")} `
    );
  }
  // When a global window.toastr exists, every warning is also shown through it.
  if (typeof window !== "undefined" && window.toastr && warnings.length) {
    warnings.forEach((warning) => {
      window.toastr.warning(warning);
    });
  }
  return [sanitizedVal, warnings];
}
|
|
11287
11335
|
__name(filterSequenceString, "filterSequenceString");
|
|
11336
|
+
/**
 * Returns the lower-case set of characters accepted for a sequence type, as a string.
 *
 * Precedence mirrors the option order: isProtein wins, then isOligo / isRna;
 * anything else (including isMixedRnaAndDna and plain DNA) gets the combined
 * RNA + DNA alphabet.
 *
 * @param {Object} [options]
 * @param {boolean} [options.isOligo]
 * @param {boolean} [options.isProtein]
 * @param {boolean} [options.isRna]
 * @param {boolean} [options.isMixedRnaAndDna] - Accepted for symmetry; yields the same alphabet as plain DNA.
 * @param {boolean} [options.includeStopCodon] - Protein only: also accept "*" and ".".
 * @returns {string} Accepted characters, all lower-case.
 */
function getAcceptedChars({
  isOligo,
  isProtein,
  isRna,
  isMixedRnaAndDna,
  includeStopCodon
} = {}) {
  if (isProtein) {
    // No trailing "}" here: a stray brace in the template would make "}" a valid residue.
    return `${protein_letters_withUandX.toLowerCase()}${includeStopCodon ? "*." : ""}`;
  }
  const rnaLetters = ambiguous_rna_letters.toLowerCase();
  if (isOligo || isRna) {
    return `${rnaLetters}t`;
  }
  // Mixed RNA/DNA and just plain old DNA share the combined alphabet.
  return `${rnaLetters}${ambiguous_dna_letters.toLowerCase()}`;
}
|
|
11348
|
+
__name(getAcceptedChars, "getAcceptedChars");
|
|
11349
|
+
/**
 * Returns the lower-case character substitutions to apply for a sequence type.
 *
 * Only RNA sequences that are neither protein nor oligo get a substitution
 * ("t" -> "u"); every other combination yields an empty map.
 *
 * @param {Object} [options]
 * @param {boolean} [options.isOligo]
 * @param {boolean} [options.isProtein]
 * @param {boolean} [options.isRna]
 * @param {boolean} [options.isMixedRnaAndDna] - Accepted for symmetry; never adds a substitution.
 * @returns {Object<string, string>} Map from lower-case input letter to its replacement.
 */
function getReplaceChars({
  isOligo,
  isProtein,
  isRna,
  isMixedRnaAndDna
} = {}) {
  // Protein and oligo take precedence over the RNA flag.
  if (isProtein || isOligo) {
    return {};
  }
  if (isRna) {
    return { t: "u" };
  }
  // Mixed RNA/DNA and just plain old DNA: nothing to replace.
  return {};
}
|
|
11360
|
+
__name(getReplaceChars, "getReplaceChars");
|
|
11288
11361
|
function tidyUpAnnotation(_annotation, {
|
|
11289
11362
|
sequenceData = {},
|
|
11290
11363
|
convertAnnotationsFromAAIndices,
|
|
@@ -11413,14 +11486,6 @@ function coerceLocation({
|
|
|
11413
11486
|
}
|
|
11414
11487
|
}
|
|
11415
11488
|
__name(coerceLocation, "coerceLocation");
|
|
11416
|
-
function filterAminoAcidSequenceString(sequenceString, options) {
|
|
11417
|
-
options = options || {};
|
|
11418
|
-
if (options.includeStopCodon) {
|
|
11419
|
-
return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu.*]/gi, "");
|
|
11420
|
-
}
|
|
11421
|
-
return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu]/gi, "");
|
|
11422
|
-
}
|
|
11423
|
-
__name(filterAminoAcidSequenceString, "filterAminoAcidSequenceString");
|
|
11424
11489
|
function getDegenerateDnaStringFromAAString(aaString) {
|
|
11425
11490
|
return aaString.split("").map((char) => aminoAcidToDegenerateDnaMap[char.toLowerCase()] || "nnn").join("");
|
|
11426
11491
|
}
|
|
@@ -11432,11 +11497,10 @@ function tidyUpSequenceData(pSeqData, options = {}) {
|
|
|
11432
11497
|
removeUnwantedChars,
|
|
11433
11498
|
additionalValidChars,
|
|
11434
11499
|
noTranslationData,
|
|
11435
|
-
charOverrides,
|
|
11436
11500
|
doNotProvideIdsForAnnotations,
|
|
11437
|
-
proteinFilterOptions,
|
|
11438
11501
|
noCdsTranslations,
|
|
11439
|
-
convertAnnotationsFromAAIndices
|
|
11502
|
+
convertAnnotationsFromAAIndices,
|
|
11503
|
+
topLevelSeqData
|
|
11440
11504
|
} = options;
|
|
11441
11505
|
let seqData = lodashExports.cloneDeep(pSeqData);
|
|
11442
11506
|
const response = {
|
|
@@ -11466,16 +11530,15 @@ function tidyUpSequenceData(pSeqData, options = {}) {
|
|
|
11466
11530
|
}
|
|
11467
11531
|
if (removeUnwantedChars) {
|
|
11468
11532
|
if (seqData.isProtein) {
|
|
11469
|
-
seqData.proteinSequence
|
|
11470
|
-
|
|
11471
|
-
|
|
11472
|
-
|
|
11533
|
+
const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadValues({
|
|
11534
|
+
includeStopCodon: true
|
|
11535
|
+
}, topLevelSeqData || seqData));
|
|
11536
|
+
seqData.proteinSequence = newSeq;
|
|
11473
11537
|
} else {
|
|
11474
|
-
|
|
11475
|
-
|
|
11476
|
-
|
|
11477
|
-
|
|
11478
|
-
);
|
|
11538
|
+
const [newSeq] = filterSequenceString(seqData.sequence, __spreadValues({
|
|
11539
|
+
additionalValidChars
|
|
11540
|
+
}, topLevelSeqData || seqData));
|
|
11541
|
+
seqData.sequence = newSeq;
|
|
11479
11542
|
}
|
|
11480
11543
|
}
|
|
11481
11544
|
if (seqData.isProtein) {
|
|
@@ -19310,7 +19373,8 @@ function validateSequence(sequence, options = {}) {
|
|
|
19310
19373
|
inclusive1BasedEnd,
|
|
19311
19374
|
additionalValidChars,
|
|
19312
19375
|
allowOverflowAnnotations,
|
|
19313
|
-
coerceFeatureTypes
|
|
19376
|
+
coerceFeatureTypes,
|
|
19377
|
+
includeStopCodon
|
|
19314
19378
|
} = options;
|
|
19315
19379
|
[
|
|
19316
19380
|
"isDNA",
|
|
@@ -19360,7 +19424,6 @@ function validateSequence(sequence, options = {}) {
|
|
|
19360
19424
|
response.messages.push("No sequence detected");
|
|
19361
19425
|
sequence.sequence = "";
|
|
19362
19426
|
}
|
|
19363
|
-
let validChars;
|
|
19364
19427
|
if (sequence.isProtein === void 0 && guessIfProtein) {
|
|
19365
19428
|
sequence.isProtein = !guessIfSequenceIsDnaAndNotProtein(
|
|
19366
19429
|
sequence.sequence,
|
|
@@ -19368,12 +19431,15 @@ function validateSequence(sequence, options = {}) {
|
|
|
19368
19431
|
);
|
|
19369
19432
|
}
|
|
19370
19433
|
if (sequence.isProtein) {
|
|
19371
|
-
validChars =
|
|
19434
|
+
const [validChars, warnings] = filterSequenceString(sequence.sequence, {
|
|
19435
|
+
name: sequence.name,
|
|
19436
|
+
isProtein: true,
|
|
19437
|
+
additionalValidChars,
|
|
19438
|
+
includeStopCodon
|
|
19439
|
+
});
|
|
19372
19440
|
if (validChars !== sequence.sequence) {
|
|
19373
19441
|
sequence.sequence = validChars;
|
|
19374
|
-
response.messages.push(
|
|
19375
|
-
"Import Error: Illegal character(s) detected and removed from amino acid sequence. Allowed characters are: xtgalmfwkqespvicyhrndu"
|
|
19376
|
-
);
|
|
19442
|
+
response.messages.push(...warnings);
|
|
19377
19443
|
}
|
|
19378
19444
|
sequence.type = "PROTEIN";
|
|
19379
19445
|
sequence.isProtein = true;
|
|
@@ -19395,12 +19461,12 @@ function validateSequence(sequence, options = {}) {
|
|
|
19395
19461
|
} else {
|
|
19396
19462
|
sequence.type = "DNA";
|
|
19397
19463
|
}
|
|
19398
|
-
validChars = filterSequenceString(sequence.sequence,
|
|
19464
|
+
const [validChars, warnings] = filterSequenceString(sequence.sequence, __spreadValues({
|
|
19465
|
+
additionalValidChars
|
|
19466
|
+
}, sequence));
|
|
19399
19467
|
if (validChars !== sequence.sequence) {
|
|
19400
19468
|
sequence.sequence = validChars;
|
|
19401
|
-
response.messages.push(
|
|
19402
|
-
"Import Error: Illegal character(s) detected and removed from sequence. Allowed characters are: atgcyrswkmbvdhn"
|
|
19403
|
-
);
|
|
19469
|
+
response.messages.push(...warnings);
|
|
19404
19470
|
}
|
|
19405
19471
|
}
|
|
19406
19472
|
if (!sequence.size) {
|
package/index.umd.js
CHANGED
|
@@ -6172,7 +6172,9 @@ var __async = (__this, __arguments, generator) => {
|
|
|
6172
6172
|
})(lodash, lodash.exports);
|
|
6173
6173
|
var lodashExports = lodash.exports;
|
|
6174
6174
|
const _ = /* @__PURE__ */ getDefaultExportFromCjs(lodashExports);
|
|
6175
|
+
const protein_letters_withUandX = "ACDEFGHIKLMNPQRSTVWYUX";
|
|
6175
6176
|
const ambiguous_dna_letters = "GATCRYWSMKHBVDN";
|
|
6177
|
+
const ambiguous_rna_letters = "GAUCRYWSMKHBVDN";
|
|
6176
6178
|
const aminoAcidToDegenerateDnaMap = {
|
|
6177
6179
|
"-": "---",
|
|
6178
6180
|
".": "...",
|
|
@@ -11275,20 +11277,91 @@ var __async = (__this, __arguments, generator) => {
|
|
|
11275
11277
|
"primers",
|
|
11276
11278
|
"guides"
|
|
11277
11279
|
];
|
|
11278
|
-
function filterSequenceString(sequenceString,
|
|
11279
|
-
|
|
11280
|
-
|
|
11281
|
-
|
|
11282
|
-
|
|
11283
|
-
|
|
11284
|
-
|
|
11285
|
-
|
|
11280
|
+
/**
 * Sanitizes a sequence string: characters outside the accepted alphabet are
 * dropped and the per-type replacements returned by getReplaceChars are applied.
 *
 * The type flags (isOligo, isProtein, isRna, isMixedRnaAndDna, includeStopCodon)
 * are forwarded unchanged to getAcceptedChars / getReplaceChars.
 *
 * @param {string} sequenceString - Raw sequence. null/undefined is treated as "".
 * @param {Object} [options]
 * @param {string} [options.additionalValidChars=""] - Extra characters to keep (matched case-insensitively).
 * @param {boolean} [options.isOligo]
 * @param {string} [options.name] - When set, prefixes the invalid-character warning (bound locally as name2 in this bundle).
 * @param {boolean} [options.isProtein]
 * @param {boolean} [options.isRna]
 * @param {boolean} [options.isMixedRnaAndDna]
 * @param {boolean} [options.includeStopCodon]
 * @returns {[string, string[]]} Tuple of the sanitized sequence and the warnings produced.
 */
function filterSequenceString(sequenceString, {
  additionalValidChars = "",
  isOligo,
  name: name2,
  isProtein,
  isRna,
  isMixedRnaAndDna,
  includeStopCodon
} = {}) {
  // A missing sequence is filtered as an empty one instead of throwing.
  const input = sequenceString == null ? "" : sequenceString;
  const acceptedChars = getAcceptedChars({
    isOligo,
    isProtein,
    isRna,
    isMixedRnaAndDna,
    includeStopCodon
  });
  const replaceChars = getReplaceChars({
    isOligo,
    isProtein,
    isRna,
    isMixedRnaAndDna
  });
  // Every letter is lower-cased before the lookup, so the extra characters are
  // lower-cased too. They are appended verbatim: joining them with "\\" (a
  // leftover from regex escaping) would make a backslash itself acceptable.
  const chars = `${acceptedChars}${(additionalValidChars || "").toLowerCase()}`;
  const warnings = [];
  const invalidChars = [];
  const replaceCount = {};
  let sanitizedVal = "";
  for (const letter of input) {
    const lowerLetter = letter.toLowerCase();
    const replacement = replaceChars && replaceChars[lowerLetter];
    if (replacement) {
      replaceCount[lowerLetter] = (replaceCount[lowerLetter] || 0) + 1;
      // Keep the case of the letter that is being replaced.
      sanitizedVal += lowerLetter !== letter ? replacement.toUpperCase() : replacement;
    } else if (chars.includes(lowerLetter)) {
      sanitizedVal += letter;
    } else {
      invalidChars.push(letter);
    }
  }
  Object.keys(replaceCount).forEach((letter) => {
    const count = replaceCount[letter];
    warnings.push(
      `Replaced "${letter}" with "${replaceChars[letter]}"${count > 1 ? ` ${count} times` : ""}`
    );
  });
  if (invalidChars.length > 0) {
    // Only the first 100 offending characters are listed to keep the message bounded.
    warnings.push(
      `${name2 ? `Sequence ${name2}: ` : ""}Invalid character(s) detected and removed: ${invalidChars.slice(0, 100).join(", ")} `
    );
  }
  // When a global window.toastr exists, every warning is also shown through it.
  if (typeof window !== "undefined" && window.toastr && warnings.length) {
    warnings.forEach((warning) => {
      window.toastr.warning(warning);
    });
  }
  return [sanitizedVal, warnings];
}
|
|
11291
11339
|
__name(filterSequenceString, "filterSequenceString");
|
|
11340
|
+
/**
 * Returns the lower-case set of characters accepted for a sequence type, as a string.
 *
 * Precedence mirrors the option order: isProtein wins, then isOligo / isRna;
 * anything else (including isMixedRnaAndDna and plain DNA) gets the combined
 * RNA + DNA alphabet.
 *
 * @param {Object} [options]
 * @param {boolean} [options.isOligo]
 * @param {boolean} [options.isProtein]
 * @param {boolean} [options.isRna]
 * @param {boolean} [options.isMixedRnaAndDna] - Accepted for symmetry; yields the same alphabet as plain DNA.
 * @param {boolean} [options.includeStopCodon] - Protein only: also accept "*" and ".".
 * @returns {string} Accepted characters, all lower-case.
 */
function getAcceptedChars({
  isOligo,
  isProtein,
  isRna,
  isMixedRnaAndDna,
  includeStopCodon
} = {}) {
  if (isProtein) {
    // No trailing "}" here: a stray brace in the template would make "}" a valid residue.
    return `${protein_letters_withUandX.toLowerCase()}${includeStopCodon ? "*." : ""}`;
  }
  const rnaLetters = ambiguous_rna_letters.toLowerCase();
  if (isOligo || isRna) {
    return `${rnaLetters}t`;
  }
  // Mixed RNA/DNA and just plain old DNA share the combined alphabet.
  return `${rnaLetters}${ambiguous_dna_letters.toLowerCase()}`;
}
|
|
11352
|
+
__name(getAcceptedChars, "getAcceptedChars");
|
|
11353
|
+
/**
 * Returns the lower-case character substitutions to apply for a sequence type.
 *
 * Only RNA sequences that are neither protein nor oligo get a substitution
 * ("t" -> "u"); every other combination yields an empty map.
 *
 * @param {Object} [options]
 * @param {boolean} [options.isOligo]
 * @param {boolean} [options.isProtein]
 * @param {boolean} [options.isRna]
 * @param {boolean} [options.isMixedRnaAndDna] - Accepted for symmetry; never adds a substitution.
 * @returns {Object<string, string>} Map from lower-case input letter to its replacement.
 */
function getReplaceChars({
  isOligo,
  isProtein,
  isRna,
  isMixedRnaAndDna
} = {}) {
  // Protein and oligo take precedence over the RNA flag.
  if (isProtein || isOligo) {
    return {};
  }
  if (isRna) {
    return { t: "u" };
  }
  // Mixed RNA/DNA and just plain old DNA: nothing to replace.
  return {};
}
|
|
11364
|
+
__name(getReplaceChars, "getReplaceChars");
|
|
11292
11365
|
function tidyUpAnnotation(_annotation, {
|
|
11293
11366
|
sequenceData = {},
|
|
11294
11367
|
convertAnnotationsFromAAIndices,
|
|
@@ -11417,14 +11490,6 @@ var __async = (__this, __arguments, generator) => {
|
|
|
11417
11490
|
}
|
|
11418
11491
|
}
|
|
11419
11492
|
__name(coerceLocation, "coerceLocation");
|
|
11420
|
-
function filterAminoAcidSequenceString(sequenceString, options) {
|
|
11421
|
-
options = options || {};
|
|
11422
|
-
if (options.includeStopCodon) {
|
|
11423
|
-
return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu.*]/gi, "");
|
|
11424
|
-
}
|
|
11425
|
-
return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu]/gi, "");
|
|
11426
|
-
}
|
|
11427
|
-
__name(filterAminoAcidSequenceString, "filterAminoAcidSequenceString");
|
|
11428
11493
|
function getDegenerateDnaStringFromAAString(aaString) {
|
|
11429
11494
|
return aaString.split("").map((char) => aminoAcidToDegenerateDnaMap[char.toLowerCase()] || "nnn").join("");
|
|
11430
11495
|
}
|
|
@@ -11436,11 +11501,10 @@ var __async = (__this, __arguments, generator) => {
|
|
|
11436
11501
|
removeUnwantedChars,
|
|
11437
11502
|
additionalValidChars,
|
|
11438
11503
|
noTranslationData,
|
|
11439
|
-
charOverrides,
|
|
11440
11504
|
doNotProvideIdsForAnnotations,
|
|
11441
|
-
proteinFilterOptions,
|
|
11442
11505
|
noCdsTranslations,
|
|
11443
|
-
convertAnnotationsFromAAIndices
|
|
11506
|
+
convertAnnotationsFromAAIndices,
|
|
11507
|
+
topLevelSeqData
|
|
11444
11508
|
} = options;
|
|
11445
11509
|
let seqData = lodashExports.cloneDeep(pSeqData);
|
|
11446
11510
|
const response = {
|
|
@@ -11470,16 +11534,15 @@ var __async = (__this, __arguments, generator) => {
|
|
|
11470
11534
|
}
|
|
11471
11535
|
if (removeUnwantedChars) {
|
|
11472
11536
|
if (seqData.isProtein) {
|
|
11473
|
-
seqData.proteinSequence
|
|
11474
|
-
|
|
11475
|
-
|
|
11476
|
-
|
|
11537
|
+
const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadValues({
|
|
11538
|
+
includeStopCodon: true
|
|
11539
|
+
}, topLevelSeqData || seqData));
|
|
11540
|
+
seqData.proteinSequence = newSeq;
|
|
11477
11541
|
} else {
|
|
11478
|
-
|
|
11479
|
-
|
|
11480
|
-
|
|
11481
|
-
|
|
11482
|
-
);
|
|
11542
|
+
const [newSeq] = filterSequenceString(seqData.sequence, __spreadValues({
|
|
11543
|
+
additionalValidChars
|
|
11544
|
+
}, topLevelSeqData || seqData));
|
|
11545
|
+
seqData.sequence = newSeq;
|
|
11483
11546
|
}
|
|
11484
11547
|
}
|
|
11485
11548
|
if (seqData.isProtein) {
|
|
@@ -19314,7 +19377,8 @@ var __async = (__this, __arguments, generator) => {
|
|
|
19314
19377
|
inclusive1BasedEnd,
|
|
19315
19378
|
additionalValidChars,
|
|
19316
19379
|
allowOverflowAnnotations,
|
|
19317
|
-
coerceFeatureTypes
|
|
19380
|
+
coerceFeatureTypes,
|
|
19381
|
+
includeStopCodon
|
|
19318
19382
|
} = options;
|
|
19319
19383
|
[
|
|
19320
19384
|
"isDNA",
|
|
@@ -19364,7 +19428,6 @@ var __async = (__this, __arguments, generator) => {
|
|
|
19364
19428
|
response.messages.push("No sequence detected");
|
|
19365
19429
|
sequence.sequence = "";
|
|
19366
19430
|
}
|
|
19367
|
-
let validChars;
|
|
19368
19431
|
if (sequence.isProtein === void 0 && guessIfProtein) {
|
|
19369
19432
|
sequence.isProtein = !guessIfSequenceIsDnaAndNotProtein(
|
|
19370
19433
|
sequence.sequence,
|
|
@@ -19372,12 +19435,15 @@ var __async = (__this, __arguments, generator) => {
|
|
|
19372
19435
|
);
|
|
19373
19436
|
}
|
|
19374
19437
|
if (sequence.isProtein) {
|
|
19375
|
-
validChars =
|
|
19438
|
+
const [validChars, warnings] = filterSequenceString(sequence.sequence, {
|
|
19439
|
+
name: sequence.name,
|
|
19440
|
+
isProtein: true,
|
|
19441
|
+
additionalValidChars,
|
|
19442
|
+
includeStopCodon
|
|
19443
|
+
});
|
|
19376
19444
|
if (validChars !== sequence.sequence) {
|
|
19377
19445
|
sequence.sequence = validChars;
|
|
19378
|
-
response.messages.push(
|
|
19379
|
-
"Import Error: Illegal character(s) detected and removed from amino acid sequence. Allowed characters are: xtgalmfwkqespvicyhrndu"
|
|
19380
|
-
);
|
|
19446
|
+
response.messages.push(...warnings);
|
|
19381
19447
|
}
|
|
19382
19448
|
sequence.type = "PROTEIN";
|
|
19383
19449
|
sequence.isProtein = true;
|
|
@@ -19399,12 +19465,12 @@ var __async = (__this, __arguments, generator) => {
|
|
|
19399
19465
|
} else {
|
|
19400
19466
|
sequence.type = "DNA";
|
|
19401
19467
|
}
|
|
19402
|
-
validChars = filterSequenceString(sequence.sequence,
|
|
19468
|
+
const [validChars, warnings] = filterSequenceString(sequence.sequence, __spreadValues({
|
|
19469
|
+
additionalValidChars
|
|
19470
|
+
}, sequence));
|
|
19403
19471
|
if (validChars !== sequence.sequence) {
|
|
19404
19472
|
sequence.sequence = validChars;
|
|
19405
|
-
response.messages.push(
|
|
19406
|
-
"Import Error: Illegal character(s) detected and removed from sequence. Allowed characters are: atgcyrswkmbvdhn"
|
|
19407
|
-
);
|
|
19473
|
+
response.messages.push(...warnings);
|
|
19408
19474
|
}
|
|
19409
19475
|
}
|
|
19410
19476
|
if (!sequence.size) {
|
package/package.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@teselagen/bio-parsers",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.4.1",
|
|
4
4
|
"dependencies": {
|
|
5
|
-
"@teselagen/sequence-utils": "0.3.
|
|
5
|
+
"@teselagen/sequence-utils": "0.3.9",
|
|
6
6
|
"@teselagen/range-utils": "0.3.7",
|
|
7
7
|
"@gmod/gff": "^1.2.1",
|
|
8
8
|
"buffer": "^6.0.3",
|
|
package/src/utils/validateSequence.js
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import areNonNegativeIntegers from "validate.io-nonnegative-integer-array";
|
|
2
2
|
import { getFeatureTypes } from "@teselagen/sequence-utils";
|
|
3
3
|
import {
|
|
4
|
-
filterAminoAcidSequenceString,
|
|
5
4
|
filterSequenceString,
|
|
6
5
|
guessIfSequenceIsDnaAndNotProtein
|
|
7
6
|
} from "@teselagen/sequence-utils";
|
|
@@ -30,7 +29,8 @@ export default function validateSequence(sequence, options = {}) {
|
|
|
30
29
|
inclusive1BasedEnd,
|
|
31
30
|
additionalValidChars,
|
|
32
31
|
allowOverflowAnnotations,
|
|
33
|
-
coerceFeatureTypes
|
|
32
|
+
coerceFeatureTypes,
|
|
33
|
+
includeStopCodon
|
|
34
34
|
} = options;
|
|
35
35
|
[
|
|
36
36
|
"isDNA",
|
|
@@ -84,7 +84,7 @@ export default function validateSequence(sequence, options = {}) {
|
|
|
84
84
|
response.messages.push("No sequence detected");
|
|
85
85
|
sequence.sequence = "";
|
|
86
86
|
}
|
|
87
|
-
|
|
87
|
+
|
|
88
88
|
if (sequence.isProtein === undefined && guessIfProtein) {
|
|
89
89
|
sequence.isProtein = !guessIfSequenceIsDnaAndNotProtein(
|
|
90
90
|
sequence.sequence,
|
|
@@ -93,12 +93,15 @@ export default function validateSequence(sequence, options = {}) {
|
|
|
93
93
|
}
|
|
94
94
|
if (sequence.isProtein) {
|
|
95
95
|
//tnr: add code to strip invalid protein data..
|
|
96
|
-
validChars =
|
|
96
|
+
const [validChars, warnings] = filterSequenceString(sequence.sequence, {
|
|
97
|
+
name: sequence.name,
|
|
98
|
+
isProtein: true,
|
|
99
|
+
additionalValidChars,
|
|
100
|
+
includeStopCodon
|
|
101
|
+
});
|
|
97
102
|
if (validChars !== sequence.sequence) {
|
|
98
103
|
sequence.sequence = validChars;
|
|
99
|
-
response.messages.push(
|
|
100
|
-
"Import Error: Illegal character(s) detected and removed from amino acid sequence. Allowed characters are: xtgalmfwkqespvicyhrndu"
|
|
101
|
-
);
|
|
104
|
+
response.messages.push(...warnings);
|
|
102
105
|
}
|
|
103
106
|
sequence.type = "PROTEIN";
|
|
104
107
|
sequence.isProtein = true;
|
|
@@ -126,12 +129,13 @@ export default function validateSequence(sequence, options = {}) {
|
|
|
126
129
|
sequence.type = "DNA";
|
|
127
130
|
}
|
|
128
131
|
|
|
129
|
-
validChars = filterSequenceString(sequence.sequence,
|
|
132
|
+
const [validChars, warnings] = filterSequenceString(sequence.sequence, {
|
|
133
|
+
additionalValidChars,
|
|
134
|
+
...sequence
|
|
135
|
+
});
|
|
130
136
|
if (validChars !== sequence.sequence) {
|
|
131
137
|
sequence.sequence = validChars;
|
|
132
|
-
response.messages.push(
|
|
133
|
-
"Import Error: Illegal character(s) detected and removed from sequence. Allowed characters are: atgcyrswkmbvdhn"
|
|
134
|
-
);
|
|
138
|
+
response.messages.push(...warnings);
|
|
135
139
|
}
|
|
136
140
|
}
|
|
137
141
|
|