@teselagen/sequence-utils 0.3.8 → 0.3.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bioData.d.ts +2 -1
- package/filterSequenceString.d.ts +24 -1
- package/index.d.ts +0 -1
- package/index.js +91 -34
- package/index.mjs +91 -34
- package/index.umd.js +91 -34
- package/package.json +1 -1
- package/src/bioData.js +2 -3
- package/src/filterSequenceString.js +106 -17
- package/src/filterSequenceString.test.js +60 -3
- package/src/index.js +0 -1
- package/src/proteinAlphabet.js +2 -5
- package/src/tidyUpSequenceData.js +13 -17
- package/src/tidyUpSequenceData.test.js +12 -66
- package/filterAminoAcidSequenceString.d.ts +0 -1
- package/filterAminoAcidSequenceString.test.d.ts +0 -1
- package/src/filterAminoAcidSequenceString.js +0 -10
- package/src/filterAminoAcidSequenceString.test.js +0 -24
package/bioData.d.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
export const protein_letters: "ACDEFGHIKLMNPQRSTVWY";
|
|
2
|
-
export const
|
|
2
|
+
export const protein_letters_withUandX: "ACDEFGHIKLMNPQRSTVWYUX";
|
|
3
|
+
export const extended_protein_letters: "ACDEFGHIKLMNPQRSTVWYBXZJUO*";
|
|
3
4
|
export const ambiguous_dna_letters: "GATCRYWSMKHBVDN";
|
|
4
5
|
export const unambiguous_dna_letters: "GATC";
|
|
5
6
|
export const ambiguous_rna_letters: "GAUCRYWSMKHBVDN";
|
|
@@ -1 +1,24 @@
|
|
|
1
|
-
export default function filterSequenceString(sequenceString: any, additionalValidChars
|
|
1
|
+
export default function filterSequenceString(sequenceString: any, { additionalValidChars, isOligo, name, isProtein, isRna, isMixedRnaAndDna, }?: {
|
|
2
|
+
additionalValidChars?: string | undefined;
|
|
3
|
+
isOligo: any;
|
|
4
|
+
name: any;
|
|
5
|
+
isProtein: any;
|
|
6
|
+
isRna: any;
|
|
7
|
+
isMixedRnaAndDna: any;
|
|
8
|
+
}): (string | string[])[];
|
|
9
|
+
export function getAcceptedChars({ isOligo, isProtein, isRna, isMixedRnaAndDna, }?: {
|
|
10
|
+
isOligo: any;
|
|
11
|
+
isProtein: any;
|
|
12
|
+
isRna: any;
|
|
13
|
+
isMixedRnaAndDna: any;
|
|
14
|
+
}): string;
|
|
15
|
+
export function getReplaceChars({ isOligo, isProtein, isRna, isMixedRnaAndDna }?: {
|
|
16
|
+
isOligo: any;
|
|
17
|
+
isProtein: any;
|
|
18
|
+
isRna: any;
|
|
19
|
+
isMixedRnaAndDna: any;
|
|
20
|
+
}): {
|
|
21
|
+
t?: undefined;
|
|
22
|
+
} | {
|
|
23
|
+
t: string;
|
|
24
|
+
};
|
package/index.d.ts
CHANGED
|
@@ -20,7 +20,6 @@ export { default as aliasedEnzymesByName } from "./aliasedEnzymesByName";
|
|
|
20
20
|
export { default as defaultEnzymesByName } from "./defaultEnzymesByName";
|
|
21
21
|
export { default as generateSequenceData } from "./generateSequenceData";
|
|
22
22
|
export { default as generateAnnotations } from "./generateAnnotations";
|
|
23
|
-
export { default as filterAminoAcidSequenceString } from "./filterAminoAcidSequenceString";
|
|
24
23
|
export { default as filterSequenceString } from "./filterSequenceString";
|
|
25
24
|
export { default as findNearestRangeOfSequenceOverlapToPosition } from "./findNearestRangeOfSequenceOverlapToPosition";
|
|
26
25
|
export { default as findOrfsInPlasmid } from "./findOrfsInPlasmid";
|
package/index.js
CHANGED
|
@@ -5999,7 +5999,8 @@ lodash.exports;
|
|
|
5999
5999
|
})(lodash, lodash.exports);
|
|
6000
6000
|
var lodashExports = lodash.exports;
|
|
6001
6001
|
const protein_letters = "ACDEFGHIKLMNPQRSTVWY";
|
|
6002
|
-
const
|
|
6002
|
+
const protein_letters_withUandX = "ACDEFGHIKLMNPQRSTVWYUX";
|
|
6003
|
+
const extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO*";
|
|
6003
6004
|
const ambiguous_dna_letters = "GATCRYWSMKHBVDN";
|
|
6004
6005
|
const unambiguous_dna_letters = "GATC";
|
|
6005
6006
|
const ambiguous_rna_letters = "GAUCRYWSMKHBVDN";
|
|
@@ -6067,6 +6068,7 @@ const bioData = /* @__PURE__ */ Object.freeze(/* @__PURE__ */ Object.definePrope
|
|
|
6067
6068
|
extended_protein_letters,
|
|
6068
6069
|
extended_protein_values,
|
|
6069
6070
|
protein_letters,
|
|
6071
|
+
protein_letters_withUandX,
|
|
6070
6072
|
unambiguous_dna_letters,
|
|
6071
6073
|
unambiguous_rna_letters
|
|
6072
6074
|
}, Symbol.toStringTag, { value: "Module" }));
|
|
@@ -12321,20 +12323,88 @@ const modifiableTypes = [
|
|
|
12321
12323
|
"primers",
|
|
12322
12324
|
"guides"
|
|
12323
12325
|
];
|
|
12324
|
-
function filterSequenceString(sequenceString,
|
|
12325
|
-
|
|
12326
|
-
|
|
12327
|
-
|
|
12328
|
-
|
|
12329
|
-
|
|
12330
|
-
|
|
12331
|
-
|
|
12326
|
+
function filterSequenceString(sequenceString, {
|
|
12327
|
+
additionalValidChars = "",
|
|
12328
|
+
isOligo,
|
|
12329
|
+
name,
|
|
12330
|
+
isProtein,
|
|
12331
|
+
isRna,
|
|
12332
|
+
isMixedRnaAndDna
|
|
12333
|
+
} = {}) {
|
|
12334
|
+
const acceptedChars = getAcceptedChars({
|
|
12335
|
+
isOligo,
|
|
12336
|
+
isProtein,
|
|
12337
|
+
isRna,
|
|
12338
|
+
isMixedRnaAndDna
|
|
12339
|
+
});
|
|
12340
|
+
const replaceChars = getReplaceChars({
|
|
12341
|
+
isOligo,
|
|
12342
|
+
isProtein,
|
|
12343
|
+
isRna,
|
|
12344
|
+
isMixedRnaAndDna
|
|
12345
|
+
});
|
|
12346
|
+
let sanitizedVal = "";
|
|
12347
|
+
const invalidChars = [];
|
|
12348
|
+
const chars = `${acceptedChars}${additionalValidChars.split("").join("\\")}`;
|
|
12349
|
+
const warnings = [];
|
|
12350
|
+
const replaceCount = {};
|
|
12351
|
+
sequenceString.split("").forEach((letter) => {
|
|
12352
|
+
const lowerLetter = letter.toLowerCase();
|
|
12353
|
+
if (replaceChars && replaceChars[lowerLetter]) {
|
|
12354
|
+
if (!replaceCount[lowerLetter]) {
|
|
12355
|
+
replaceCount[lowerLetter] = 0;
|
|
12356
|
+
}
|
|
12357
|
+
replaceCount[lowerLetter]++;
|
|
12358
|
+
const isUpper = lowerLetter !== letter;
|
|
12359
|
+
sanitizedVal += isUpper ? replaceChars[lowerLetter].toUpperCase() : replaceChars[lowerLetter];
|
|
12360
|
+
} else if (chars.includes(lowerLetter)) {
|
|
12361
|
+
sanitizedVal += letter;
|
|
12362
|
+
} else {
|
|
12363
|
+
invalidChars.push(letter);
|
|
12364
|
+
}
|
|
12365
|
+
});
|
|
12366
|
+
Object.keys(replaceCount).forEach((letter) => {
|
|
12367
|
+
warnings.push(
|
|
12368
|
+
`Replaced "${letter}" with "${replaceChars[letter]}"${replaceCount[letter] > 1 ? ` ${replaceCount[letter]} times` : ""}`
|
|
12332
12369
|
);
|
|
12333
|
-
}
|
|
12334
|
-
|
|
12370
|
+
});
|
|
12371
|
+
if (sequenceString.length !== sanitizedVal.length) {
|
|
12372
|
+
warnings.push(
|
|
12373
|
+
`${name ? `Sequence ${name}: ` : ""}Invalid character(s) detected and removed: ${invalidChars.slice(0, 100).join(", ")} `
|
|
12374
|
+
);
|
|
12375
|
+
}
|
|
12376
|
+
if (typeof window !== "undefined" && window.toastr && warnings.length) {
|
|
12377
|
+
warnings.forEach((warning) => {
|
|
12378
|
+
window.toastr.warning(warning);
|
|
12379
|
+
});
|
|
12335
12380
|
}
|
|
12381
|
+
return [sanitizedVal, warnings];
|
|
12336
12382
|
}
|
|
12337
12383
|
__name(filterSequenceString, "filterSequenceString");
|
|
12384
|
+
function getAcceptedChars({
|
|
12385
|
+
isOligo,
|
|
12386
|
+
isProtein,
|
|
12387
|
+
isRna,
|
|
12388
|
+
isMixedRnaAndDna
|
|
12389
|
+
} = {}) {
|
|
12390
|
+
return isProtein ? `${extended_protein_letters.toLowerCase()}}` : isOligo ? ambiguous_rna_letters.toLowerCase() + "t" : isRna ? ambiguous_rna_letters.toLowerCase() + "t" : isMixedRnaAndDna ? ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase() : (
|
|
12391
|
+
//just plain old dna
|
|
12392
|
+
ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase()
|
|
12393
|
+
);
|
|
12394
|
+
}
|
|
12395
|
+
__name(getAcceptedChars, "getAcceptedChars");
|
|
12396
|
+
function getReplaceChars({
|
|
12397
|
+
isOligo,
|
|
12398
|
+
isProtein,
|
|
12399
|
+
isRna,
|
|
12400
|
+
isMixedRnaAndDna
|
|
12401
|
+
} = {}) {
|
|
12402
|
+
return isProtein ? {} : isOligo ? {} : isRna ? { t: "u" } : isMixedRnaAndDna ? {} : (
|
|
12403
|
+
//just plain old dna
|
|
12404
|
+
{}
|
|
12405
|
+
);
|
|
12406
|
+
}
|
|
12407
|
+
__name(getReplaceChars, "getReplaceChars");
|
|
12338
12408
|
function tidyUpAnnotation(_annotation, {
|
|
12339
12409
|
sequenceData = {},
|
|
12340
12410
|
convertAnnotationsFromAAIndices,
|
|
@@ -12463,14 +12533,6 @@ function coerceLocation({
|
|
|
12463
12533
|
}
|
|
12464
12534
|
}
|
|
12465
12535
|
__name(coerceLocation, "coerceLocation");
|
|
12466
|
-
function filterAminoAcidSequenceString(sequenceString, options) {
|
|
12467
|
-
options = options || {};
|
|
12468
|
-
if (options.includeStopCodon) {
|
|
12469
|
-
return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu.*]/gi, "");
|
|
12470
|
-
}
|
|
12471
|
-
return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu]/gi, "");
|
|
12472
|
-
}
|
|
12473
|
-
__name(filterAminoAcidSequenceString, "filterAminoAcidSequenceString");
|
|
12474
12536
|
function getDegenerateDnaStringFromAAString(aaString) {
|
|
12475
12537
|
return aaString.split("").map((char) => aminoAcidToDegenerateDnaMap[char.toLowerCase()] || "nnn").join("");
|
|
12476
12538
|
}
|
|
@@ -12479,14 +12541,13 @@ function tidyUpSequenceData(pSeqData, options = {}) {
|
|
|
12479
12541
|
const {
|
|
12480
12542
|
annotationsAsObjects,
|
|
12481
12543
|
logMessages,
|
|
12482
|
-
|
|
12544
|
+
doNotRemoveInvalidChars,
|
|
12483
12545
|
additionalValidChars,
|
|
12484
12546
|
noTranslationData,
|
|
12485
|
-
charOverrides,
|
|
12486
12547
|
doNotProvideIdsForAnnotations,
|
|
12487
|
-
proteinFilterOptions,
|
|
12488
12548
|
noCdsTranslations,
|
|
12489
|
-
convertAnnotationsFromAAIndices
|
|
12549
|
+
convertAnnotationsFromAAIndices,
|
|
12550
|
+
topLevelSeqData
|
|
12490
12551
|
} = options;
|
|
12491
12552
|
let seqData = lodashExports.cloneDeep(pSeqData);
|
|
12492
12553
|
const response = {
|
|
@@ -12514,18 +12575,15 @@ function tidyUpSequenceData(pSeqData, options = {}) {
|
|
|
12514
12575
|
if (seqData.isRna) {
|
|
12515
12576
|
seqData.sequence = seqData.sequence.replace(/t/gi, "u");
|
|
12516
12577
|
}
|
|
12517
|
-
if (
|
|
12578
|
+
if (!doNotRemoveInvalidChars) {
|
|
12518
12579
|
if (seqData.isProtein) {
|
|
12519
|
-
seqData.proteinSequence
|
|
12520
|
-
|
|
12521
|
-
__spreadValues({ includeStopCodon: true }, proteinFilterOptions)
|
|
12522
|
-
);
|
|
12580
|
+
const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadValues({}, topLevelSeqData || seqData));
|
|
12581
|
+
seqData.proteinSequence = newSeq;
|
|
12523
12582
|
} else {
|
|
12524
|
-
|
|
12525
|
-
|
|
12526
|
-
|
|
12527
|
-
|
|
12528
|
-
);
|
|
12583
|
+
const [newSeq] = filterSequenceString(seqData.sequence, __spreadValues({
|
|
12584
|
+
additionalValidChars
|
|
12585
|
+
}, topLevelSeqData || seqData));
|
|
12586
|
+
seqData.sequence = newSeq;
|
|
12529
12587
|
}
|
|
12530
12588
|
}
|
|
12531
12589
|
if (seqData.isProtein) {
|
|
@@ -22665,7 +22723,6 @@ exports.degenerateRnaToAminoAcidMap = degenerateRnaToAminoAcidMap;
|
|
|
22665
22723
|
exports.deleteSequenceDataAtRange = deleteSequenceDataAtRange;
|
|
22666
22724
|
exports.doesEnzymeChopOutsideOfRecognitionSite = doesEnzymeChopOutsideOfRecognitionSite;
|
|
22667
22725
|
exports.featureColors = featureColors;
|
|
22668
|
-
exports.filterAminoAcidSequenceString = filterAminoAcidSequenceString;
|
|
22669
22726
|
exports.filterSequenceString = filterSequenceString;
|
|
22670
22727
|
exports.findNearestRangeOfSequenceOverlapToPosition = findNearestRangeOfSequenceOverlapToPosition;
|
|
22671
22728
|
exports.findOrfsInPlasmid = findOrfsInPlasmid;
|
package/index.mjs
CHANGED
|
@@ -5997,7 +5997,8 @@ lodash.exports;
|
|
|
5997
5997
|
})(lodash, lodash.exports);
|
|
5998
5998
|
var lodashExports = lodash.exports;
|
|
5999
5999
|
const protein_letters = "ACDEFGHIKLMNPQRSTVWY";
|
|
6000
|
-
const
|
|
6000
|
+
const protein_letters_withUandX = "ACDEFGHIKLMNPQRSTVWYUX";
|
|
6001
|
+
const extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO*";
|
|
6001
6002
|
const ambiguous_dna_letters = "GATCRYWSMKHBVDN";
|
|
6002
6003
|
const unambiguous_dna_letters = "GATC";
|
|
6003
6004
|
const ambiguous_rna_letters = "GAUCRYWSMKHBVDN";
|
|
@@ -6065,6 +6066,7 @@ const bioData = /* @__PURE__ */ Object.freeze(/* @__PURE__ */ Object.definePrope
|
|
|
6065
6066
|
extended_protein_letters,
|
|
6066
6067
|
extended_protein_values,
|
|
6067
6068
|
protein_letters,
|
|
6069
|
+
protein_letters_withUandX,
|
|
6068
6070
|
unambiguous_dna_letters,
|
|
6069
6071
|
unambiguous_rna_letters
|
|
6070
6072
|
}, Symbol.toStringTag, { value: "Module" }));
|
|
@@ -12319,20 +12321,88 @@ const modifiableTypes = [
|
|
|
12319
12321
|
"primers",
|
|
12320
12322
|
"guides"
|
|
12321
12323
|
];
|
|
12322
|
-
function filterSequenceString(sequenceString,
|
|
12323
|
-
|
|
12324
|
-
|
|
12325
|
-
|
|
12326
|
-
|
|
12327
|
-
|
|
12328
|
-
|
|
12329
|
-
|
|
12324
|
+
function filterSequenceString(sequenceString, {
|
|
12325
|
+
additionalValidChars = "",
|
|
12326
|
+
isOligo,
|
|
12327
|
+
name,
|
|
12328
|
+
isProtein,
|
|
12329
|
+
isRna,
|
|
12330
|
+
isMixedRnaAndDna
|
|
12331
|
+
} = {}) {
|
|
12332
|
+
const acceptedChars = getAcceptedChars({
|
|
12333
|
+
isOligo,
|
|
12334
|
+
isProtein,
|
|
12335
|
+
isRna,
|
|
12336
|
+
isMixedRnaAndDna
|
|
12337
|
+
});
|
|
12338
|
+
const replaceChars = getReplaceChars({
|
|
12339
|
+
isOligo,
|
|
12340
|
+
isProtein,
|
|
12341
|
+
isRna,
|
|
12342
|
+
isMixedRnaAndDna
|
|
12343
|
+
});
|
|
12344
|
+
let sanitizedVal = "";
|
|
12345
|
+
const invalidChars = [];
|
|
12346
|
+
const chars = `${acceptedChars}${additionalValidChars.split("").join("\\")}`;
|
|
12347
|
+
const warnings = [];
|
|
12348
|
+
const replaceCount = {};
|
|
12349
|
+
sequenceString.split("").forEach((letter) => {
|
|
12350
|
+
const lowerLetter = letter.toLowerCase();
|
|
12351
|
+
if (replaceChars && replaceChars[lowerLetter]) {
|
|
12352
|
+
if (!replaceCount[lowerLetter]) {
|
|
12353
|
+
replaceCount[lowerLetter] = 0;
|
|
12354
|
+
}
|
|
12355
|
+
replaceCount[lowerLetter]++;
|
|
12356
|
+
const isUpper = lowerLetter !== letter;
|
|
12357
|
+
sanitizedVal += isUpper ? replaceChars[lowerLetter].toUpperCase() : replaceChars[lowerLetter];
|
|
12358
|
+
} else if (chars.includes(lowerLetter)) {
|
|
12359
|
+
sanitizedVal += letter;
|
|
12360
|
+
} else {
|
|
12361
|
+
invalidChars.push(letter);
|
|
12362
|
+
}
|
|
12363
|
+
});
|
|
12364
|
+
Object.keys(replaceCount).forEach((letter) => {
|
|
12365
|
+
warnings.push(
|
|
12366
|
+
`Replaced "${letter}" with "${replaceChars[letter]}"${replaceCount[letter] > 1 ? ` ${replaceCount[letter]} times` : ""}`
|
|
12330
12367
|
);
|
|
12331
|
-
}
|
|
12332
|
-
|
|
12368
|
+
});
|
|
12369
|
+
if (sequenceString.length !== sanitizedVal.length) {
|
|
12370
|
+
warnings.push(
|
|
12371
|
+
`${name ? `Sequence ${name}: ` : ""}Invalid character(s) detected and removed: ${invalidChars.slice(0, 100).join(", ")} `
|
|
12372
|
+
);
|
|
12373
|
+
}
|
|
12374
|
+
if (typeof window !== "undefined" && window.toastr && warnings.length) {
|
|
12375
|
+
warnings.forEach((warning) => {
|
|
12376
|
+
window.toastr.warning(warning);
|
|
12377
|
+
});
|
|
12333
12378
|
}
|
|
12379
|
+
return [sanitizedVal, warnings];
|
|
12334
12380
|
}
|
|
12335
12381
|
__name(filterSequenceString, "filterSequenceString");
|
|
12382
|
+
function getAcceptedChars({
|
|
12383
|
+
isOligo,
|
|
12384
|
+
isProtein,
|
|
12385
|
+
isRna,
|
|
12386
|
+
isMixedRnaAndDna
|
|
12387
|
+
} = {}) {
|
|
12388
|
+
return isProtein ? `${extended_protein_letters.toLowerCase()}}` : isOligo ? ambiguous_rna_letters.toLowerCase() + "t" : isRna ? ambiguous_rna_letters.toLowerCase() + "t" : isMixedRnaAndDna ? ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase() : (
|
|
12389
|
+
//just plain old dna
|
|
12390
|
+
ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase()
|
|
12391
|
+
);
|
|
12392
|
+
}
|
|
12393
|
+
__name(getAcceptedChars, "getAcceptedChars");
|
|
12394
|
+
function getReplaceChars({
|
|
12395
|
+
isOligo,
|
|
12396
|
+
isProtein,
|
|
12397
|
+
isRna,
|
|
12398
|
+
isMixedRnaAndDna
|
|
12399
|
+
} = {}) {
|
|
12400
|
+
return isProtein ? {} : isOligo ? {} : isRna ? { t: "u" } : isMixedRnaAndDna ? {} : (
|
|
12401
|
+
//just plain old dna
|
|
12402
|
+
{}
|
|
12403
|
+
);
|
|
12404
|
+
}
|
|
12405
|
+
__name(getReplaceChars, "getReplaceChars");
|
|
12336
12406
|
function tidyUpAnnotation(_annotation, {
|
|
12337
12407
|
sequenceData = {},
|
|
12338
12408
|
convertAnnotationsFromAAIndices,
|
|
@@ -12461,14 +12531,6 @@ function coerceLocation({
|
|
|
12461
12531
|
}
|
|
12462
12532
|
}
|
|
12463
12533
|
__name(coerceLocation, "coerceLocation");
|
|
12464
|
-
function filterAminoAcidSequenceString(sequenceString, options) {
|
|
12465
|
-
options = options || {};
|
|
12466
|
-
if (options.includeStopCodon) {
|
|
12467
|
-
return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu.*]/gi, "");
|
|
12468
|
-
}
|
|
12469
|
-
return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu]/gi, "");
|
|
12470
|
-
}
|
|
12471
|
-
__name(filterAminoAcidSequenceString, "filterAminoAcidSequenceString");
|
|
12472
12534
|
function getDegenerateDnaStringFromAAString(aaString) {
|
|
12473
12535
|
return aaString.split("").map((char) => aminoAcidToDegenerateDnaMap[char.toLowerCase()] || "nnn").join("");
|
|
12474
12536
|
}
|
|
@@ -12477,14 +12539,13 @@ function tidyUpSequenceData(pSeqData, options = {}) {
|
|
|
12477
12539
|
const {
|
|
12478
12540
|
annotationsAsObjects,
|
|
12479
12541
|
logMessages,
|
|
12480
|
-
|
|
12542
|
+
doNotRemoveInvalidChars,
|
|
12481
12543
|
additionalValidChars,
|
|
12482
12544
|
noTranslationData,
|
|
12483
|
-
charOverrides,
|
|
12484
12545
|
doNotProvideIdsForAnnotations,
|
|
12485
|
-
proteinFilterOptions,
|
|
12486
12546
|
noCdsTranslations,
|
|
12487
|
-
convertAnnotationsFromAAIndices
|
|
12547
|
+
convertAnnotationsFromAAIndices,
|
|
12548
|
+
topLevelSeqData
|
|
12488
12549
|
} = options;
|
|
12489
12550
|
let seqData = lodashExports.cloneDeep(pSeqData);
|
|
12490
12551
|
const response = {
|
|
@@ -12512,18 +12573,15 @@ function tidyUpSequenceData(pSeqData, options = {}) {
|
|
|
12512
12573
|
if (seqData.isRna) {
|
|
12513
12574
|
seqData.sequence = seqData.sequence.replace(/t/gi, "u");
|
|
12514
12575
|
}
|
|
12515
|
-
if (
|
|
12576
|
+
if (!doNotRemoveInvalidChars) {
|
|
12516
12577
|
if (seqData.isProtein) {
|
|
12517
|
-
seqData.proteinSequence
|
|
12518
|
-
|
|
12519
|
-
__spreadValues({ includeStopCodon: true }, proteinFilterOptions)
|
|
12520
|
-
);
|
|
12578
|
+
const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadValues({}, topLevelSeqData || seqData));
|
|
12579
|
+
seqData.proteinSequence = newSeq;
|
|
12521
12580
|
} else {
|
|
12522
|
-
|
|
12523
|
-
|
|
12524
|
-
|
|
12525
|
-
|
|
12526
|
-
);
|
|
12581
|
+
const [newSeq] = filterSequenceString(seqData.sequence, __spreadValues({
|
|
12582
|
+
additionalValidChars
|
|
12583
|
+
}, topLevelSeqData || seqData));
|
|
12584
|
+
seqData.sequence = newSeq;
|
|
12527
12585
|
}
|
|
12528
12586
|
}
|
|
12529
12587
|
if (seqData.isProtein) {
|
|
@@ -22664,7 +22722,6 @@ export {
|
|
|
22664
22722
|
deleteSequenceDataAtRange,
|
|
22665
22723
|
doesEnzymeChopOutsideOfRecognitionSite,
|
|
22666
22724
|
featureColors,
|
|
22667
|
-
filterAminoAcidSequenceString,
|
|
22668
22725
|
filterSequenceString,
|
|
22669
22726
|
findNearestRangeOfSequenceOverlapToPosition,
|
|
22670
22727
|
findOrfsInPlasmid,
|
package/index.umd.js
CHANGED
|
@@ -6001,7 +6001,8 @@ var __name = (target, value) => __defProp(target, "name", { value, configurable:
|
|
|
6001
6001
|
})(lodash, lodash.exports);
|
|
6002
6002
|
var lodashExports = lodash.exports;
|
|
6003
6003
|
const protein_letters = "ACDEFGHIKLMNPQRSTVWY";
|
|
6004
|
-
const
|
|
6004
|
+
const protein_letters_withUandX = "ACDEFGHIKLMNPQRSTVWYUX";
|
|
6005
|
+
const extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO*";
|
|
6005
6006
|
const ambiguous_dna_letters = "GATCRYWSMKHBVDN";
|
|
6006
6007
|
const unambiguous_dna_letters = "GATC";
|
|
6007
6008
|
const ambiguous_rna_letters = "GAUCRYWSMKHBVDN";
|
|
@@ -6069,6 +6070,7 @@ var __name = (target, value) => __defProp(target, "name", { value, configurable:
|
|
|
6069
6070
|
extended_protein_letters,
|
|
6070
6071
|
extended_protein_values,
|
|
6071
6072
|
protein_letters,
|
|
6073
|
+
protein_letters_withUandX,
|
|
6072
6074
|
unambiguous_dna_letters,
|
|
6073
6075
|
unambiguous_rna_letters
|
|
6074
6076
|
}, Symbol.toStringTag, { value: "Module" }));
|
|
@@ -12323,20 +12325,88 @@ var __name = (target, value) => __defProp(target, "name", { value, configurable:
|
|
|
12323
12325
|
"primers",
|
|
12324
12326
|
"guides"
|
|
12325
12327
|
];
|
|
12326
|
-
function filterSequenceString(sequenceString,
|
|
12327
|
-
|
|
12328
|
-
|
|
12329
|
-
|
|
12330
|
-
|
|
12331
|
-
|
|
12332
|
-
|
|
12333
|
-
|
|
12328
|
+
function filterSequenceString(sequenceString, {
|
|
12329
|
+
additionalValidChars = "",
|
|
12330
|
+
isOligo,
|
|
12331
|
+
name,
|
|
12332
|
+
isProtein,
|
|
12333
|
+
isRna,
|
|
12334
|
+
isMixedRnaAndDna
|
|
12335
|
+
} = {}) {
|
|
12336
|
+
const acceptedChars = getAcceptedChars({
|
|
12337
|
+
isOligo,
|
|
12338
|
+
isProtein,
|
|
12339
|
+
isRna,
|
|
12340
|
+
isMixedRnaAndDna
|
|
12341
|
+
});
|
|
12342
|
+
const replaceChars = getReplaceChars({
|
|
12343
|
+
isOligo,
|
|
12344
|
+
isProtein,
|
|
12345
|
+
isRna,
|
|
12346
|
+
isMixedRnaAndDna
|
|
12347
|
+
});
|
|
12348
|
+
let sanitizedVal = "";
|
|
12349
|
+
const invalidChars = [];
|
|
12350
|
+
const chars = `${acceptedChars}${additionalValidChars.split("").join("\\")}`;
|
|
12351
|
+
const warnings = [];
|
|
12352
|
+
const replaceCount = {};
|
|
12353
|
+
sequenceString.split("").forEach((letter) => {
|
|
12354
|
+
const lowerLetter = letter.toLowerCase();
|
|
12355
|
+
if (replaceChars && replaceChars[lowerLetter]) {
|
|
12356
|
+
if (!replaceCount[lowerLetter]) {
|
|
12357
|
+
replaceCount[lowerLetter] = 0;
|
|
12358
|
+
}
|
|
12359
|
+
replaceCount[lowerLetter]++;
|
|
12360
|
+
const isUpper = lowerLetter !== letter;
|
|
12361
|
+
sanitizedVal += isUpper ? replaceChars[lowerLetter].toUpperCase() : replaceChars[lowerLetter];
|
|
12362
|
+
} else if (chars.includes(lowerLetter)) {
|
|
12363
|
+
sanitizedVal += letter;
|
|
12364
|
+
} else {
|
|
12365
|
+
invalidChars.push(letter);
|
|
12366
|
+
}
|
|
12367
|
+
});
|
|
12368
|
+
Object.keys(replaceCount).forEach((letter) => {
|
|
12369
|
+
warnings.push(
|
|
12370
|
+
`Replaced "${letter}" with "${replaceChars[letter]}"${replaceCount[letter] > 1 ? ` ${replaceCount[letter]} times` : ""}`
|
|
12334
12371
|
);
|
|
12335
|
-
}
|
|
12336
|
-
|
|
12372
|
+
});
|
|
12373
|
+
if (sequenceString.length !== sanitizedVal.length) {
|
|
12374
|
+
warnings.push(
|
|
12375
|
+
`${name ? `Sequence ${name}: ` : ""}Invalid character(s) detected and removed: ${invalidChars.slice(0, 100).join(", ")} `
|
|
12376
|
+
);
|
|
12377
|
+
}
|
|
12378
|
+
if (typeof window !== "undefined" && window.toastr && warnings.length) {
|
|
12379
|
+
warnings.forEach((warning) => {
|
|
12380
|
+
window.toastr.warning(warning);
|
|
12381
|
+
});
|
|
12337
12382
|
}
|
|
12383
|
+
return [sanitizedVal, warnings];
|
|
12338
12384
|
}
|
|
12339
12385
|
__name(filterSequenceString, "filterSequenceString");
|
|
12386
|
+
function getAcceptedChars({
|
|
12387
|
+
isOligo,
|
|
12388
|
+
isProtein,
|
|
12389
|
+
isRna,
|
|
12390
|
+
isMixedRnaAndDna
|
|
12391
|
+
} = {}) {
|
|
12392
|
+
return isProtein ? `${extended_protein_letters.toLowerCase()}}` : isOligo ? ambiguous_rna_letters.toLowerCase() + "t" : isRna ? ambiguous_rna_letters.toLowerCase() + "t" : isMixedRnaAndDna ? ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase() : (
|
|
12393
|
+
//just plain old dna
|
|
12394
|
+
ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase()
|
|
12395
|
+
);
|
|
12396
|
+
}
|
|
12397
|
+
__name(getAcceptedChars, "getAcceptedChars");
|
|
12398
|
+
function getReplaceChars({
|
|
12399
|
+
isOligo,
|
|
12400
|
+
isProtein,
|
|
12401
|
+
isRna,
|
|
12402
|
+
isMixedRnaAndDna
|
|
12403
|
+
} = {}) {
|
|
12404
|
+
return isProtein ? {} : isOligo ? {} : isRna ? { t: "u" } : isMixedRnaAndDna ? {} : (
|
|
12405
|
+
//just plain old dna
|
|
12406
|
+
{}
|
|
12407
|
+
);
|
|
12408
|
+
}
|
|
12409
|
+
__name(getReplaceChars, "getReplaceChars");
|
|
12340
12410
|
function tidyUpAnnotation(_annotation, {
|
|
12341
12411
|
sequenceData = {},
|
|
12342
12412
|
convertAnnotationsFromAAIndices,
|
|
@@ -12465,14 +12535,6 @@ var __name = (target, value) => __defProp(target, "name", { value, configurable:
|
|
|
12465
12535
|
}
|
|
12466
12536
|
}
|
|
12467
12537
|
__name(coerceLocation, "coerceLocation");
|
|
12468
|
-
function filterAminoAcidSequenceString(sequenceString, options) {
|
|
12469
|
-
options = options || {};
|
|
12470
|
-
if (options.includeStopCodon) {
|
|
12471
|
-
return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu.*]/gi, "");
|
|
12472
|
-
}
|
|
12473
|
-
return sequenceString == null ? void 0 : sequenceString.replace(/[^xtgalmfwkqespvicyhrndu]/gi, "");
|
|
12474
|
-
}
|
|
12475
|
-
__name(filterAminoAcidSequenceString, "filterAminoAcidSequenceString");
|
|
12476
12538
|
function getDegenerateDnaStringFromAAString(aaString) {
|
|
12477
12539
|
return aaString.split("").map((char) => aminoAcidToDegenerateDnaMap[char.toLowerCase()] || "nnn").join("");
|
|
12478
12540
|
}
|
|
@@ -12481,14 +12543,13 @@ var __name = (target, value) => __defProp(target, "name", { value, configurable:
|
|
|
12481
12543
|
const {
|
|
12482
12544
|
annotationsAsObjects,
|
|
12483
12545
|
logMessages,
|
|
12484
|
-
|
|
12546
|
+
doNotRemoveInvalidChars,
|
|
12485
12547
|
additionalValidChars,
|
|
12486
12548
|
noTranslationData,
|
|
12487
|
-
charOverrides,
|
|
12488
12549
|
doNotProvideIdsForAnnotations,
|
|
12489
|
-
proteinFilterOptions,
|
|
12490
12550
|
noCdsTranslations,
|
|
12491
|
-
convertAnnotationsFromAAIndices
|
|
12551
|
+
convertAnnotationsFromAAIndices,
|
|
12552
|
+
topLevelSeqData
|
|
12492
12553
|
} = options;
|
|
12493
12554
|
let seqData = lodashExports.cloneDeep(pSeqData);
|
|
12494
12555
|
const response = {
|
|
@@ -12516,18 +12577,15 @@ var __name = (target, value) => __defProp(target, "name", { value, configurable:
|
|
|
12516
12577
|
if (seqData.isRna) {
|
|
12517
12578
|
seqData.sequence = seqData.sequence.replace(/t/gi, "u");
|
|
12518
12579
|
}
|
|
12519
|
-
if (
|
|
12580
|
+
if (!doNotRemoveInvalidChars) {
|
|
12520
12581
|
if (seqData.isProtein) {
|
|
12521
|
-
seqData.proteinSequence
|
|
12522
|
-
|
|
12523
|
-
__spreadValues({ includeStopCodon: true }, proteinFilterOptions)
|
|
12524
|
-
);
|
|
12582
|
+
const [newSeq] = filterSequenceString(seqData.proteinSequence, __spreadValues({}, topLevelSeqData || seqData));
|
|
12583
|
+
seqData.proteinSequence = newSeq;
|
|
12525
12584
|
} else {
|
|
12526
|
-
|
|
12527
|
-
|
|
12528
|
-
|
|
12529
|
-
|
|
12530
|
-
);
|
|
12585
|
+
const [newSeq] = filterSequenceString(seqData.sequence, __spreadValues({
|
|
12586
|
+
additionalValidChars
|
|
12587
|
+
}, topLevelSeqData || seqData));
|
|
12588
|
+
seqData.sequence = newSeq;
|
|
12531
12589
|
}
|
|
12532
12590
|
}
|
|
12533
12591
|
if (seqData.isProtein) {
|
|
@@ -22667,7 +22725,6 @@ var __name = (target, value) => __defProp(target, "name", { value, configurable:
|
|
|
22667
22725
|
exports2.deleteSequenceDataAtRange = deleteSequenceDataAtRange;
|
|
22668
22726
|
exports2.doesEnzymeChopOutsideOfRecognitionSite = doesEnzymeChopOutsideOfRecognitionSite;
|
|
22669
22727
|
exports2.featureColors = featureColors;
|
|
22670
|
-
exports2.filterAminoAcidSequenceString = filterAminoAcidSequenceString;
|
|
22671
22728
|
exports2.filterSequenceString = filterSequenceString;
|
|
22672
22729
|
exports2.findNearestRangeOfSequenceOverlapToPosition = findNearestRangeOfSequenceOverlapToPosition;
|
|
22673
22730
|
exports2.findOrfsInPlasmid = findOrfsInPlasmid;
|
package/package.json
CHANGED
package/src/bioData.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
//Adapted from biopython. Check the BIOPYTHON_LICENSE for licensing info
|
|
2
2
|
|
|
3
3
|
export const protein_letters = "ACDEFGHIKLMNPQRSTVWY";
|
|
4
|
-
|
|
5
|
-
export const extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO
|
|
4
|
+
export const protein_letters_withUandX = "ACDEFGHIKLMNPQRSTVWYUX";
|
|
5
|
+
export const extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO*";
|
|
6
6
|
export const ambiguous_dna_letters = "GATCRYWSMKHBVDN";
|
|
7
7
|
export const unambiguous_dna_letters = "GATC";
|
|
8
8
|
export const ambiguous_rna_letters = "GAUCRYWSMKHBVDN";
|
|
@@ -29,7 +29,6 @@ export const ambiguous_dna_values = {
|
|
|
29
29
|
N: "GATC"
|
|
30
30
|
};
|
|
31
31
|
|
|
32
|
-
|
|
33
32
|
export const extended_protein_values = {
|
|
34
33
|
A: "A",
|
|
35
34
|
B: "ND",
|
|
@@ -1,24 +1,113 @@
|
|
|
1
|
-
|
|
1
|
+
import {
|
|
2
|
+
ambiguous_dna_letters,
|
|
3
|
+
ambiguous_rna_letters,
|
|
4
|
+
extended_protein_letters,
|
|
5
|
+
} from "./bioData";
|
|
2
6
|
|
|
3
|
-
//
|
|
4
7
|
export default function filterSequenceString(
|
|
5
8
|
sequenceString,
|
|
6
|
-
|
|
7
|
-
|
|
9
|
+
{
|
|
10
|
+
additionalValidChars = "",
|
|
11
|
+
isOligo,
|
|
12
|
+
name,
|
|
13
|
+
isProtein,
|
|
14
|
+
isRna,
|
|
15
|
+
isMixedRnaAndDna,
|
|
16
|
+
} = {}
|
|
8
17
|
) {
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
18
|
+
const acceptedChars = getAcceptedChars({
|
|
19
|
+
isOligo,
|
|
20
|
+
isProtein,
|
|
21
|
+
isRna,
|
|
22
|
+
isMixedRnaAndDna,
|
|
23
|
+
});
|
|
24
|
+
const replaceChars = getReplaceChars({
|
|
25
|
+
isOligo,
|
|
26
|
+
isProtein,
|
|
27
|
+
isRna,
|
|
28
|
+
isMixedRnaAndDna
|
|
29
|
+
});
|
|
30
|
+
|
|
31
|
+
let sanitizedVal = "";
|
|
32
|
+
const invalidChars = [];
|
|
33
|
+
const chars = `${acceptedChars}${additionalValidChars.split("").join("\\")}`;
|
|
34
|
+
const warnings = [];
|
|
35
|
+
const replaceCount = {};
|
|
36
|
+
sequenceString.split("").forEach(letter => {
|
|
37
|
+
const lowerLetter = letter.toLowerCase();
|
|
38
|
+
if (replaceChars && replaceChars[lowerLetter]) {
|
|
39
|
+
if (!replaceCount[lowerLetter]) {
|
|
40
|
+
replaceCount[lowerLetter] = 0;
|
|
41
|
+
}
|
|
42
|
+
replaceCount[lowerLetter]++;
|
|
43
|
+
const isUpper = lowerLetter !== letter;
|
|
44
|
+
sanitizedVal += isUpper
|
|
45
|
+
? replaceChars[lowerLetter].toUpperCase()
|
|
46
|
+
: replaceChars[lowerLetter];
|
|
47
|
+
} else if (chars.includes(lowerLetter)) {
|
|
48
|
+
sanitizedVal += letter;
|
|
49
|
+
} else {
|
|
50
|
+
invalidChars.push(letter);
|
|
51
|
+
}
|
|
52
|
+
});
|
|
53
|
+
//add replace count warnings
|
|
54
|
+
Object.keys(replaceCount).forEach(letter => {
|
|
55
|
+
warnings.push(
|
|
56
|
+
`Replaced "${letter}" with "${replaceChars[letter]}"${
|
|
57
|
+
replaceCount[letter] > 1 ? ` ${replaceCount[letter]} times` : ""
|
|
58
|
+
}`
|
|
59
|
+
);
|
|
60
|
+
});
|
|
61
|
+
if (sequenceString.length !== sanitizedVal.length) {
|
|
62
|
+
warnings.push(
|
|
63
|
+
`${
|
|
64
|
+
name ? `Sequence ${name}: ` : ""
|
|
65
|
+
}Invalid character(s) detected and removed: ${invalidChars
|
|
66
|
+
.slice(0, 100)
|
|
67
|
+
.join(", ")} `
|
|
20
68
|
);
|
|
21
|
-
} else {
|
|
22
|
-
return sequenceString;
|
|
23
69
|
}
|
|
70
|
+
if (typeof window !== "undefined" && window.toastr && warnings.length) {
|
|
71
|
+
warnings.forEach(warning => {
|
|
72
|
+
window.toastr.warning(warning);
|
|
73
|
+
});
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
return [sanitizedVal, warnings];
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
export function getAcceptedChars({
|
|
80
|
+
isOligo,
|
|
81
|
+
isProtein,
|
|
82
|
+
isRna,
|
|
83
|
+
isMixedRnaAndDna,
|
|
84
|
+
} = {}) {
|
|
85
|
+
return isProtein
|
|
86
|
+
? `${extended_protein_letters.toLowerCase()}}`
|
|
87
|
+
: isOligo
|
|
88
|
+
? ambiguous_rna_letters.toLowerCase() + "t"
|
|
89
|
+
: isRna
|
|
90
|
+
? ambiguous_rna_letters.toLowerCase() + "t"
|
|
91
|
+
: isMixedRnaAndDna
|
|
92
|
+
? ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase()
|
|
93
|
+
: //just plain old dna
|
|
94
|
+
ambiguous_rna_letters.toLowerCase() + ambiguous_dna_letters.toLowerCase();
|
|
95
|
+
}
|
|
96
|
+
export function getReplaceChars({
|
|
97
|
+
isOligo,
|
|
98
|
+
isProtein,
|
|
99
|
+
isRna,
|
|
100
|
+
isMixedRnaAndDna
|
|
101
|
+
} = {}) {
|
|
102
|
+
return isProtein
|
|
103
|
+
? {}
|
|
104
|
+
// {".": "*"}
|
|
105
|
+
: isOligo
|
|
106
|
+
? {}
|
|
107
|
+
: isRna
|
|
108
|
+
? { t: "u" }
|
|
109
|
+
: isMixedRnaAndDna
|
|
110
|
+
? {}
|
|
111
|
+
: //just plain old dna
|
|
112
|
+
{};
|
|
24
113
|
}
|
|
@@ -1,13 +1,70 @@
|
|
|
1
1
|
import filterSequenceString from "./filterSequenceString";
|
|
2
|
+
import { expect } from "vitest";
|
|
2
3
|
|
|
3
4
|
describe("filterSequenceString", () => {
|
|
5
|
+
it("should not filter u's and should convert t's to u's from isOligo=true seqs", () => {
|
|
6
|
+
const [str, warnings] = filterSequenceString("tatuuag--a", {
|
|
7
|
+
isOligo: true
|
|
8
|
+
});
|
|
9
|
+
expect(str).toBe("tatuuaga");
|
|
10
|
+
// expect(warnings[0]).toBe('Replaced "t" with "u" 2 times');
|
|
11
|
+
expect(warnings[0]).toBe(
|
|
12
|
+
"Invalid character(s) detected and removed: -, - "
|
|
13
|
+
);
|
|
14
|
+
});
|
|
15
|
+
it("should not convert u's to t's for isDna (default isDna=true) seqs", () => {
|
|
16
|
+
const [str, warnings] = filterSequenceString("tatuuag--a", {});
|
|
17
|
+
// expect(warnings[0]).toBe('Replaced "u" with "t" 2 times');
|
|
18
|
+
expect(warnings[0]).toBe(
|
|
19
|
+
"Invalid character(s) detected and removed: -, - "
|
|
20
|
+
);
|
|
21
|
+
expect(str).toBe("tatuuaga");
|
|
22
|
+
});
|
|
4
23
|
it("should filter out unwanted chars", () => {
|
|
5
|
-
|
|
24
|
+
const [str, warnings] = filterSequenceString("tatag--a");
|
|
25
|
+
expect(warnings[0]).toBe(
|
|
26
|
+
"Invalid character(s) detected and removed: -, - "
|
|
27
|
+
);
|
|
28
|
+
expect(str).toBe("tataga");
|
|
6
29
|
});
|
|
7
30
|
it("should handle additional chars option", () => {
|
|
8
|
-
|
|
31
|
+
const [str, warnings] = filterSequenceString("tatag--a", {
|
|
32
|
+
additionalValidChars: "-"
|
|
33
|
+
});
|
|
34
|
+
expect(warnings.length).toBe(0);
|
|
35
|
+
expect(str).toBe("tatag--a");
|
|
9
36
|
});
|
|
10
37
|
it("should handle additional chars option", () => {
|
|
11
|
-
|
|
38
|
+
const [str, warnings] = filterSequenceString("tatag--a", {
|
|
39
|
+
additionalValidChars: "f-q"
|
|
40
|
+
});
|
|
41
|
+
expect(warnings.length).toBe(0);
|
|
42
|
+
expect(str).toBe("tatag--a");
|
|
43
|
+
});
|
|
44
|
+
|
|
45
|
+
it("when isProtein: true, should filter only valid amino acids by default", () => {
|
|
46
|
+
const [str, warnings] = filterSequenceString(
|
|
47
|
+
'bbb342"""xtgalmfwkqespvicyhrnd,,../',
|
|
48
|
+
{
|
|
49
|
+
isProtein: true
|
|
50
|
+
}
|
|
51
|
+
);
|
|
52
|
+
// expect(warnings[0]).toBe(`Replaced "." with "*" 2 times`);
|
|
53
|
+
expect(warnings[0]).toBe( 'Invalid character(s) detected and removed: 3, 4, 2, ", ", ", ,, ,, ., ., / ');
|
|
54
|
+
expect(str).toBe("bbbxtgalmfwkqespvicyhrnd");
|
|
55
|
+
});
|
|
56
|
+
it("when isProtein: true, should handle upper case letters", () => {
|
|
57
|
+
const [str, warnings] = filterSequenceString("xtgalmfWKQEspvicyhrnd", {
|
|
58
|
+
isProtein: true
|
|
59
|
+
});
|
|
60
|
+
expect(warnings.length).toBe(0);
|
|
61
|
+
expect(str).toBe("xtgalmfWKQEspvicyhrnd");
|
|
62
|
+
});
|
|
63
|
+
it("when isProtein: true, it should convert . to *", () => {
|
|
64
|
+
const [str] = filterSequenceString('BXZJUO*bbb342"""xtgalbmfwkqespvicyhrnd,,../', {
|
|
65
|
+
isProtein: true,
|
|
66
|
+
});
|
|
67
|
+
|
|
68
|
+
expect(str).toBe("BXZJUO*bbbxtgalbmfwkqespvicyhrnd");
|
|
12
69
|
});
|
|
13
70
|
});
|
package/src/index.js
CHANGED
|
@@ -51,7 +51,6 @@ export { default as aliasedEnzymesByName } from "./aliasedEnzymesByName";
|
|
|
51
51
|
export { default as defaultEnzymesByName } from "./defaultEnzymesByName";
|
|
52
52
|
export { default as generateSequenceData } from "./generateSequenceData";
|
|
53
53
|
export { default as generateAnnotations } from "./generateAnnotations";
|
|
54
|
-
export { default as filterAminoAcidSequenceString } from "./filterAminoAcidSequenceString";
|
|
55
54
|
export { default as filterSequenceString } from "./filterSequenceString";
|
|
56
55
|
export { default as findNearestRangeOfSequenceOverlapToPosition } from "./findNearestRangeOfSequenceOverlapToPosition";
|
|
57
56
|
export { default as findOrfsInPlasmid } from "./findOrfsInPlasmid";
|
package/src/proteinAlphabet.js
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
|
|
2
1
|
const proteinAlphabet = {
|
|
3
2
|
A: {
|
|
4
3
|
value: "A",
|
|
@@ -18,7 +17,7 @@ const proteinAlphabet = {
|
|
|
18
17
|
color: "hsl(258.1, 100%, 69%)",
|
|
19
18
|
mass: 156.18568
|
|
20
19
|
},
|
|
21
|
-
|
|
20
|
+
|
|
22
21
|
N: {
|
|
23
22
|
value: "N",
|
|
24
23
|
name: "Asparagine",
|
|
@@ -46,7 +45,7 @@ const proteinAlphabet = {
|
|
|
46
45
|
color: "hsl(335.1, 100%, 69%)",
|
|
47
46
|
mass: 103.1429
|
|
48
47
|
},
|
|
49
|
-
|
|
48
|
+
|
|
50
49
|
E: {
|
|
51
50
|
value: "E",
|
|
52
51
|
name: "Glutamic acid",
|
|
@@ -75,7 +74,6 @@ const proteinAlphabet = {
|
|
|
75
74
|
mass: 57.05132
|
|
76
75
|
},
|
|
77
76
|
|
|
78
|
-
|
|
79
77
|
H: {
|
|
80
78
|
value: "H",
|
|
81
79
|
name: "Histidine",
|
|
@@ -114,7 +112,6 @@ const proteinAlphabet = {
|
|
|
114
112
|
mass: 128.17228
|
|
115
113
|
},
|
|
116
114
|
|
|
117
|
-
|
|
118
115
|
M: {
|
|
119
116
|
value: "M",
|
|
120
117
|
name: "Methionine",
|
|
@@ -6,7 +6,6 @@ import { cloneDeep, flatMap } from "lodash";
|
|
|
6
6
|
import { annotationTypes } from "./annotationTypes";
|
|
7
7
|
import filterSequenceString from "./filterSequenceString";
|
|
8
8
|
import tidyUpAnnotation from "./tidyUpAnnotation";
|
|
9
|
-
import filterAminoAcidSequenceString from "./filterAminoAcidSequenceString";
|
|
10
9
|
import getDegenerateDnaStringFromAaString from "./getDegenerateDnaStringFromAAString";
|
|
11
10
|
import { getFeatureTypes } from "./featureTypesAndColors";
|
|
12
11
|
|
|
@@ -14,14 +13,13 @@ export default function tidyUpSequenceData(pSeqData, options = {}) {
|
|
|
14
13
|
const {
|
|
15
14
|
annotationsAsObjects,
|
|
16
15
|
logMessages,
|
|
17
|
-
|
|
16
|
+
doNotRemoveInvalidChars,
|
|
18
17
|
additionalValidChars,
|
|
19
18
|
noTranslationData,
|
|
20
|
-
charOverrides,
|
|
21
19
|
doNotProvideIdsForAnnotations,
|
|
22
|
-
proteinFilterOptions,
|
|
23
20
|
noCdsTranslations,
|
|
24
|
-
convertAnnotationsFromAAIndices
|
|
21
|
+
convertAnnotationsFromAAIndices,
|
|
22
|
+
topLevelSeqData
|
|
25
23
|
} = options;
|
|
26
24
|
let seqData = cloneDeep(pSeqData); //sequence is usually immutable, so we clone it and return it
|
|
27
25
|
const response = {
|
|
@@ -54,20 +52,18 @@ export default function tidyUpSequenceData(pSeqData, options = {}) {
|
|
|
54
52
|
//flip all t's to u's
|
|
55
53
|
seqData.sequence = seqData.sequence.replace(/t/gi, "u");
|
|
56
54
|
}
|
|
57
|
-
if (
|
|
55
|
+
if (!doNotRemoveInvalidChars) {
|
|
58
56
|
if (seqData.isProtein) {
|
|
59
|
-
seqData.proteinSequence
|
|
60
|
-
seqData
|
|
61
|
-
|
|
62
|
-
|
|
57
|
+
const [newSeq] = filterSequenceString(seqData.proteinSequence, {
|
|
58
|
+
...(topLevelSeqData || seqData)
|
|
59
|
+
});
|
|
60
|
+
seqData.proteinSequence = newSeq;
|
|
63
61
|
} else {
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
charOverrides
|
|
70
|
-
);
|
|
62
|
+
const [newSeq] = filterSequenceString(seqData.sequence, {
|
|
63
|
+
additionalValidChars,
|
|
64
|
+
...(topLevelSeqData || seqData)
|
|
65
|
+
});
|
|
66
|
+
seqData.sequence = newSeq;
|
|
71
67
|
}
|
|
72
68
|
}
|
|
73
69
|
if (seqData.isProtein) {
|
|
@@ -5,7 +5,7 @@ import chaiSubset from "chai-subset";
|
|
|
5
5
|
chai.use(chaiSubset);
|
|
6
6
|
chai.should();
|
|
7
7
|
describe("tidyUpSequenceData", () => {
|
|
8
|
-
it("should remove
|
|
8
|
+
it("should remove invalid chars by default, while handling annotation start,end (and location start,end) truncation correctly", () => {
|
|
9
9
|
const res = tidyUpSequenceData(
|
|
10
10
|
{
|
|
11
11
|
sequence: "http://localhost:3344/Standalone",
|
|
@@ -26,7 +26,7 @@ describe("tidyUpSequenceData", () => {
|
|
|
26
26
|
}
|
|
27
27
|
]
|
|
28
28
|
},
|
|
29
|
-
|
|
29
|
+
|
|
30
30
|
);
|
|
31
31
|
res.should.containSubset({
|
|
32
32
|
sequence: "httcahstStandan",
|
|
@@ -49,15 +49,6 @@ describe("tidyUpSequenceData", () => {
|
|
|
49
49
|
]
|
|
50
50
|
});
|
|
51
51
|
});
|
|
52
|
-
// const res = tidyUpSequenceData(
|
|
53
|
-
// {
|
|
54
|
-
// isProtein: true,
|
|
55
|
-
// circular: true,
|
|
56
|
-
// proteinSequence: "gagiuhwgagalasjglj*.",
|
|
57
|
-
// features: [{ start: 3, end: 10 }, { start: 10, end: 20 }]
|
|
58
|
-
// },
|
|
59
|
-
// { convertAnnotationsFromAAIndices: true, removeUnwantedChars: true }
|
|
60
|
-
// );
|
|
61
52
|
|
|
62
53
|
it("should handle a protein sequence being passed in with isProtein set to true", () => {
|
|
63
54
|
const res = tidyUpSequenceData(
|
|
@@ -71,69 +62,24 @@ describe("tidyUpSequenceData", () => {
|
|
|
71
62
|
{ name: "iDon'tFit", start: 25, end: 35 }
|
|
72
63
|
]
|
|
73
64
|
},
|
|
74
|
-
{ convertAnnotationsFromAAIndices: true
|
|
65
|
+
{ convertAnnotationsFromAAIndices: true }
|
|
75
66
|
);
|
|
67
|
+
|
|
76
68
|
res.should.containSubset({
|
|
77
|
-
aminoAcidDataForEachBaseOfDNA: [
|
|
78
|
-
{
|
|
79
|
-
aminoAcid: {
|
|
80
|
-
value: ".",
|
|
81
|
-
name: "Gap",
|
|
82
|
-
threeLettersName: "Gap"
|
|
83
|
-
},
|
|
84
|
-
positionInCodon: 0,
|
|
85
|
-
aminoAcidIndex: 17,
|
|
86
|
-
sequenceIndex: 51,
|
|
87
|
-
codonRange: {
|
|
88
|
-
start: 51,
|
|
89
|
-
end: 53
|
|
90
|
-
},
|
|
91
|
-
fullCodon: true
|
|
92
|
-
},
|
|
93
|
-
{
|
|
94
|
-
aminoAcid: {
|
|
95
|
-
value: ".",
|
|
96
|
-
name: "Gap",
|
|
97
|
-
threeLettersName: "Gap"
|
|
98
|
-
},
|
|
99
|
-
positionInCodon: 1,
|
|
100
|
-
aminoAcidIndex: 17,
|
|
101
|
-
sequenceIndex: 52,
|
|
102
|
-
codonRange: {
|
|
103
|
-
start: 51,
|
|
104
|
-
end: 53
|
|
105
|
-
},
|
|
106
|
-
fullCodon: true
|
|
107
|
-
},
|
|
108
|
-
{
|
|
109
|
-
aminoAcid: {
|
|
110
|
-
value: ".",
|
|
111
|
-
name: "Gap",
|
|
112
|
-
threeLettersName: "Gap"
|
|
113
|
-
},
|
|
114
|
-
positionInCodon: 2,
|
|
115
|
-
aminoAcidIndex: 17,
|
|
116
|
-
sequenceIndex: 53,
|
|
117
|
-
codonRange: {
|
|
118
|
-
start: 51,
|
|
119
|
-
end: 53
|
|
120
|
-
},
|
|
121
|
-
fullCodon: true
|
|
122
|
-
}
|
|
123
|
-
],
|
|
69
|
+
aminoAcidDataForEachBaseOfDNA: [],
|
|
124
70
|
isProtein: true,
|
|
125
|
-
size:
|
|
126
|
-
proteinSize:
|
|
127
|
-
sequence: "
|
|
128
|
-
proteinSequence: "
|
|
71
|
+
size: 57, //size should refer to the DNA length
|
|
72
|
+
proteinSize: 19, //proteinSize should refer to the amino acid length
|
|
73
|
+
sequence: "ggngcnggnathtgacaytggggngcnggngcnytngcnwsnhtnggnytnhtntrr", //degenerate sequence
|
|
74
|
+
proteinSequence: "gagiuhwgagalasjglj*",
|
|
129
75
|
circular: false,
|
|
130
76
|
features: [
|
|
131
77
|
{ start: 9, end: 32, forward: true },
|
|
132
|
-
{ start: 30, end:
|
|
78
|
+
{ start: 30, end: 56, forward: true },
|
|
133
79
|
{
|
|
134
80
|
name: "iDon'tFit",
|
|
135
|
-
start:
|
|
136
|
-
end:
|
|
81
|
+
start: 54,
|
|
82
|
+
end: 56,
|
|
137
83
|
forward: true
|
|
138
84
|
}
|
|
139
85
|
]
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
export default function filterAminoAcidSequenceString(sequenceString: any, options: any): any;
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
export {};
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
//
|
|
2
|
-
export default function filterAminoAcidSequenceString(sequenceString, options) {
|
|
3
|
-
options = options || {};
|
|
4
|
-
if (options.includeStopCodon) {
|
|
5
|
-
//tnrtodo this maybe needs the stop codon char in it?
|
|
6
|
-
return sequenceString?.replace(/[^xtgalmfwkqespvicyhrndu.*]/gi, "");
|
|
7
|
-
}
|
|
8
|
-
// ac.throw(ac.string, sequenceString);
|
|
9
|
-
return sequenceString?.replace(/[^xtgalmfwkqespvicyhrndu]/gi, "");
|
|
10
|
-
}
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
import assert from "assert";
|
|
2
|
-
import filterAminoAcidSequenceString from "./filterAminoAcidSequenceString";
|
|
3
|
-
describe("filterAminoAcidSequenceString", () => {
|
|
4
|
-
it("should filter only valid amino acids by default", () => {
|
|
5
|
-
const filteredString = filterAminoAcidSequenceString(
|
|
6
|
-
'bbb342"""xtgalmfwkqespvicyhrnd,,../'
|
|
7
|
-
);
|
|
8
|
-
assert.equal(filteredString, "xtgalmfwkqespvicyhrnd");
|
|
9
|
-
});
|
|
10
|
-
it("should handle upper case letters", () => {
|
|
11
|
-
const filteredString = filterAminoAcidSequenceString(
|
|
12
|
-
"xtgalmfWKQEspvicyhrnd"
|
|
13
|
-
);
|
|
14
|
-
assert.equal(filteredString, "xtgalmfWKQEspvicyhrnd");
|
|
15
|
-
});
|
|
16
|
-
it("should handle the option to includeStopCodon by allowing periods", () => {
|
|
17
|
-
const options = { includeStopCodon: true };
|
|
18
|
-
const filteredString = filterAminoAcidSequenceString(
|
|
19
|
-
'bbb342"""xtgalmfwkqespvicyhrnd,,../',
|
|
20
|
-
options
|
|
21
|
-
);
|
|
22
|
-
assert.equal(filteredString, "xtgalmfwkqespvicyhrnd..");
|
|
23
|
-
});
|
|
24
|
-
});
|