@teselagen/sequence-utils 0.1.22 → 0.1.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +12030 -26126
- package/index.mjs +12119 -26124
- package/index.umd.js +24056 -38154
- package/package.json +4 -3
- package/src/DNAComplementMap.js +32 -0
- package/src/addGapsToSeqReads.js +417 -0
- package/src/addGapsToSeqReads.test.js +358 -0
- package/src/adjustAnnotationsToInsert.js +19 -0
- package/src/adjustBpsToReplaceOrInsert.js +50 -0
- package/src/adjustBpsToReplaceOrInsert.test.js +59 -0
- package/src/aliasedEnzymesByName.js +7363 -0
- package/src/aminoAcidToDegenerateDnaMap.js +32 -0
- package/src/aminoAcidToDegenerateRnaMap.js +32 -0
- package/src/aminoAcidToDnaRna.test.js +27 -0
- package/src/annotateSingleSeq.js +29 -0
- package/src/annotateSingleSeq.test.js +64 -0
- package/src/annotationTypes.js +23 -0
- package/src/autoAnnotate.js +242 -0
- package/src/autoAnnotate.test.js +1039 -0
- package/src/bioData.js +431 -0
- package/src/calculateNebTa.js +34 -0
- package/src/calculateNebTa.test.js +57 -0
- package/src/calculateNebTm.js +127 -0
- package/src/calculateNebTm.test.js +32 -0
- package/src/calculatePercentGC.js +3 -0
- package/src/calculatePercentGC.test.js +14 -0
- package/src/calculateTm.js +297 -0
- package/src/calculateTm.test.js +7 -0
- package/src/computeDigestFragments.js +179 -0
- package/src/computeDigestFragments.test.js +73 -0
- package/src/condensePairwiseAlignmentDifferences.js +85 -0
- package/src/condensePairwiseAlignmentDifferences.test.js +66 -0
- package/src/convertAACaretPositionOrRangeToDna.js +24 -0
- package/src/convertAACaretPositionOrRangeToDna.test.js +34 -0
- package/src/convertDnaCaretPositionOrRangeToAA.js +24 -0
- package/src/convertDnaCaretPositionOrRangeToAA.test.js +37 -0
- package/src/cutSequenceByRestrictionEnzyme.js +301 -0
- package/src/cutSequenceByRestrictionEnzyme.test.js +296 -0
- package/src/defaultEnzymesByName.js +278 -0
- package/src/degenerateDnaToAminoAcidMap.js +5 -0
- package/src/degenerateRnaToAminoAcidMap.js +5 -0
- package/src/deleteSequenceDataAtRange.js +5 -0
- package/src/deleteSequenceDataAtRange.test.js +146 -0
- package/src/diffUtils.js +64 -0
- package/src/diffUtils.test.js +74 -0
- package/src/doesEnzymeChopOutsideOfRecognitionSite.js +10 -0
- package/src/doesEnzymeChopOutsideOfRecognitionSite.test.js +41 -0
- package/src/featureTypesAndColors.js +152 -0
- package/src/featureTypesAndColors.test.js +52 -0
- package/src/filterAminoAcidSequenceString.js +13 -0
- package/src/filterAminoAcidSequenceString.test.js +22 -0
- package/src/filterSequenceString.js +22 -0
- package/src/filterSequenceString.test.js +13 -0
- package/src/findNearestRangeOfSequenceOverlapToPosition.js +39 -0
- package/src/findNearestRangeOfSequenceOverlapToPosition.test.js +31 -0
- package/src/findOrfsInPlasmid.js +26 -0
- package/src/findSequenceMatches.js +133 -0
- package/src/findSequenceMatches.test.js +286 -0
- package/src/generateAnnotations.js +34 -0
- package/src/generateSequenceData.js +206 -0
- package/src/generateSequenceData.test.js +22 -0
- package/src/getAllInsertionsInSeqReads.js +83 -0
- package/src/getAllInsertionsInSeqReads.test.js +26 -0
- package/src/getAminoAcidDataForEachBaseOfDna.js +163 -0
- package/src/getAminoAcidDataForEachBaseOfDna.test.js +424 -0
- package/src/getAminoAcidFromSequenceTriplet.js +22 -0
- package/src/getAminoAcidStringFromSequenceString.js +18 -0
- package/src/getAminoAcidStringFromSequenceString.test.js +18 -0
- package/src/getCodonRangeForAASliver.js +63 -0
- package/src/getComplementAminoAcidStringFromSequenceString.js +11 -0
- package/src/getComplementSequenceAndAnnotations.js +20 -0
- package/src/getComplementSequenceString.js +19 -0
- package/src/getComplementSequenceString.test.js +13 -0
- package/src/getCutsiteType.js +10 -0
- package/src/getCutsitesFromSequence.js +17 -0
- package/src/getDegenerateDnaStringFromAAString.js +8 -0
- package/src/getDegenerateRnaStringFromAAString.js +8 -0
- package/src/getDigestFragmentsForCutsites.js +105 -0
- package/src/getDigestFragmentsForRestrictionEnzymes.js +27 -0
- package/src/getDigestFragmentsForRestrictionEnzymes.test.js +228 -0
- package/src/getInsertBetweenVals.js +28 -0
- package/src/getInsertBetweenVals.test.js +33 -0
- package/src/getLeftAndRightOfSequenceInRangeGivenPosition.js +39 -0
- package/src/getLeftAndRightOfSequenceInRangeGivenPosition.test.js +80 -0
- package/src/getMassOfAaString.js +24 -0
- package/src/getMassofAaString.test.js +18 -0
- package/src/getOrfsFromSequence.js +124 -0
- package/src/getOrfsFromSequence.test.js +210 -0
- package/src/getOverlapBetweenTwoSequences.js +30 -0
- package/src/getOverlapBetweenTwoSequences.test.js +23 -0
- package/src/getPossiblePartsFromSequenceAndEnzymes.js +121 -0
- package/src/getPossiblePartsFromSequenceAndEnzymes.test.js +208 -0
- package/src/getReverseAminoAcidStringFromSequenceString.js +20 -0
- package/src/getReverseAminoAcidStringFromSequenceString.test.js +11 -0
- package/src/getReverseComplementAminoAcidStringFromSequenceString.js +7 -0
- package/src/getReverseComplementAnnotation.js +23 -0
- package/src/getReverseComplementAnnotation.test.js +44 -0
- package/src/getReverseComplementSequenceAndAnnotations.js +38 -0
- package/src/getReverseComplementSequenceAndAnnotations.test.js +105 -0
- package/src/getReverseComplementSequenceString.js +17 -0
- package/src/getReverseComplementSequenceString.test.js +11 -0
- package/src/getReverseSequenceString.js +12 -0
- package/src/getReverseSequenceString.test.js +9 -0
- package/src/getSequenceDataBetweenRange.js +131 -0
- package/src/getSequenceDataBetweenRange.test.js +474 -0
- package/src/getVirtualDigest.js +125 -0
- package/src/getVirtualDigest.test.js +134 -0
- package/src/guessIfSequenceIsDnaAndNotProtein.js +33 -0
- package/src/guessIfSequenceIsDnaAndNotProtein.test.js +34 -0
- package/src/index.js +106 -0
- package/src/index.test.js +38 -0
- package/src/insertGapsIntoRefSeq.js +38 -0
- package/src/insertGapsIntoRefSeq.test.js +20 -0
- package/src/insertSequenceDataAtPosition.js +2 -0
- package/src/insertSequenceDataAtPosition.test.js +75 -0
- package/src/insertSequenceDataAtPositionOrRange.js +249 -0
- package/src/insertSequenceDataAtPositionOrRange.test.js +547 -0
- package/src/isEnzymeType2S.js +3 -0
- package/src/mapAnnotationsToRows.js +174 -0
- package/src/mapAnnotationsToRows.test.js +425 -0
- package/src/prepareCircularViewData.js +17 -0
- package/src/prepareCircularViewData.test.js +196 -0
- package/src/prepareRowData.js +41 -0
- package/src/prepareRowData.test.js +36 -0
- package/src/prepareRowData_output1.json +391 -0
- package/src/proteinAlphabet.js +257 -0
- package/src/rotateBpsToPosition.js +13 -0
- package/src/rotateBpsToPosition.test.js +6 -0
- package/src/rotateSequenceDataToPosition.js +48 -0
- package/src/rotateSequenceDataToPosition.test.js +71 -0
- package/src/shiftAnnotationsByLen.js +17 -0
- package/src/threeLetterSequenceStringToAminoAcidMap.js +106 -0
- package/src/tidyUpAnnotation.js +182 -0
- package/src/tidyUpSequenceData.js +169 -0
- package/src/tidyUpSequenceData.test.js +332 -0
package/package.json
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@teselagen/sequence-utils",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.24",
|
|
4
4
|
"type": "commonjs",
|
|
5
5
|
"dependencies": {
|
|
6
|
-
"@teselagen/range-utils": "0.1.
|
|
6
|
+
"@teselagen/range-utils": "0.1.23",
|
|
7
7
|
"bson-objectid": "^2.0.4",
|
|
8
8
|
"escape-string-regexp": "^5.0.0",
|
|
9
9
|
"jsondiffpatch-rc": "0.4.2",
|
|
10
10
|
"string-splice": "^1.3.0"
|
|
11
|
-
}
|
|
11
|
+
},
|
|
12
|
+
"license": "MIT"
|
|
12
13
|
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
 * Lookup table from a nucleotide code to the code of its complement.
 *
 * Both lower- and upper-case codes are present and the case of the input is
 * preserved in the output. "u"/"U" map to "a"/"A". The IUPAC ambiguity codes
 * are paired with the code that covers the complementary set of bases
 * (r<->y, d<->h, k<->m, v<->b); n, s and w cover sets that are closed under
 * complementation, so they map to themselves. "." maps to itself.
 */
const DNAComplementMap = {
  ".": ".",
  a: "t",
  t: "a",
  u: "a",
  c: "g",
  g: "c",
  A: "T",
  T: "A",
  U: "A",
  C: "G",
  G: "C",
  r: "y",
  R: "Y",
  y: "r",
  Y: "R",
  d: "h",
  D: "H",
  h: "d",
  H: "D",
  k: "m",
  K: "M",
  m: "k",
  M: "K",
  v: "b",
  V: "B",
  b: "v",
  B: "V",
  // self-complementary IUPAC codes (appended so existing key order is kept)
  n: "n",
  N: "N",
  s: "s",
  S: "S",
  w: "w",
  W: "W"
};

export default DNAComplementMap;
|
|
@@ -0,0 +1,417 @@
|
|
|
1
|
+
import insertGapsIntoRefSeq from "./insertGapsIntoRefSeq.js";
|
|
2
|
+
|
|
3
|
+
import {cloneDeep} from "lodash";
|
|
4
|
+
|
|
5
|
+
// bam.seq: NTGTAAGTCGTGAAAAAANCNNNCATATTNCGGAGGTAAAAATGAAAA...
|
|
6
|
+
// bam.pos: 43
|
|
7
|
+
// bam.cigar: 36M2D917M3I17M7I2M1I6M5I4M1D6M12I8M
|
|
8
|
+
// (note: bam.cigar is null if the sequencing read is unaligned)
|
|
9
|
+
// bam.reversed: true (if reversed)
|
|
10
|
+
|
|
11
|
+
// refSeq should be an object { name, sequence }
|
|
12
|
+
// seqReads should be an array of objects [{name, seq, pos, cigar}, {name, seq, pos, cigar}, ...]
|
|
13
|
+
// add gaps into sequencing reads before starting bp pos and from own deletions & all seq reads' insertions, minus own insertions
|
|
14
|
+
/**
 * Pads a reference sequence and its aligned sequencing reads with "-" gaps so
 * that all returned sequences have the same length and line up column by
 * column.
 *
 * @param {{name: string, sequence: string}} refSeq - reference sequence.
 * @param {Array<{name: string, seq: string, pos: number, cigar: ?string, reversed: boolean}>} seqReads
 *   reads as parsed from a BAM/SAM record: `pos` is the 1-based position at
 *   which the read starts aligning and `cigar` is built from S, M, D and I
 *   components. Reads whose `cigar` is null/undefined (unaligned) are REMOVED
 *   FROM THIS ARRAY IN PLACE, as the function has always done.
 * @returns {Array<{name: string, sequence: string, reversed: (boolean|undefined), cigar: (string|undefined)}>}
 *   the gapped reference first (upper-cased; no `reversed`/`cigar` keys),
 *   followed by one entry per aligned read in input order.
 */
export default function addGapsToSeqReads(refSeq, seqReads) {
  // Remove unaligned reads. Walk backwards so that a removal never shifts an
  // element that has not been inspected yet (a forward loop skips the second
  // of two consecutive unaligned reads).
  for (let i = seqReads.length - 1; i >= 0; i--) {
    const cigar = seqReads[i].cigar;
    if (cigar === null || cigar === undefined) {
      seqReads.splice(i, 1);
    }
  }

  const refSeqWithGaps = insertGapsIntoRefSeq(refSeq.sequence, seqReads);
  // first object is reference sequence with gaps, to be followed by seq reads with gaps
  const seqReadsWithGaps = [
    { name: refSeq.name, sequence: refSeqWithGaps.toUpperCase() }
  ];

  seqReads.forEach(seqRead => {
    seqReadsWithGaps.push({
      name: seqRead.name,
      sequence: addGapsToSingleSeqRead(
        seqRead,
        seqReads,
        refSeqWithGaps.length
      ),
      reversed: seqRead.reversed,
      cigar: seqRead.cigar
    });
  });

  // 7) shift everything right when soft-clipped reads start before the ref seq
  addGapsForReadsStartingBeforeRefSeq(seqReads, seqReadsWithGaps);

  // 8) make ref seq & seq reads all the same length
  padSequencesToSameLength(seqReadsWithGaps);

  // e.g. [{ name: "ref seq", sequence: "GG---GA--GA-C--A---CC---"}, { name: "r1", sequence: "-----GATTGA-C-----------"}...]
  return seqReadsWithGaps;
}

/** Comparator: ascending by `bpPos`. */
function compareByBpPos(a, b) {
  return a.bpPos - b.bpPos;
}

/** Splits a cigar string into its components, e.g. "5S2M3I" -> ["5S", "2M", "3I"]. */
function splitCigarIntoComponents(cigar) {
  return cigar.match(/([0-9]*[SMDI])/g);
}

/** Returns the numeric part of a cigar component, e.g. "36M" -> 36. */
function getComponentCount(component) {
  return Number(component.slice(0, -1));
}

/**
 * Returns the read's start position, moved left by the number of soft-clipped
 * bases when the cigar starts with an S component. `pos` is coerced to a
 * number so a numeric string cannot turn later additions into concatenation.
 */
function getAdjustedSeqReadPos(pos, cigarComponents) {
  if (cigarComponents[0].slice(-1) === "S") {
    return Number(pos) - getComponentCount(cigarComponents[0]);
  }
  return Number(pos);
}

/**
 * Sums the counts of the cigar components before index `endIndex`, optionally
 * leaving out insertion ("I") components.
 */
function sumCountsBefore(cigarComponents, endIndex, skipInsertions) {
  let total = 0;
  for (let i = 0; i < endIndex; i++) {
    if (skipInsertions && cigarComponents[i].slice(-1) === "I") {
      continue;
    }
    total += getComponentCount(cigarComponents[i]);
  }
  return total;
}

/**
 * Collects `{ bpPos, number }` for every insertion of every read. Positions
 * do not count earlier insertions of the same read. Fresh objects are
 * returned on every call because the per-read processing mutates them.
 */
function collectInsertionsOfAllSeqReads(seqReads) {
  const allInsertions = [];
  seqReads.forEach(seqRead => {
    const cigarComponents = splitCigarIntoComponents(seqRead.cigar);
    const adjustedSeqReadPos = getAdjustedSeqReadPos(
      seqRead.pos,
      cigarComponents
    );
    cigarComponents.forEach((component, componentI) => {
      if (component.slice(-1) !== "I") {
        return;
      }
      allInsertions.push({
        // keeping bpPos 1-based
        bpPos:
          adjustedSeqReadPos + sumCountsBefore(cigarComponents, componentI, true),
        number: getComponentCount(component)
      });
    });
  });
  return allInsertions;
}

/**
 * For each own insertion, finds the entry of `otherInsertions` at the same
 * `bpPos` and either reduces its count by the own count (when it is larger)
 * or removes it. Mutates `otherInsertions` and its entries.
 */
function subtractOwnInsertions(otherInsertions, ownInsertions) {
  ownInsertions.forEach(own => {
    const index = otherInsertions.findIndex(e => e.bpPos === own.bpPos);
    if (index === -1) {
      return;
    }
    if (otherInsertions[index].number > own.number) {
      otherInsertions[index].number -= own.number;
    } else {
      otherInsertions.splice(index, 1);
    }
  });
}

/**
 * Builds the gapped sequence string for one read (steps 1-6): leading gaps up
 * to its start position, gaps for its own deletions, gaps for insertions that
 * other reads have, its own inserted bases, and trailing gaps up to
 * `refSeqWithGapsLength`.
 */
function addGapsToSingleSeqRead(seqRead, seqReads, refSeqWithGapsLength) {
  const allInsertionsInSeqReads = collectInsertionsOfAllSeqReads(seqReads);
  const cigarComponents = splitCigarIntoComponents(seqRead.cigar);
  const adjustedSeqReadPos = getAdjustedSeqReadPos(
    seqRead.pos,
    cigarComponents
  );

  // 1) add gaps before starting bp pos
  let eachSeqReadWithGaps = seqRead.seq.split("");
  if (adjustedSeqReadPos > 0) {
    eachSeqReadWithGaps.unshift("-".repeat(adjustedSeqReadPos - 1));
  }
  eachSeqReadWithGaps = eachSeqReadWithGaps.join("").split("");

  // 2) add own deletions to own sequence
  const ownDeletions = [];
  cigarComponents.forEach((component, componentI) => {
    if (component.slice(-1) === "D") {
      ownDeletions.push({
        // keeping bpPos 1-based
        bpPos:
          adjustedSeqReadPos +
          sumCountsBefore(cigarComponents, componentI, false),
        number: getComponentCount(component)
      });
    }
  });
  ownDeletions.sort(compareByBpPos);
  for (const deletion of ownDeletions) {
    eachSeqReadWithGaps.splice(
      deletion.bpPos - 1,
      0,
      "-".repeat(deletion.number)
    );
    // re-split so every gap character is its own array element again
    eachSeqReadWithGaps = eachSeqReadWithGaps.join("").split("");
  }

  // 3) remove own insertions from own sequence
  // ownInsertions and ownInsertionsBp hold separate objects on purpose: the
  // positions of the former are shifted below, the latter keep the inserted
  // bases and are adjusted independently further down.
  const ownInsertions = [];
  const ownInsertionsBp = [];
  cigarComponents.forEach((component, componentI) => {
    if (component.slice(-1) !== "I") {
      return;
    }
    // here earlier insertions DO count: the position indexes the read itself
    const bpPosOfInsertion =
      adjustedSeqReadPos + sumCountsBefore(cigarComponents, componentI, false);
    const numberOfInsertions = getComponentCount(component);
    const nucleotides = [];
    for (let nucI = 0; nucI < numberOfInsertions; nucI++) {
      nucleotides.push(eachSeqReadWithGaps[bpPosOfInsertion - 1 + nucI]);
    }
    // keeping bpPos 1-based
    ownInsertions.push({ bpPos: bpPosOfInsertion, number: numberOfInsertions });
    ownInsertionsBp.push({
      bpPos: bpPosOfInsertion,
      number: numberOfInsertions,
      nucleotides: nucleotides
    });
  });
  // snapshot of the positions before they are shifted by the removal below
  const ownInsertionsCompare = JSON.parse(JSON.stringify(ownInsertions));
  const sortedOwnInsertions = ownInsertions.sort(compareByBpPos);
  const sortedOwnInsertionsBp = ownInsertionsBp.sort(compareByBpPos);
  for (let ownI = 0; ownI < sortedOwnInsertions.length; ownI++) {
    const bpPosOfInsertion = sortedOwnInsertions[ownI].bpPos;
    const numberOfInsertions = sortedOwnInsertions[ownI].number;
    for (let numI = 0; numI < numberOfInsertions; numI++) {
      eachSeqReadWithGaps.splice(bpPosOfInsertion - 1, 1);
    }
    // later insertions move left by the number of bases just removed
    for (let posI = ownI + 1; posI < sortedOwnInsertions.length; posI++) {
      sortedOwnInsertions[posI].bpPos -= numberOfInsertions;
    }
  }

  // 4) add other seq reads' insertions to seq read
  // first, drop exact duplicates (same bpPos and number), keeping the first
  const seenInsertions = new Set();
  const otherInsertions = allInsertionsInSeqReads
    .sort(compareByBpPos)
    .filter(insertion => {
      const key = JSON.stringify(insertion);
      if (seenInsertions.has(key)) {
        return false;
      }
      seenInsertions.add(key);
      return true;
    });
  // for insertions at the same bp pos keep only the one with the most gaps;
  // the bounds check matters because the splice can make i the last index
  for (let i = 0; i < otherInsertions.length - 1; i++) {
    while (
      i + 1 < otherInsertions.length &&
      otherInsertions[i].bpPos === otherInsertions[i + 1].bpPos
    ) {
      if (otherInsertions[i].number > otherInsertions[i + 1].number) {
        otherInsertions.splice(i + 1, 1);
      } else {
        otherInsertions.splice(i, 1);
      }
    }
  }
  // then remove own insertions from all insertions
  subtractOwnInsertions(otherInsertions, ownInsertionsCompare);
  // then combine overlap between other insertions & own insertions
  // NOTE(review): an own insertion whose shifted position equals its original
  // position (always true for the first one) is subtracted by both calls.
  // Kept as in the original; confirm this is intended.
  subtractOwnInsertions(otherInsertions, sortedOwnInsertions);

  // adjust own insertions according to other seq reads' insertions to be added
  // (each inserted run occupies ONE array element from here on, hence "- 1"/"+ 1")
  const adjustedOwnInsertionsBp = JSON.parse(
    JSON.stringify(sortedOwnInsertionsBp)
  );
  let previousInserts = 0;
  for (let ownI = 0; ownI < adjustedOwnInsertionsBp.length; ownI++) {
    adjustedOwnInsertionsBp[ownI].bpPos -= previousInserts;
    sortedOwnInsertionsBp[ownI].bpPos -= previousInserts;
    previousInserts += adjustedOwnInsertionsBp[ownI].number - 1;
  }
  for (let otherI = 0; otherI < otherInsertions.length; otherI++) {
    for (let ownI = 0; ownI < adjustedOwnInsertionsBp.length; ownI++) {
      if (otherInsertions[otherI].bpPos <= sortedOwnInsertionsBp[ownI].bpPos) {
        adjustedOwnInsertionsBp[ownI].bpPos += 1;
      }
    }
  }
  // add other seq reads' insertions to sequence (stop at the end of the read)
  for (
    let otherI = 0;
    otherI < otherInsertions.length &&
    otherInsertions[otherI].bpPos <= eachSeqReadWithGaps.length;
    otherI++
  ) {
    eachSeqReadWithGaps.splice(
      otherInsertions[otherI].bpPos - 1,
      0,
      "-".repeat(otherInsertions[otherI].number)
    );
    for (let posI = otherI + 1; posI < otherInsertions.length; posI++) {
      otherInsertions[posI].bpPos += 1;
    }
  }

  // 5) add own insertions to own sequence
  for (const ownInsertion of adjustedOwnInsertionsBp) {
    eachSeqReadWithGaps.splice(
      ownInsertion.bpPos - 1,
      0,
      ownInsertion.nucleotides.join("")
    );
  }

  // 6) add gaps after seq read for ref seq's length = seq read's length
  let seqReadWithGaps = eachSeqReadWithGaps.join("");
  if (seqReadWithGaps.length < refSeqWithGapsLength) {
    seqReadWithGaps += "-".repeat(
      refSeqWithGapsLength - seqReadWithGaps.length
    );
  }
  // e.g. "GGGA--GA-C--ACC"
  return seqReadWithGaps;
}

/**
 * Step 7: when soft-clipped reads extend beyond the beginning of the ref seq,
 * prepends gaps to the ref seq and to the reads so they stay aligned.
 * `seqReadsWithGaps[i + 1]` must correspond to `seqReads[i]`.
 */
function addGapsForReadsStartingBeforeRefSeq(seqReads, seqReadsWithGaps) {
  const starts = seqReads.map(seqRead => {
    const cigarComponents = splitCigarIntoComponents(seqRead.cigar);
    return {
      isSoftClipped: cigarComponents[0].slice(-1) === "S",
      adjustedPos: getAdjustedSeqReadPos(seqRead.pos, cigarComponents)
    };
  });

  // longest length of bps that extend beyond the beginning of the ref seq
  let longestSeqReadLength = 0;
  starts.forEach(start => {
    if (start.isSoftClipped && start.adjustedPos < 0) {
      longestSeqReadLength = Math.max(
        longestSeqReadLength,
        Math.abs(start.adjustedPos)
      );
    }
  });
  if (longestSeqReadLength === 0) {
    return;
  }

  const fullShift = "-".repeat(longestSeqReadLength + 1);
  starts.forEach((start, readI) => {
    const gapped = seqReadsWithGaps[readI + 1];
    if (!start.isSoftClipped || start.adjustedPos > 0) {
      gapped.sequence = fullShift + gapped.sequence;
    } else if (start.adjustedPos < 0) {
      gapped.sequence =
        "-".repeat(longestSeqReadLength - Math.abs(start.adjustedPos)) +
        gapped.sequence;
    }
    // NOTE(review): a soft-clipped read with adjustedPos === 0 gets no gaps,
    // as in the original; confirm whether it should be shifted as well.
  });
  seqReadsWithGaps[0].sequence = fullShift + seqReadsWithGaps[0].sequence;
}

/** Step 8: right-pads every sequence with "-" up to the longest one. */
function padSequencesToSameLength(seqReadsWithGaps) {
  const longestLength = seqReadsWithGaps.reduce(
    (longest, entry) => Math.max(longest, entry.sequence.length),
    0
  );
  seqReadsWithGaps.forEach(entry => {
    if (entry.sequence.length < longestLength) {
      entry.sequence += "-".repeat(longestLength - entry.sequence.length);
    }
  });
}
|