@teselagen/bio-parsers 0.1.27 → 0.1.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,21 +1,17 @@
1
1
  {
2
2
  "name": "@teselagen/bio-parsers",
3
- "version": "0.1.27",
3
+ "version": "0.1.28",
4
4
  "type": "commonjs",
5
5
  "dependencies": {
6
- "@teselagen/sequence-utils": "0.1.22",
7
- "@teselagen/range-utils": "0.1.21",
6
+ "@teselagen/sequence-utils": "0.1.23",
7
+ "@teselagen/range-utils": "0.1.22",
8
8
  "@gmod/gff": "^1.2.1",
9
- "bson-objectid": "2.0.4",
10
9
  "buffer": "^6.0.3",
11
10
  "bufferpack": "^0.0.6",
12
11
  "color": "^4.2.3",
13
- "escape-string-regexp": "1.0.5",
14
12
  "fast-xml-parser": "^4.2.5",
15
13
  "fflate": "^0.8.0",
16
- "jsondiffpatch-rc": "0.4.2",
17
14
  "lodash": "^4.17.21",
18
- "shortid": "^2.2.16",
19
15
  "string_decoder": "^1.3.0",
20
16
  "validate.io-nonnegative-integer-array": "^1.0.1"
21
17
  }
@@ -0,0 +1,177 @@
1
+ import createInitialSequence from "./utils/createInitialSequence";
2
+ import getArrayBufferFromFile from "./utils/getArrayBufferFromFile";
3
+
4
/**
 * Converts an AB1 chromatogram file into the parser result format.
 *
 * The file is read into an ArrayBuffer, decoded with abConverter, and the
 * decoded base calls are joined into the sequence string.
 *
 * @param {*} fileObj file object handed to getArrayBufferFromFile
 * @param {object} [options] forwarded to createInitialSequence
 * @returns {Promise<Array>} a one-element array holding the result object,
 *   whose parsedSequence carries `sequence` and `chromatogramData`
 */
async function ab1ToJson(fileObj, options = {}) {
  const rawBuffer = await getArrayBufferFromFile(fileObj);
  const chromatogramData = new abConverter(
    new DataView(rawBuffer)
  ).getTraceData();

  const parseResult = createInitialSequence(options);
  // Keep whatever createInitialSequence provided and layer the AB1 data on top.
  parseResult.parsedSequence = Object.assign({}, parseResult.parsedSequence, {
    sequence: chromatogramData.baseCalls.join(""),
    chromatogramData
  });
  return [parseResult];
}
17
+
18
+ export default ab1ToJson;
19
+
20
/**
 * Constructor for a reader over the tag directory of an ABIF (.ab1) file.
 * Must be called with `new`; the reader methods are attached to the instance.
 *
 * All multi-byte reads use DataView's default (big-endian) byte order.
 *
 * @param {DataView} inputArrayBuffer DataView over the whole file (despite the
 *   name, the methods called on it are DataView getters)
 */
function abConverter(inputArrayBuffer) {
  // Each directory entry is 28 bytes: name(4) number(4) elementtype(2)
  // elementsize(2) numelements(4) datasize(4) dataoffset(4) datahandle(4).
  const DIR_ENTRY_SIZE = 28;
  // The header's root entry gives the directory offset (byte 26) and the
  // number of directory entries (byte 18).
  const dirLocation = inputArrayBuffer.getInt32(26);
  const numElements = inputArrayBuffer.getInt32(18);
  const lastEntry = dirLocation + numElements * DIR_ENTRY_SIZE;

  /** Reads `numEntries` signed bytes starting at `inOffset`. */
  this.getNumber = (inOffset, numEntries) => {
    const retArray = [];
    for (let counter = 0; counter < numEntries; counter += 1) {
      retArray.push(inputArrayBuffer.getInt8(inOffset + counter));
    }
    return retArray;
  };

  /** Reads `numEntries` bytes starting at `inOffset` as one-character strings. */
  this.getChar = (inOffset, numEntries) => {
    const retArray = [];
    for (let counter = 0; counter < numEntries; counter += 1) {
      // Unsigned read: a signed read turns bytes >= 0x80 into negative char
      // codes, which String.fromCharCode wraps to unrelated code units.
      retArray.push(
        String.fromCharCode(inputArrayBuffer.getUint8(inOffset + counter))
      );
    }
    return retArray;
  };

  /**
   * Reads signed 16-bit values starting at `inOffset`.
   * `numEntries` is a size in BYTES (hence the step of 2), which is what
   * getDataTag passes in.
   */
  this.getShort = (inOffset, numEntries) => {
    const retArray = [];
    for (let counter = 0; counter < numEntries; counter += 2) {
      retArray.push(inputArrayBuffer.getInt16(inOffset + counter));
    }
    return retArray;
  };

  /** Returns the 4-character tag name stored at `inOffset`. */
  this.getTagName = (inOffset) => {
    let name = "";
    for (let loopOffset = inOffset; loopOffset < inOffset + 4; loopOffset++) {
      name += String.fromCharCode(inputArrayBuffer.getUint8(loopOffset));
    }
    return name;
  };

  /**
   * Looks up a tag in the directory and decodes its data.
   *
   * @param {{tagName: string, tagNum: number, typeToReturn: string}} inTag
   *   tag to find; `typeToReturn` names the reader method used to decode it
   * @returns {Array|undefined} decoded data, or undefined when no entry
   *   matches. If several entries match, the last one wins.
   */
  this.getDataTag = function (inTag) {
    let output;
    // A plain `for` (rather than do/while) so an empty directory reads nothing.
    for (
      let curElem = dirLocation;
      curElem < lastEntry;
      curElem += DIR_ENTRY_SIZE
    ) {
      const currTagName = this.getTagName(curElem);
      const tagNum = inputArrayBuffer.getInt32(curElem + 4);
      // eslint-disable-next-line eqeqeq
      if (currTagName == inTag.tagName && tagNum === inTag.tagNum) {
        const dataSize = inputArrayBuffer.getInt32(curElem + 16);
        // Per the ABIF spec, items of 4 bytes or fewer are stored inline in
        // the dataoffset field itself instead of at a file offset.
        const entryOffset =
          dataSize <= 4
            ? curElem + 20
            : inputArrayBuffer.getInt32(curElem + 20);
        output = this[inTag.typeToReturn](entryOffset, dataSize);
      }
    }
    return output;
  };

  /**
   * Collects the four colour traces, peak locations, base calls and quality
   * numbers, then converts them with convertBasePosTraceToPerBpTrace.
   */
  this.getTraceData = function () {
    const traceData = {
      aTrace: this.getDataTag(tagDict.colorDataA),
      tTrace: this.getDataTag(tagDict.colorDataT),
      gTrace: this.getDataTag(tagDict.colorDataG),
      cTrace: this.getDataTag(tagDict.colorDataC),
      basePos: this.getDataTag(tagDict.peakLocations),
      baseCalls: this.getDataTag(tagDict.baseCalls2),
      qualNums: this.getDataTag(tagDict.qualNums)
    };
    // tnr: if we're only getting 1's and 0's as qualNums, there weren't actual
    // quality numbers attached to the file, so drop them.
    if (
      traceData.qualNums &&
      traceData.qualNums.every((q) => q === 1 || q === 0)
    ) {
      delete traceData.qualNums;
    }
    return convertBasePosTraceToPerBpTrace(traceData);
  };

  /** Returns every directory tag name, each prefixed with " - ". */
  this.getFirstEntry = () => {
    let output = "";
    for (
      let curElem = dirLocation;
      curElem < lastEntry;
      curElem += DIR_ENTRY_SIZE
    ) {
      output += ` - ${this.getTagName(curElem)}`;
    }
    return output;
  };
}
108
+
109
// Lookup table of the tags read from the file directory. Each row is
// [key, tagName, tagNum, typeToReturn]; typeToReturn names the abConverter
// reader method used to decode that tag's data.
const tagDict = Object.fromEntries(
  [
    ["baseCalls1", "PBAS", 1, "getChar"],
    ["baseCalls2", "PBAS", 2, "getChar"],
    ["qualNums", "PCON", 2, "getNumber"],
    ["peakLocations", "PLOC", 2, "getShort"],
    ["peakDev", "P1RL", 1, "getShort"],
    ["peakOneAmp", "P1AM", 1, "getShort"],
    ["colorDataA", "DATA", 10, "getShort"],
    ["colorDataT", "DATA", 11, "getShort"],
    ["colorDataG", "DATA", 9, "getShort"],
    ["colorDataC", "DATA", 12, "getShort"]
  ].map(([key, tagName, tagNum, typeToReturn]) => [
    key,
    { tagName, tagNum, typeToReturn }
  ])
);
121
+
122
+
123
// Number of extra trace samples appended to the end of every per-base window.
const correctionAmount = 3;

/**
 * tnr: takes chromData, which has 4 traces and a basePos array (describing
 * where in the trace each base call lands), and "normalizes" it into a
 * baseTraces array so that each base owns its own slice of every trace.
 * A per-base trace makes insertion/deletion/copy/paste easier.
 *
 * Windows are consecutive: each one starts where the previous one ended and
 * runs to the midpoint between its start and the NEXT base's position, plus
 * correctionAmount samples. The last window runs to the end of the trace.
 * Samples that are missing or falsy are stored as 0.
 *
 * @param {object} chromData object with basePos, aTrace, tTrace, gTrace, cTrace
 * @returns {object} a copy of chromData's properties plus `baseTraces`
 */
function convertBasePosTraceToPerBpTrace(chromData) {
  const { basePos, aTrace } = chromData;
  const traceLength = aTrace.length;
  const traceTypes = ["aTrace", "tTrace", "gTrace", "cTrace"];

  // End of the window that starts at `start`, given the next base position
  // (falls back to the end of the trace when there is no next position).
  const windowEndFor = (start, nextPos) =>
    nextPos ? start + Math.ceil((nextPos - start) / 2) : traceLength;

  const lastIndex = basePos.length - 1;
  const baseTraces = [];
  let windowStart = 0;
  let windowEnd = windowEndFor(windowStart, basePos[1]);

  for (let baseIndex = 0; baseIndex <= lastIndex; baseIndex++) {
    const sliceEnd = windowEnd + correctionAmount;
    const perBase = { aTrace: [], tTrace: [], gTrace: [], cTrace: [] };
    baseTraces.push(perBase);

    for (const type of traceTypes) {
      const source = chromData[type];
      const samples = perBase[type];
      for (let sample = windowStart; sample < sliceEnd; sample++) {
        samples.push(source[sample] || 0);
      }
    }

    if (baseIndex !== lastIndex) {
      // The next window begins right after this one (correction included).
      windowStart = sliceEnd;
      windowEnd = windowEndFor(windowStart, basePos[baseIndex + 2]);
    }
  }

  return {
    baseTraces,
    ...chromData
  };
}
174
+
175
+ export {
176
+ convertBasePosTraceToPerBpTrace
177
+ }
@@ -0,0 +1,225 @@
1
+ import fastaToJson from "./fastaToJson";
2
+ import genbankToJson from "./genbankToJson";
3
+ import sbolXmlToJson from "./sbolXmlToJson";
4
+ import extractFileExtension from "./utils/extractFileExtension.js";
5
+ import snapgeneToJson from "./snapgeneToJson";
6
+ import ab1ToJson from "./ab1ToJson";
7
+ import gffToJson from "./gffToJson";
8
+ import isBrowser from "./utils/isBrowser";
9
+ import { tidyUpSequenceData } from "@teselagen/sequence-utils";
10
+ import geneiousXmlToJson from "./geneiousXmlToJson";
11
+ import jbeiXmlToJson from "./jbeiXmlToJson";
12
+ import { unzipSync } from "fflate";
13
+
14
/**
 * Takes file content (as a string or as a file object) and dispatches it to
 * the parser matching the file extension.
 *
 * The extension is taken from `options.fileName`, or from the `name` of the
 * passed file object when no file name was given. When the extension is not
 * recognised, the Genbank and FASTA parsers are tried in turn.
 *
 * @param {string|object} fileContentStringOrFileObj file content as a string,
 *   or a file object. Binary formats (ab1, dna, prot, geneious) are only
 *   dispatched to their parser when a file object is passed.
 * @param {object} [options] forwarded to the chosen parser. `fileName` is read
 *   here, and is written back onto this object when it was derived from the
 *   file object.
 * @returns {Promise<Array>} the array of results produced by the chosen
 *   parser, or a single `{ success: false, messages }` entry when nothing
 *   could parse the input
 */
async function anyToJson(fileContentStringOrFileObj, options) {
  let fileContentString;
  options = options || {};
  let fileName = options.fileName || "";
  if (!fileName && typeof fileContentStringOrFileObj !== "string") {
    fileName = fileContentStringOrFileObj.name;
    options.fileName = fileName;
  }
  const ext = extractFileExtension(fileName);

  if (typeof fileContentStringOrFileObj === "string") {
    fileContentString = fileContentStringOrFileObj;
  } else if (/^(ab1)$/.test(ext)) {
    // AB1 sequencing read
    // we always pass the file obj (not a string) to the ab1 parser
    return ab1ToJson(fileContentStringOrFileObj, options);
  } else if (/^(dna|prot)$/.test(ext)) {
    // snapgene file (always requires that the full filename be passed in to
    // anyToJson otherwise it won't parse properly).
    // We always pass the file obj and not the string to the snapgene parser
    // because it expects a binary file.
    return snapgeneToJson(fileContentStringOrFileObj, options);
  } else if (/^(geneious)$/.test(ext)) {
    const rawBytes = await getUint8ArrayFromFile(fileContentStringOrFileObj);
    let xmlString;
    try {
      xmlString = new TextDecoder().decode(rawBytes, { stream: false });
      if (!xmlString.includes("<geneious")) {
        throw new Error("not geneious");
      }
    } catch (e) {
      // Not plain geneious XML: try to unzip the file and decode the first
      // archive entry instead.
      const archive = unzipSync(rawBytes);
      const firstEntry = Object.values(archive)[0];
      xmlString = new TextDecoder().decode(firstEntry, { stream: false });
    }
    return geneiousXmlToJson(xmlString, options);
  } else {
    // every other format is parsed from text, so read the file as a string
    fileContentString = await getUtf8StringFromFile(
      fileContentStringOrFileObj,
      options
    );
  }

  if (/^(fasta|fas|fa|fna|ffn)$/.test(ext)) {
    // FASTA
    return fastaToJson(fileContentString, options);
  } else if (/^(gb|gbk)$/.test(ext)) {
    // GENBANK
    return genbankToJson(fileContentString, options);
  } else if (
    /^(seq)$/.test(ext) ||
    (/^(xml)$/.test(ext) &&
      fileContentString.includes("seq:seq") &&
      fileContentString.includes("jbei"))
  ) {
    // JBEI
    return jbeiXmlToJson(fileContentString, options);
  } else if (/^(json)$/.test(ext)) {
    // TG JSON Probably
    const failure = {
      messages: [`Unable to parse JSON file ${fileName}`],
      success: false
    };
    try {
      const cleaned = tidyUpSequenceData(
        JSON.parse(fileContentString),
        options
      );
      if (!cleaned.sequence.length) return [failure];
      return [{ parsedSequence: cleaned, success: true }];
    } catch (error) {
      console.error(`error:`, error);
      return [failure];
    }
  } else if (/^(gp|genpep)$/.test(ext)) {
    // PROTEIN GENBANK
    return genbankToJson(fileContentString, { ...options, isProtein: true });
  } else if (/^(xml|rdf)$/.test(ext)) {
    // XML/RDF
    return sbolXmlToJson(
      fileContentString || fileContentStringOrFileObj,
      options
    );
  } else if (/^(gff|gff3)$/.test(ext)) {
    // GFF (receives the original argument, not the decoded string)
    return gffToJson(fileContentStringOrFileObj, options);
  }

  // Unknown extension: go through the list of text parsers.
  let parsersToTry = [
    {
      fn: genbankToJson,
      name: "Genbank Parser"
    },
    {
      fn: fastaToJson,
      name: "Fasta Parser"
    }
  ];

  // Guess the file type from the first non-whitespace char of the content.
  const firstChar = fileContentString[fileContentString.search(/\S|$/)];
  let preferredParserName;
  if (firstChar === ">") {
    preferredParserName = "Fasta Parser";
  } else if (firstChar === "L") {
    preferredParserName = "Genbank Parser";
  }
  if (preferredParserName) {
    // Move the preferred parser to the front explicitly. A sort comparator
    // that inspects only one of its two arguments is inconsistent, and the
    // order it produces is implementation-defined.
    parsersToTry = [
      ...parsersToTry.filter((p) => p.name === preferredParserName),
      ...parsersToTry.filter((p) => p.name !== preferredParserName)
    ];
  }

  for (const parser of parsersToTry) {
    const toReturn = await parser.fn(fileContentString, options);
    // the parse counts as successful when at least one result succeeded
    if (toReturn.some((result) => result.success)) {
      toReturn.forEach(function (result) {
        result.messages.push("Parsed using " + parser.name + ".");
      });
      return toReturn;
    }
  }

  // none of the parsers worked
  return [
    {
      messages: [
        "Unable to parse file as FASTA, genbank, JBEI, or SBOL formats"
      ],
      success: false
    }
  ];
}
175
+
176
+ export default anyToJson;
177
+
178
/**
 * Reads a file as UTF-8 text.
 *
 * Outside the browser (and when not emulating one) the result is returned
 * synchronously: a Buffer, or an object whose `buffer` property is a Buffer,
 * is decoded as UTF-8; anything else is returned unchanged. Otherwise the
 * file is read with window.FileReader and a Promise of the text is returned.
 *
 * @param {*} file Buffer, object with a Buffer `buffer`, or browser File/Blob
 * @param {object} [options]
 * @param {boolean} [options.emulateBrowser] force the FileReader path
 *   (only used for testing purposes)
 * @returns {string|*|Promise<string>}
 */
function getUtf8StringFromFile(file, { emulateBrowser } = {}) {
  const useFileReader = isBrowser || emulateBrowser;
  if (!useFileReader) {
    // node context
    if (Buffer.isBuffer(file)) {
      return file.toString("utf-8");
    }
    if (Buffer.isBuffer(file.buffer)) {
      return file.buffer.toString("utf-8");
    }
    return file;
  }

  const fileReader = new window.FileReader();
  fileReader.readAsText(file, "UTF-8");
  return new Promise((resolve, reject) => {
    fileReader.onload = ({ target }) => resolve(target.result);
    fileReader.onerror = (err) => {
      console.error("err:", err);
      reject(err);
    };
  });
}
200
/**
 * Reads a file as raw bytes.
 *
 * Outside the browser (and when not emulating one) the result is returned
 * synchronously: a Buffer, or an object whose `buffer` property is a Buffer,
 * is copied into a new Uint8Array; anything else is returned unchanged.
 * Otherwise the file is read with window.FileReader and a Promise of a
 * Uint8Array is returned.
 *
 * @param {*} file Buffer, object with a Buffer `buffer`, or browser File/Blob
 * @param {object} [options]
 * @param {boolean} [options.emulateBrowser] force the FileReader path
 *   (only used for testing purposes)
 * @returns {Uint8Array|*|Promise<Uint8Array>}
 */
function getUint8ArrayFromFile(file, { emulateBrowser } = {}) {
  if (isBrowser || emulateBrowser) {
    const fileReader = new window.FileReader();
    fileReader.readAsArrayBuffer(file);

    return new Promise((resolve, reject) => {
      fileReader.onload = ({ target }) => {
        resolve(new Uint8Array(target.result));
      };
      fileReader.onerror = (err) => {
        console.error("err:", err);
        reject(err);
      };
    });
  }

  // node context
  if (Buffer.isBuffer(file)) {
    return new Uint8Array(file);
  }
  if (Buffer.isBuffer(file.buffer)) {
    return new Uint8Array(file.buffer);
  }
  return file;
}
@@ -0,0 +1,101 @@
1
+ import createInitialSequence from "./utils/createInitialSequence";
2
+ import splitStringIntoLines from "./utils/splitStringIntoLines.js";
3
+ import validateSequenceArray from "./utils/validateSequenceArray";
4
+
5
/**
 * Parses a FASTA file that may contain one or several sequences.
 *
 * Line handling (each line is trimmed first):
 *  - `>` starts a new record and is parsed as its title;
 *  - `;` is parsed as the title when no record is open, otherwise ignored;
 *  - any other non-blank line is appended to the current record's sequence;
 *    a trailing `*` is dropped and closes the record;
 *  - blank lines are skipped.
 *
 * @param {string} fileString string representation of the file contents
 * @param {object} [options] forwarded to createInitialSequence and
 *   validateSequenceArray. Read here:
 *   `parseFastaAsCircular` - mark every record as circular;
 *   `parseName` - when present and falsy, the whole title line (minus its
 *   first character) becomes the name instead of being split at the first `|`.
 * @returns the value of validateSequenceArray for the collected records; when
 *   parsing throws, a single `{ success: false }` record is validated instead
 * @author Joshua P Nixon
 */
function fastaToJson(fileString, options) {
  let resultArray = [];
  let result = null;
  try {
    const lines = splitStringIntoLines(fileString);

    for (let i = 0; i < lines.length; i++) {
      parseLine(lines[i]);
    }
    if (!result && resultArray.length === 0 && lines.length > 0) {
      // Input consisting only of blank lines still yields one empty record
      // (as it always has), so validation can report it.
      startNewSequence();
    }
    closeCurrentSequence();
  } catch (e) {
    console.error("error:", e);
    console.error("error.stack: ", e.stack);
    resultArray = [
      {
        success: false,
        messages: ["Import Error: Invalid File"]
      }
    ];
  }
  return validateSequenceArray(resultArray, options);

  // Opens a fresh record. The circular flag is applied here, at creation,
  // so it is never written to an already-closed (null) record.
  function startNewSequence() {
    result = createInitialSequence(options);
    if (options && options.parseFastaAsCircular) {
      result.parsedSequence.circular = true;
    }
  }

  // Moves the open record (if any) into the result list.
  function closeCurrentSequence() {
    if (result) {
      resultArray.push(result);
      result = null;
    }
  }

  function parseLine(line) {
    line = line.trim();
    if (line === "") {
      // A blank line carries no data and must not open a spurious record.
      return;
    }
    if (";" === line[0]) {
      // first instance is the title, afterwards comments are ignored
      if (result) {
        return;
      }
      startNewSequence();
      parseTitle(line);
    } else if (">" === line[0]) {
      // header line
      closeCurrentSequence();
      startNewSequence();
      parseTitle(line);
    } else {
      // sequence line
      if (!result) {
        startNewSequence();
      }
      if (line.endsWith("*")) {
        // some sequences are ended with an asterisk
        parseSequenceLine(line.slice(0, -1));
        closeCurrentSequence();
      } else {
        parseSequenceLine(line);
      }
    }
  }

  function parseTitle(line) {
    if (options && "parseName" in options && !options.parseName) {
      result.parsedSequence.name = line.slice(1);
      return;
    }

    const pipeIndex = line.indexOf("|");
    if (pipeIndex > -1) {
      result.parsedSequence.name = line.slice(1, pipeIndex);
      result.parsedSequence.description = line.slice(pipeIndex + 1);
    } else {
      result.parsedSequence.name = line.slice(1);
    }
  }

  function parseSequenceLine(line) {
    // http://www.ncbi.nlm.nih.gov/BLAST/blastcgihelp.shtml says that the
    // sequence can be interspersed with numbers and/or spaces and dashes for
    // gaps. Those characters are not stripped here: the (trimmed) line is
    // appended verbatim.
    result.parsedSequence.sequence += line;
  }
}
100
+
101
+ export default fastaToJson;
@@ -0,0 +1,20 @@
1
/** Fields of a parsed sequence that this declaration file describes. */
interface sequenceObject {
  features: unknown[];
  parts: unknown[];
  circular: boolean;
}

/** One entry of the array handed to the parse callback. */
type ParsedResult = {
  parsedSequence: sequenceObject;
};

/** Callback invoked with the parsed results. */
type onFileParsedCallback = (parsedResults: ParsedResult[]) => void;

// NOTE(review): this keeps the callback-style signature the file declared.
// anyToJson calls genbankToJson(fileContentString, options) and uses the
// returned array, so this declaration looks stale. Confirm against the
// implementation and update the signature if so.
declare const genbankToJson: (
  genbankFileString: string,
  onFileParsedCallback: onFileParsedCallback,
  options?: object
) => void;

export default genbankToJson;
19
+
20
+