@teselagen/bio-parsers 0.1.27 → 0.1.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +24219 -39924
- package/index.mjs +24238 -39921
- package/index.umd.js +32684 -48391
- package/package.json +5 -8
- package/src/ab1ToJson.js +177 -0
- package/src/anyToJson.js +225 -0
- package/src/fastaToJson.js +101 -0
- package/src/genbankToJson.d.__ts +20 -0
- package/src/genbankToJson.js +688 -0
- package/src/geneiousXmlToJson.js +147 -0
- package/src/gffToJson.js +43 -0
- package/src/index.js +23 -0
- package/src/jbeiXmlToJson.js +109 -0
- package/src/jsonToBed.js +39 -0
- package/src/jsonToFasta.js +33 -0
- package/src/jsonToGenbank.js +423 -0
- package/src/jsonToJsonString.js +26 -0
- package/src/sbolXmlToJson.js +135 -0
- package/src/snapgeneToJson.js +245 -0
- package/src/utils/NameUtils.js +10 -0
- package/src/utils/ParserUtil.js +93 -0
- package/src/utils/cleanUpTeselagenJsonForExport.js +13 -0
- package/src/utils/constants.js +24 -0
- package/src/utils/convertOldSequenceDataToNewDataType.js +64 -0
- package/src/utils/createInitialSequence.js +14 -0
- package/src/utils/extractFileExtension.js +14 -0
- package/src/utils/flattenSequenceArray.js +17 -0
- package/src/utils/getArrayBufferFromFile.js +32 -0
- package/src/utils/isBrowser.js +1 -0
- package/src/utils/parseUracilFeatures.js +13 -0
- package/src/utils/pragmasAndTypes.js +21 -0
- package/src/utils/searchWholeObjByName.js +98 -0
- package/src/utils/splitStringIntoLines.js +13 -0
- package/src/utils/unmangleUrls.js +34 -0
- package/src/utils/validateSequence.js +349 -0
- package/src/utils/validateSequenceArray.js +20 -0
|
@@ -0,0 +1,688 @@
|
|
|
1
|
+
/* eslint-disable no-var*/
|
|
2
|
+
import { convertAACaretPositionOrRangeToDna } from "@teselagen/sequence-utils";
|
|
3
|
+
|
|
4
|
+
import { gbDivisions, untitledSequenceName } from "./utils/constants";
|
|
5
|
+
import flattenSequenceArray from "./utils/flattenSequenceArray";
|
|
6
|
+
import validateSequenceArray from "./utils/validateSequenceArray";
|
|
7
|
+
import splitStringIntoLines from "./utils/splitStringIntoLines.js";
|
|
8
|
+
|
|
9
|
+
import createInitialSequence from "./utils/createInitialSequence";
|
|
10
|
+
|
|
11
|
+
function genbankToJson(string, options = {}) {
|
|
12
|
+
const {
|
|
13
|
+
inclusive1BasedStart,
|
|
14
|
+
inclusive1BasedEnd,
|
|
15
|
+
//these are also valid options:
|
|
16
|
+
// primersAsFeatures,
|
|
17
|
+
// sequenceTypeFromLocus,
|
|
18
|
+
// isProtein,
|
|
19
|
+
} = options;
|
|
20
|
+
|
|
21
|
+
const resultsArray = [];
|
|
22
|
+
let result;
|
|
23
|
+
let currentFeatureNote;
|
|
24
|
+
|
|
25
|
+
const genbankAnnotationKey = {
|
|
26
|
+
LOCUS_TAG: "LOCUS",
|
|
27
|
+
DEFINITION_TAG: "DEFINITION",
|
|
28
|
+
ACCESSION_TAG: "ACCESSION",
|
|
29
|
+
VERSION_TAG: "VERSION",
|
|
30
|
+
KEYWORDS_TAG: "KEYWORDS",
|
|
31
|
+
//SEGMENT_TAG:"SEGMENT"
|
|
32
|
+
SOURCE_TAG: "SOURCE",
|
|
33
|
+
ORGANISM_TAG: "ORGANISM",
|
|
34
|
+
REFERENCE_TAG: "REFERENCE",
|
|
35
|
+
AUTHORS_TAG: "AUTHORS",
|
|
36
|
+
CONSORTIUM_TAG: "CONSRTM",
|
|
37
|
+
TITLE_TAG: "TITLE",
|
|
38
|
+
JOURNAL_TAG: "JOURNAL",
|
|
39
|
+
PUBMED_TAG: "PUBMED",
|
|
40
|
+
REMARK_TAG: "REMARK",
|
|
41
|
+
COMMENT_TAG: "COMMENT",
|
|
42
|
+
FEATURES_TAG: "FEATURES",
|
|
43
|
+
BASE_COUNT_TAG: "BASE COUNT",
|
|
44
|
+
//CONTIG_TAG: "CONTIG"
|
|
45
|
+
ORIGIN_TAG: "ORIGIN",
|
|
46
|
+
END_SEQUENCE_TAG: "//",
|
|
47
|
+
};
|
|
48
|
+
let hasFoundLocus = false;
|
|
49
|
+
let featureLocationIndentation;
|
|
50
|
+
try {
|
|
51
|
+
const lines = splitStringIntoLines(string);
|
|
52
|
+
let LINETYPE = false;
|
|
53
|
+
|
|
54
|
+
if (lines === null) {
|
|
55
|
+
addMessage("Import Error: Sequence file is empty");
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
lines.some(function (line) {
|
|
59
|
+
if (line === null) {
|
|
60
|
+
return true; //break the some loop
|
|
61
|
+
}
|
|
62
|
+
const key = getLineKey(line);
|
|
63
|
+
const val = getLineVal(line);
|
|
64
|
+
const isKeyRunon = isKeywordRunon(line);
|
|
65
|
+
const isSubKey = isSubKeyword(line);
|
|
66
|
+
const isKey = isKeyword(line);
|
|
67
|
+
|
|
68
|
+
//only set a new LINETYPE in the case that we've encountered a key that warrants it.
|
|
69
|
+
if (key === "LOCUS") {
|
|
70
|
+
LINETYPE = key;
|
|
71
|
+
} else if (key === "REFERENCE") {
|
|
72
|
+
LINETYPE = key;
|
|
73
|
+
} else if (key === "FEATURES") {
|
|
74
|
+
LINETYPE = key;
|
|
75
|
+
} else if (key === "ORIGIN") {
|
|
76
|
+
LINETYPE = key;
|
|
77
|
+
} else if (key === "//") {
|
|
78
|
+
LINETYPE = key;
|
|
79
|
+
} else if (isKey === true) {
|
|
80
|
+
LINETYPE = key;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
// IGNORE LINES: DO NOT EVEN PROCESS
|
|
84
|
+
if (line.trim() === "" || key === ";") {
|
|
85
|
+
//tnr: don't add the following message because it is not particularly informative
|
|
86
|
+
// addMessage(
|
|
87
|
+
// "Warning: Empty line, or ';' detected. Ignoring line: " +
|
|
88
|
+
// line);
|
|
89
|
+
return false; // go to next line
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
if (!hasFoundLocus && LINETYPE !== genbankAnnotationKey.LOCUS_TAG) {
|
|
93
|
+
// 'Genbank files must start with a LOCUS tag so this must not be a genbank'
|
|
94
|
+
return true; //break the some loop
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
switch (LINETYPE) {
|
|
98
|
+
case genbankAnnotationKey.LOCUS_TAG:
|
|
99
|
+
if (hasFoundLocus) {
|
|
100
|
+
//here we concatenate the locus lines together
|
|
101
|
+
line = hasFoundLocus + line;
|
|
102
|
+
}
|
|
103
|
+
parseLocus(line);
|
|
104
|
+
hasFoundLocus = line;
|
|
105
|
+
break;
|
|
106
|
+
case genbankAnnotationKey.FEATURES_TAG:
|
|
107
|
+
//If no location is specified, exclude feature and return messages
|
|
108
|
+
if (val === "") {
|
|
109
|
+
addMessage(
|
|
110
|
+
"Warning: The feature '" +
|
|
111
|
+
key +
|
|
112
|
+
"'' has no location specified. This line has been ignored: line" +
|
|
113
|
+
line
|
|
114
|
+
);
|
|
115
|
+
break;
|
|
116
|
+
}
|
|
117
|
+
parseFeatures(line, key, val);
|
|
118
|
+
break;
|
|
119
|
+
case genbankAnnotationKey.ORIGIN_TAG:
|
|
120
|
+
parseOrigin(line, key);
|
|
121
|
+
break;
|
|
122
|
+
case genbankAnnotationKey.END_SEQUENCE_TAG:
|
|
123
|
+
endSeq();
|
|
124
|
+
break;
|
|
125
|
+
case genbankAnnotationKey.DEFINITION_TAG:
|
|
126
|
+
line = line.replace(/DEFINITION/, "");
|
|
127
|
+
line = line.trim();
|
|
128
|
+
if (result.parsedSequence) {
|
|
129
|
+
if (result.parsedSequence.definition) {
|
|
130
|
+
result.parsedSequence.definition += " " + line;
|
|
131
|
+
} else {
|
|
132
|
+
result.parsedSequence.definition = line;
|
|
133
|
+
}
|
|
134
|
+
if (result.parsedSequence.description) {
|
|
135
|
+
result.parsedSequence.description += " " + line;
|
|
136
|
+
} else {
|
|
137
|
+
result.parsedSequence.description = line;
|
|
138
|
+
}
|
|
139
|
+
} else {
|
|
140
|
+
throw new Error(
|
|
141
|
+
"no sequence yet created upon which to extract an extra line!"
|
|
142
|
+
);
|
|
143
|
+
}
|
|
144
|
+
break;
|
|
145
|
+
case genbankAnnotationKey.ACCESSION_TAG:
|
|
146
|
+
line = line.replace(/ACCESSION/, "");
|
|
147
|
+
line = line.trim();
|
|
148
|
+
if (result.parsedSequence) {
|
|
149
|
+
result.parsedSequence.accession = line;
|
|
150
|
+
}
|
|
151
|
+
break;
|
|
152
|
+
case genbankAnnotationKey.VERSION_TAG:
|
|
153
|
+
line = line.replace(/VERSION/, "");
|
|
154
|
+
line = line.trim();
|
|
155
|
+
if (result.parsedSequence) {
|
|
156
|
+
result.parsedSequence.version = line;
|
|
157
|
+
}
|
|
158
|
+
break;
|
|
159
|
+
case "COMMENT":
|
|
160
|
+
line = line.replace(/COMMENT/, "");
|
|
161
|
+
line = line.trim();
|
|
162
|
+
if (result.parsedSequence) {
|
|
163
|
+
if (!result.parsedSequence.comments) {
|
|
164
|
+
result.parsedSequence.comments = [];
|
|
165
|
+
}
|
|
166
|
+
if (line.indexOf("teselagen_unique_id:") > -1) {
|
|
167
|
+
//capture the special comment
|
|
168
|
+
result.parsedSequence.teselagen_unique_id = line
|
|
169
|
+
.replace(/ /g, "")
|
|
170
|
+
.replace("teselagen_unique_id:", "");
|
|
171
|
+
} else if (line.indexOf("library:") > -1) {
|
|
172
|
+
result.parsedSequence.library = line
|
|
173
|
+
.replace(/ /g, "")
|
|
174
|
+
.replace("library:", "");
|
|
175
|
+
} else {
|
|
176
|
+
result.parsedSequence.comments.push(line);
|
|
177
|
+
}
|
|
178
|
+
} else {
|
|
179
|
+
throw new Error(
|
|
180
|
+
"no sequence yet created upon which to extract an extra line!"
|
|
181
|
+
);
|
|
182
|
+
}
|
|
183
|
+
break;
|
|
184
|
+
default:
|
|
185
|
+
// FOLLOWING FOR KEYWORDS NOT PREVIOUSLY DEFINED IN CASES
|
|
186
|
+
extractExtraLine(line);
|
|
187
|
+
if (key === "BASE") {
|
|
188
|
+
// do nothing; // BLANK LINES || line with ;;;;;;;;; || "BASE COUNT"
|
|
189
|
+
// console.warn("Parsing GenBank File: This line with BaseCount has been ignored: " + line);
|
|
190
|
+
addMessage(
|
|
191
|
+
"Warning: This BaseCount line has been ignored: " + line
|
|
192
|
+
);
|
|
193
|
+
break;
|
|
194
|
+
} else if (isKey) {
|
|
195
|
+
// REGULAR KEYWORDS (NOT LOCUS/FEATURES/ORIGIN) eg VERSION, ACCESSION, SOURCE, REFERENCE
|
|
196
|
+
// lastObj = parseKeyword(line, gb);
|
|
197
|
+
} else if (isSubKey) {
|
|
198
|
+
// REGULAR SUBKEYWORD, NOT FEATURE eg AUTHOR, ORGANISM
|
|
199
|
+
// tmp = gb.getLastKeyword();
|
|
200
|
+
// lastObj = parseSubKeyword(tmp, line, gb);
|
|
201
|
+
} else if (isKeyRunon) {
|
|
202
|
+
// RUNON LINES FOR NON-FEATURES
|
|
203
|
+
// lastObj.setValue(lastObj.getValue() + Teselagen.StringUtil.rpad("\n"," ",13) + Ext.String.trim(line));
|
|
204
|
+
// lastObj.appendValue(Teselagen.StringUtil.rpad("\n"," ",13) + Ext.String.trim(line), gb);
|
|
205
|
+
} else {
|
|
206
|
+
// console.warn("Parsing GenBank File: This line has been ignored: " + line);
|
|
207
|
+
addMessage("Warning: This line has been ignored: " + line);
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
return false;
|
|
211
|
+
});
|
|
212
|
+
} catch (e) {
|
|
213
|
+
//catch any errors and set the result
|
|
214
|
+
console.error("Error trying to parse file as .gb:", e);
|
|
215
|
+
result = {
|
|
216
|
+
success: false,
|
|
217
|
+
messages: ["Import Error: Invalid File"],
|
|
218
|
+
};
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
//catch the case where we've successfully started a sequence and parsed it, but endSeq isn't called correctly
|
|
222
|
+
if (
|
|
223
|
+
!result ||
|
|
224
|
+
(result.success && resultsArray[resultsArray.length - 1] !== result)
|
|
225
|
+
) {
|
|
226
|
+
//current result isn't in resultsArray yet
|
|
227
|
+
//so we call endSeq here
|
|
228
|
+
endSeq();
|
|
229
|
+
}
|
|
230
|
+
//call the callback
|
|
231
|
+
|
|
232
|
+
//before we call the onFileParsed callback, we need to flatten the sequence, and convert the old sequence data to the new data type
|
|
233
|
+
const results = validateSequenceArray(
|
|
234
|
+
flattenSequenceArray(resultsArray, options),
|
|
235
|
+
options
|
|
236
|
+
);
|
|
237
|
+
// default sequence json has primers at the top level separate from features, e.g. parsedSequence: { primers: [ {}, {} ], features: [ {}, {} ] }
|
|
238
|
+
// if options.primersAsFeatures is set to true, primers are included in features with type set to primer
|
|
239
|
+
|
|
240
|
+
results.forEach((result) => {
|
|
241
|
+
if (result.success) {
|
|
242
|
+
const sequence = result.parsedSequence;
|
|
243
|
+
sequence.features.forEach((feat) => {
|
|
244
|
+
if (feat.type === "primer") {
|
|
245
|
+
feat.type = "primer_bind";
|
|
246
|
+
}
|
|
247
|
+
});
|
|
248
|
+
|
|
249
|
+
if (!options.primersAsFeatures) {
|
|
250
|
+
sequence.primers = sequence.features.filter(
|
|
251
|
+
(feat) => feat.type === "primer_bind"
|
|
252
|
+
);
|
|
253
|
+
sequence.features = sequence.features.filter(
|
|
254
|
+
(feat) => feat.type !== "primer_bind"
|
|
255
|
+
);
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
});
|
|
259
|
+
|
|
260
|
+
return results;
|
|
261
|
+
|
|
262
|
+
/**
 * Closes the record currently being parsed: resets the LOCUS flag, runs the
 * feature post-processing and appends the record to `resultsArray`.
 * When no record exists, a bare `{ success: false }` is appended instead.
 */
function endSeq() {
  // the next record has to present its own LOCUS line
  hasFoundLocus = false;
  postProcessCurSeq();
  const finished = result || { success: false };
  resultsArray.push(finished);
}
|
|
269
|
+
|
|
270
|
+
/**
 * Returns the feature most recently added to the record being parsed.
 *
 * @returns {Object|undefined} the last entry of
 *   `result.parsedSequence.features`, or undefined when the list is empty
 */
function getCurrentFeature() {
  const { features } = result.parsedSequence;
  return features[features.length - 1];
}
|
|
275
|
+
|
|
276
|
+
/**
 * Records a warning/error message on the record being parsed, skipping
 * messages that are already present.
 *
 * The previous check was `indexOf(msg === -1)`, which looked up the boolean
 * `false` and therefore never filtered anything.
 *
 * @param {string} msg - message text
 * @returns {number|undefined} the new length of `result.messages` when the
 *   message was added, undefined when it was a duplicate
 */
function addMessage(msg) {
  if (result.messages.indexOf(msg) === -1) {
    return result.messages.push(msg);
  }
}
|
|
281
|
+
|
|
282
|
+
/**
 * Runs postProcessGenbankFeature over every feature of the current record,
 * replacing each entry in place. Does nothing when there is no record or the
 * record has no feature list.
 */
function postProcessCurSeq() {
  const features =
    result && result.parsedSequence && result.parsedSequence.features;
  if (!features) {
    return;
  }
  features.forEach((feature, index) => {
    features[index] = postProcessGenbankFeature(feature);
  });
}
|
|
291
|
+
|
|
292
|
+
/**
 * Appends the residues of one ORIGIN-section line to the current sequence.
 * The ORIGIN keyword line itself carries no residues and is skipped.
 *
 * @param {string} line - raw line from the ORIGIN section
 * @param {string} key - first token of the line (see getLineKey)
 */
function parseOrigin(line, key) {
  if (key === genbankAnnotationKey.ORIGIN_TAG) {
    return;
  }
  // drop the position counters and the spacing between residue groups
  result.parsedSequence.sequence += line.replace(/[\s]*[0-9]*/g, "");
}
|
|
298
|
+
|
|
299
|
+
/**
 * Starts a new record from a LOCUS line and fills in the header fields that
 * can be read from it: name, topology, date, division and molecule type.
 *
 * Side effects: replaces `result` with a fresh sequence from
 * createInitialSequence, and writes `isProtein`, `sequenceTypeFromLocus`,
 * `isSingleStrandedDNA` and `isDoubleStrandedRNA` onto the shared `options`.
 *
 * @param {string} line - the LOCUS text (all LOCUS lines read so far, joined)
 */
function parseLocus(line) {
  result = createInitialSequence(options);
  const tokens = line.split(/[\s]+/g);

  if (tokens.length <= 1) {
    console.warn(
      "Parsing GenBank File: WARNING! Locus line contains no values!"
    );
    // TODO
    addMessage("Import Warning: Locus line contains no values: " + line);
  }
  const locusName = tokens[1];

  let circular;
  let gbDivision;
  let date;

  // the first token is skipped; every later token is checked for each field
  tokens.slice(1).forEach((token, offset) => {
    const position = offset + 1;

    // topology
    if (token.match(/circular/gi)) {
      circular = true;
    } else if (token.match(/linear/gi)) {
      circular = false;
    }

    // date, in the form 1-APR-2012
    if (token.match(/-[A-Z]{3}-/g)) {
      date = token;
    }

    // an "aa" unit in position 3 marks a protein record
    if (position === 3 && token.match(/aa/i)) {
      options.sequenceTypeFromLocus = token;
      options.isProtein = true;
    }

    // molecule type is expected in position 4
    const isNucleicAcidType =
      token.match(/ds-dna/i) ||
      token.match(/ss-dna/i) ||
      token.match(/dna/i) ||
      token.match(/rna/i);
    if (position === 4 && isNucleicAcidType) {
      if (options.isProtein === undefined) {
        options.isProtein = false;
      }
      options.sequenceTypeFromLocus = token;
      if (token.match(/ss-dna/i)) {
        options.isSingleStrandedDNA = true;
      }
      if (token.match(/rna/i) && !token.match(/ss-rna/i)) {
        options.isDoubleStrandedRNA = true;
      }
    }

    // division code
    const division = token.toUpperCase();
    if (gbDivisions[division]) {
      gbDivision = division;
    }
  });

  // "Exported" is only used as a name when nothing better is available
  const sequence = result.parsedSequence;
  if (locusName !== "Exported" || sequence.name === untitledSequenceName) {
    sequence.name = locusName;
  }
  sequence.gbDivision = gbDivision;
  sequence.sequenceTypeFromLocus = options.sequenceTypeFromLocus;
  sequence.isSingleStrandedDNA = options.isSingleStrandedDNA;
  sequence.isDoubleStrandedRNA = options.isDoubleStrandedRNA;
  sequence.date = date;
  sequence.circular = circular;
}
|
|
378
|
+
|
|
379
|
+
/**
 * Keeps a line that has no dedicated handler by appending it verbatim to
 * `extraLines` of the current record, creating the list on first use.
 *
 * @param {string} line - raw line to keep
 * @throws {Error} when the current result has no parsedSequence
 */
function extractExtraLine(line) {
  const sequence = result.parsedSequence;
  if (!sequence) {
    throw new Error(
      "no sequence yet created upon which to extract an extra line!"
    );
  }
  if (!sequence.extraLines) {
    sequence.extraLines = [];
  }
  sequence.extraLines.push(line);
}
|
|
391
|
+
// State shared between successive parseFeatures calls.
// These must stay `var`: the declarations sit after the enclosing function's
// `return`, so they are never executed. `var` is hoisted and starts out as
// undefined, whereas `let`/`const` would never be initialized and would throw.
/* eslint-disable no-var */
var lastLineWasFeaturesTag;
var lastLineWasLocation;
/* eslint-enable no-var*/

/**
 * Handles one line of the FEATURES section. A line is one of:
 *  - the FEATURES header itself,
 *  - a run-on line continuing the previous location or qualifier,
 *  - a qualifier line (starting with "/"),
 *  - a new feature line holding the feature type and its location.
 *
 * @param {string} line - raw line
 * @param {string} key - first token of the line; used as the feature type
 * @param {string} val - remainder of the line; used as the location text
 */
function parseFeatures(line, key, val) {
  if (key === genbankAnnotationKey.FEATURES_TAG) {
    lastLineWasFeaturesTag = true;
    return;
  }

  if (lastLineWasFeaturesTag) {
    // the first line after the header fixes the indentation of feature lines
    featureLocationIndentation =
      getLengthOfWhiteSpaceBeforeStartOfLetters(line);
    lastLineWasFeaturesTag = false;
  }

  if (isFeatureLineRunon(line, featureLocationIndentation)) {
    if (lastLineWasLocation) {
      // continuation of a location that wrapped onto this line
      parseFeatureLocation(line.trim(), options);
      lastLineWasLocation = true;
      return;
    }
    if (currentFeatureNote) {
      // continuation of a qualifier value: strip only the leading padding,
      // trailing spaces can be word spacing that must be kept
      const lastIndex = currentFeatureNote.length - 1;
      currentFeatureNote[lastIndex] += line.trimLeft().replace(/"/g, "");
    }
    lastLineWasLocation = false;
    return;
  }

  if (isNote(line)) {
    // qualifiers of a feature that was skipped have nothing to attach to
    if (getCurrentFeature()) {
      parseFeatureNote(line);
      lastLineWasLocation = false;
    }
    return;
  }

  // anything else opens a new feature: "<type>   <location>"
  const strand = val.match(/complement/g) ? -1 : 1;
  newFeature();
  const feat = getCurrentFeature();
  feat.type = key;
  feat.strand = strand;

  parseFeatureLocation(val, options);
  lastLineWasLocation = true;
}
|
|
461
|
+
|
|
462
|
+
/**
 * Appends an empty feature (no locations, no notes) to the current record;
 * it becomes the feature returned by getCurrentFeature.
 */
function newFeature() {
  const blankFeature = { locations: [], notes: {} };
  result.parsedSequence.features.push(blankFeature);
}
|
|
468
|
+
|
|
469
|
+
/**
 * Tells whether a FEATURES-section line is a qualifier, i.e. its first
 * non-whitespace character is "/" (e.g. `/label=foo`, `/pseudo`).
 *
 * The former second check (a `/key=value` regex) could only match lines that
 * already passed this test, so it has been removed.
 *
 * @param {string} line - raw line
 * @returns {boolean} true for qualifier lines
 */
function isNote(line) {
  return line.trim().charAt(0) === "/";
}
|
|
483
|
+
|
|
484
|
+
/**
 * Extracts the ranges from a GenBank location string and appends them to the
 * `locations` of the current feature.
 *
 * Positions are converted from GenBank's 1-based numbering to 0-based unless
 * `inclusive1BasedStart` / `inclusive1BasedEnd` ask to keep them as written.
 * For protein records each range is passed through
 * convertAACaretPositionOrRangeToDna.
 *
 * The string is split on "," first, so each location is handled on its own:
 * a single-base location inside a join (e.g. `join(7,10..20)`) no longer
 * borrows the first number of the next range as its end. Within a segment the
 * numbers are still paired in order, and a lone number N is read as N..N
 * before the start/end offsets are applied.
 *
 * @param {string} locStr - location text, e.g. `complement(join(1..5,7..9))`
 * @param {Object} options - parser options; only `isProtein` is read here
 */
function parseFeatureLocation(locStr, options) {
  const startOffset = inclusive1BasedStart ? 0 : 1;
  const endOffset = inclusive1BasedEnd ? 0 : 1;

  locStr
    .trim()
    .split(",")
    .forEach(function (segment) {
      const positions = segment.match(/\d+/g) || [];
      for (let i = 0; i < positions.length; i += 2) {
        const rawStart = parseInt(positions[i], 10);
        // no end supplied: "147" behaves like "147..147"
        const rawEnd =
          i + 1 < positions.length ? parseInt(positions[i + 1], 10) : rawStart;
        const location = {
          start: rawStart - startOffset,
          end: rawEnd - endOffset,
        };
        const feat = getCurrentFeature();
        feat.locations.push(
          options.isProtein
            ? convertAACaretPositionOrRangeToDna(location)
            : location
        );
      }
    });
}
|
|
513
|
+
|
|
514
|
+
/**
 * Parses a qualifier line (`/key=value` or `/key="value"`) and stores the
 * value in the `notes` of the current feature. Each key maps to an array, so
 * repeated qualifiers accumulate. `currentFeatureNote` is pointed at that
 * array so run-on lines can extend its last entry.
 *
 * Value handling: backslashes become spaces; for quoted values everything
 * from the first remaining quote onward is dropped; unquoted all-digit values
 * are converted to numbers.
 *
 * Existing keys are detected with an own-property check, so a qualifier named
 * like an inherited Object member (e.g. `/constructor=`) is stored like any
 * other instead of throwing on `.push`.
 *
 * @param {string} line - raw qualifier line
 */
function parseFeatureNote(line) {
  // strip only the leading padding; trailing spaces may be word spacing
  const stripped = line.trimLeft().replace(/^\/|"$/g, "");
  const parts = stripped.split(/="|=/);
  const key = parts[0];

  let val = parts.slice(1).join("=");
  if (val) {
    val = val.replace(/\\/g, " ");
    if (line.match(/="/g)) {
      val = val.replace(/".*/g, "");
    } else if (val.match(/^\d+$/g)) {
      val = parseInt(val, 10);
    }
  }

  const currentNotes = getCurrentFeature().notes;
  if (Object.prototype.hasOwnProperty.call(currentNotes, key)) {
    currentNotes[key].push(val);
  } else {
    currentNotes[key] = [val];
  }
  currentFeatureNote = currentNotes[key];
}
|
|
545
|
+
|
|
546
|
+
/**
 * Returns the key of a line: the text before "=" for qualifier lines
 * (`/label=foo` -> "/label"), otherwise the first whitespace-delimited token
 * (`LOCUS       name ...` -> "LOCUS"). Leading whitespace is ignored.
 *
 * Splitting on "=" is limited to lines that start with "/". Previously any "="
 * on the line triggered it, so `COMMENT     pH=7` produced the key
 * "COMMENT     pH" and was not recognized as a COMMENT line.
 *
 * @param {string} line - raw line
 * @returns {string} the key, or "" for a blank line
 */
function getLineKey(line) {
  const trimmed = line.replace(/^[\s]*/, "");
  const isQualifier = trimmed.charAt(0) === "/" && trimmed.indexOf("=") >= 0;
  return trimmed.split(isQualifier ? /=/ : /[\s]+/)[0];
}
|
|
558
|
+
|
|
559
|
+
/**
 * Returns the value part of a line: everything after the first "=" for
 * qualifier lines (lines whose first non-whitespace character is "/"),
 * otherwise the line with its first token and surrounding whitespace removed.
 *
 * Two fixes over the previous version:
 *  - "=" characters inside a qualifier value are kept (they used to be dropped);
 *  - only qualifier lines are split on "=". A wrapped text line ending in "="
 *    used to yield "", which the caller treats as "no location" and discards.
 *
 * @param {string} line - raw line
 * @returns {string} the value text; "" when a qualifier has nothing after "="
 */
function getLineVal(line) {
  const isQualifier = line.trim().charAt(0) === "/" && line.indexOf("=") >= 0;
  if (isQualifier) {
    return line.split(/=/).slice(1).join("=");
  }
  // drops the leading "<key><spaces>"; a single-token line is returned trimmed
  return line.replace(/^[\s]*[\S]+[\s]+|[\s]+$/, "").trim();
}
|
|
570
|
+
|
|
571
|
+
/**
 * Tells whether a line starts a top-level keyword, i.e. its very first
 * character is not whitespace.
 *
 * @param {string} line - raw line
 * @returns {boolean} true when the line begins with a non-whitespace character
 */
function isKeyword(line) {
  const keywordColumns = line.slice(0, 10);
  return /^[\S]+/.test(keywordColumns);
}
|
|
578
|
+
|
|
579
|
+
/**
 * Tells whether a line holds an indented sub-keyword: whitespace followed by
 * a non-whitespace character, all within the first ten characters.
 *
 * @param {string} line - raw line
 * @returns {boolean} true for lines such as "  ORGANISM  ..."
 */
function isSubKeyword(line) {
  const keywordColumns = line.slice(0, 10);
  return /^[\s]+[\S]+/.test(keywordColumns);
}
|
|
586
|
+
|
|
587
|
+
/**
 * Tells whether a line is a continuation of a keyword value: its first ten
 * characters are all whitespace. Lines shorter than ten characters never
 * qualify.
 *
 * @param {string} line - raw line
 * @returns {boolean} true for run-on lines
 */
function isKeywordRunon(line) {
  const keywordColumns = line.slice(0, 10);
  return /[\s]{10}/.test(keywordColumns);
}
|
|
596
|
+
|
|
597
|
+
/**
 * Finishes a parsed feature: derives its display name from its qualifiers and
 * turns a `direction` qualifier into `arrowheadType`.
 *
 * The name comes from the first qualifier present, in this order: label, gene,
 * ApEinfo_label, name, organism, locus_tag, note. It falls back to
 * "Untitled Feature" when none is present or the chosen value is empty.
 * Names are capped at 100 characters; a warning is recorded for the cut,
 * except when the name came from `note`, which is shortened silently.
 *
 * Qualifier values may be numbers (unquoted all-digit values are parsed as
 * integers), so they are converted to strings before any string method is
 * used. Previously `/note=123` or `/direction=1` threw a TypeError.
 *
 * @param {Object} feat - feature with a `notes` map of qualifier arrays
 * @returns {Object} the same feature object, updated in place
 */
function postProcessGenbankFeature(feat) {
  const MAX_NAME_LENGTH = 100;
  const nameSources = [
    "label",
    "gene",
    "ApEinfo_label",
    "name",
    "organism",
    "locus_tag",
    "note",
  ];

  let sourceKey;
  for (let i = 0; i < nameSources.length; i++) {
    if (feat.notes[nameSources[i]]) {
      sourceKey = nameSources[i];
      break;
    }
  }

  let name = sourceKey ? feat.notes[sourceKey][0] : "Untitled Feature";
  name = typeof name === "string" ? name : String(name);
  if (sourceKey === "note") {
    // names taken from a note are shortened without a warning
    name = name.slice(0, MAX_NAME_LENGTH);
  }
  if (!name) {
    name = "Untitled Feature";
  }
  if (name.length > MAX_NAME_LENGTH) {
    // shorten the name to a reasonable length and tell the user about it
    const oldName = name;
    name = name.slice(0, MAX_NAME_LENGTH);
    addMessage(
      `Warning: Shortening name of feature ${oldName} (max 100 chars)`
    );
  }
  feat.name = name;

  if (feat.notes.direction) {
    const direction = String(feat.notes.direction[0]).toUpperCase();
    feat.arrowheadType =
      direction === "BOTH" ? "BOTH" : direction === "NONE" ? "NONE" : undefined;
    delete feat.notes.direction;
  }
  return feat;
}
|
|
640
|
+
}
|
|
641
|
+
|
|
642
|
+
/**
 * Tells whether a FEATURES-section line continues the previous line (a
 * wrapped location or qualifier value) rather than starting something new.
 *
 * A line is NOT a run-on when either
 *  - it is indented exactly like feature lines (the indentation measured on
 *    the first line after the FEATURES header), or
 *  - its first non-whitespace character is "/" (it starts a qualifier).
 * Everything else is a run-on.
 *
 * Example (indentation is what matters):
 *   FEATURES             Location/Qualifiers
 *        rep_origin      complement(1074..3302)      <-- feature line
 *                        /label=pSC101**             <-- qualifier
 *                        /note="REP_ORIGIN pSC101* aka pMPP6, gives plasm
 *                        id number 3 -4 copies per cell"   <-- run-on line
 *
 * @param {string} line - raw line
 * @param {number} featureLocationIndentation - indentation of feature lines
 * @returns {boolean} true when the line is a run-on
 */
function isFeatureLineRunon(line, featureLocationIndentation) {
  const lineIndentation = getLengthOfWhiteSpaceBeforeStartOfLetters(line);
  const startsNewFeature = lineIndentation === featureLocationIndentation;
  if (startsNewFeature) {
    return false;
  }
  const startsQualifier = Boolean(line.trim().charAt(0).match(/\//));
  return !startsQualifier;
}
|
|
678
|
+
|
|
679
|
+
/**
 * Counts the whitespace characters at the start of a string.
 *
 * @param {string} string - text to measure
 * @returns {number} length of the leading whitespace run (0 when there is none)
 */
function getLengthOfWhiteSpaceBeforeStartOfLetters(string) {
  const leadingWhitespace = /^\s*/.exec(string);
  return leadingWhitespace === null ? 0 : leadingWhitespace[0].length;
}
|
|
687
|
+
|
|
688
|
+
export default genbankToJson;
|