@teselagen/bio-parsers 0.1.26 → 0.1.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +24219 -39924
- package/index.mjs +24238 -39921
- package/index.umd.js +32684 -48391
- package/package.json +3 -7
- package/src/ab1ToJson.js +177 -0
- package/src/anyToJson.js +225 -0
- package/src/fastaToJson.js +101 -0
- package/src/genbankToJson.d.__ts +20 -0
- package/src/genbankToJson.js +688 -0
- package/src/geneiousXmlToJson.js +147 -0
- package/src/gffToJson.js +43 -0
- package/src/index.js +23 -0
- package/src/jbeiXmlToJson.js +109 -0
- package/src/jsonToBed.js +39 -0
- package/src/jsonToFasta.js +33 -0
- package/src/jsonToGenbank.js +423 -0
- package/src/jsonToJsonString.js +26 -0
- package/src/sbolXmlToJson.js +135 -0
- package/src/snapgeneToJson.js +245 -0
- package/src/utils/NameUtils.js +10 -0
- package/src/utils/ParserUtil.js +93 -0
- package/src/utils/cleanUpTeselagenJsonForExport.js +13 -0
- package/src/utils/constants.js +24 -0
- package/src/utils/convertOldSequenceDataToNewDataType.js +64 -0
- package/src/utils/createInitialSequence.js +14 -0
- package/src/utils/extractFileExtension.js +14 -0
- package/src/utils/flattenSequenceArray.js +17 -0
- package/src/utils/getArrayBufferFromFile.js +32 -0
- package/src/utils/isBrowser.js +1 -0
- package/src/utils/parseUracilFeatures.js +13 -0
- package/src/utils/pragmasAndTypes.js +21 -0
- package/src/utils/searchWholeObjByName.js +98 -0
- package/src/utils/splitStringIntoLines.js +13 -0
- package/src/utils/unmangleUrls.js +34 -0
- package/src/utils/validateSequence.js +349 -0
- package/src/utils/validateSequenceArray.js +20 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
 * Restore URLs that were wrapped as `%%TG%%_<uri-encoded url>_%%TG%%` tokens
 * (the format emitted by `mangleOrStripUrls` when `mangleUrls` is set).
 *
 * @param {*} str Value to unmangle. Falsy values and non-strings are returned unchanged.
 * @returns {*} For strings: the input with every token replaced by its
 *   URI-decoded contents. Tokens that are empty or that contain malformed
 *   percent-encoding are left in place exactly as they were.
 */
export function unmangleUrls(str) {
  if (!str) return str;
  if (typeof str !== "string") return str;

  const urlRegex = /%%TG%%_(.*?)_%%TG%%/g;
  return str.replace(urlRegex, function (outer, innerUrl) {
    if (!innerUrl) return outer;
    try {
      return decodeURIComponent(innerUrl);
    } catch (err) {
      // decodeURIComponent throws URIError on malformed escapes (e.g. a lone "%").
      // A single bad token should not abort the whole import, so keep it verbatim.
      if (err instanceof URIError) return outer;
      throw err;
    }
  });
}
|
|
13
|
+
|
|
14
|
+
/**
 * Mangle, strip, or preserve URLs (anything starting with `http://`,
 * `https://` or `www.` up to the next whitespace) found in a string.
 *
 * @param {*} str Value to process. Falsy values and non-strings are returned unchanged.
 * @param {object} [options]
 * @param {boolean} [options.mangleUrls] Replace each URL with
 *   `%%TG%%_<encodeURIComponent(url)>_%%TG%%` so it can later be restored by `unmangleUrls`.
 * @param {boolean} [options.doNotMangleOrStripUrls] Return the string untouched.
 *   Takes precedence over `mangleUrls`.
 * @returns {*} The processed string. With no options set, all URLs are removed.
 */
export function mangleOrStripUrls(
  str,
  { mangleUrls, doNotMangleOrStripUrls } = {}
) {
  if (!str) return str;
  // Same guard as unmangleUrls: numbers/arrays/objects have no .replace and would throw.
  if (typeof str !== "string") return str;

  //if doNotMangleOrStripUrls=true, just return the original string
  if (doNotMangleOrStripUrls) return str;

  const urlRegex = /(((https?:\/\/)|(www\.))[^\s]+)/g;

  //if mangleUrls=true, return a URL mangled and encoded string
  if (mangleUrls) {
    return str.replace(
      urlRegex,
      (url) => `%%TG%%_${encodeURIComponent(url)}_%%TG%%`
    );
  }

  //if no options passed, strip all URLs from the string
  return str.replace(urlRegex, "");
}
|
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
import areNonNegativeIntegers from "validate.io-nonnegative-integer-array";
|
|
2
|
+
import { getFeatureTypes } from "@teselagen/sequence-utils";
|
|
3
|
+
import {
|
|
4
|
+
filterAminoAcidSequenceString,
|
|
5
|
+
filterSequenceString,
|
|
6
|
+
guessIfSequenceIsDnaAndNotProtein,
|
|
7
|
+
} from "@teselagen/sequence-utils";
|
|
8
|
+
import { filter, some, upperFirst } from "lodash";
|
|
9
|
+
import pragmasAndTypes from "./pragmasAndTypes.js";
|
|
10
|
+
import { forEach } from "lodash";
|
|
11
|
+
import { map } from "lodash";
|
|
12
|
+
import { unmangleUrls } from "./unmangleUrls";
|
|
13
|
+
import { reformatName } from "./NameUtils.js";
|
|
14
|
+
|
|
15
|
+
//validation checking
|
|
16
|
+
/**
 * Validation and sanitizing of our teselagen sequence data type.
 *
 * NOTE: `sequence` and its feature objects are mutated in place; the very same
 * object is handed back as `validatedAndCleanedSequence`.
 *
 * @param {object} sequence Our teselagen sequence data type
 * @param {object} [options]
 * @param {boolean} [options.isProtein] Treat the sequence as amino acids. When
 *   left `undefined` and `guessIfProtein` is truthy, it is derived from
 *   `guessIfSequenceIsDnaAndNotProtein`.
 * @param {boolean} [options.isOligo] When truthy, `u`/`U` are NOT rewritten to `t`/`T`.
 * @param {boolean} [options.guessIfProtein] See `isProtein`.
 * @param {*} [options.guessIfProteinOptions] Forwarded to `guessIfSequenceIsDnaAndNotProtein`.
 * @param {boolean} [options.reformatSeqName] Run the name through `reformatName`.
 * @param {boolean} [options.inclusive1BasedStart] Shifts the feature-start bounds checks by one.
 * @param {boolean} [options.inclusive1BasedEnd] Shifts the feature-end bounds checks by one.
 * @param {*} [options.additionalValidChars] Forwarded to `filterSequenceString`.
 * @param {boolean} [options.allowOverflowAnnotations] Skip the start/end range checks.
 * @param {boolean} [options.coerceFeatureTypes] Replace unrecognized feature types with
 *   `misc_feature`, keeping the original value under `feature.notes.featureType`.
 * @param {boolean} [options.acceptParts] (and other `accept<Type>` keys derived from
 *   `pragmasAndTypes`) Set to `false` to keep pragma-tagged features in `features`
 *   instead of moving them to `sequence[type]`.
 * @return {{validatedAndCleanedSequence: object, messages: string[]}}
 * @throws {Error} "Invalid sequence" when `sequence` is falsy or not an object
 */
export default function validateSequence(sequence, options = {}) {
  const {
    isOligo,
    guessIfProtein,
    guessIfProteinOptions,
    reformatSeqName,
    inclusive1BasedStart,
    inclusive1BasedEnd,
    additionalValidChars,
    allowOverflowAnnotations,
    coerceFeatureTypes,
  } = options;
  // isProtein may be filled in by the guess below, hence the separate `let`
  let { isProtein } = options;
  const response = {
    validatedAndCleanedSequence: {},
    messages: [],
  };
  if (!sequence || typeof sequence !== "object") {
    throw new Error("Invalid sequence");
  }
  if (!sequence.name) {
    //we'll handle transferring the file name outside of this function
    //for now just set it to a blank string
    sequence.name = "";
  }
  if (!sequence.extraLines) {
    sequence.extraLines = [];
  }
  if (!sequence.comments) {
    sequence.comments = [];
  }
  if (sequence.description) {
    sequence.description = unmangleUrls(sequence.description);
  }
  const oldName = sequence.name;
  if (reformatSeqName) {
    sequence.name = reformatName(sequence.name);
  }
  if (oldName !== sequence.name) {
    response.messages.push(
      "Name (" + oldName + ") reformatted to " + sequence.name
    );
  }

  if (Array.isArray(sequence.sequence)) {
    sequence.sequence = sequence.sequence.join("");
  }
  if (!sequence.sequence) {
    response.messages.push("No sequence detected");
    sequence.sequence = "";
  }
  let validChars;
  if (isProtein === undefined && guessIfProtein) {
    isProtein = !guessIfSequenceIsDnaAndNotProtein(
      sequence.sequence,
      guessIfProteinOptions
    );
  }
  if (isProtein) {
    //tnr: add code to strip invalid protein data..
    validChars = filterAminoAcidSequenceString(sequence.sequence);
    if (validChars !== sequence.sequence) {
      sequence.sequence = validChars;
      response.messages.push(
        "Import Error: Illegal character(s) detected and removed from amino acid sequence. Allowed characters are: xtgalmfwkqespvicyhrndu"
      );
    }
    sequence.type = "PROTEIN";
    sequence.isProtein = true;
    if (!sequence.proteinSequence) {
      sequence.proteinSequence = sequence.sequence;
    }
    sequence.proteinSize = sequence.proteinSequence.length;
  } else {
    //todo: this logic won't catch every case of RNA, so we should probably handle RNA conversion at another level..
    const temp = sequence.sequence;
    if (!isOligo) {
      sequence.sequence = sequence.sequence.replace(/u/gi, (u) =>
        u === "U" ? "T" : "t"
      );
    }
    // if any u was rewritten above, the input is considered RNA
    if (temp !== sequence.sequence) {
      sequence.type = "RNA";
    } else {
      sequence.type = "DNA";
    }

    validChars = filterSequenceString(sequence.sequence, additionalValidChars);
    if (validChars !== sequence.sequence) {
      sequence.sequence = validChars;
      response.messages.push(
        "Import Error: Illegal character(s) detected and removed from sequence. Allowed characters are: atgcyrswkmbvdhn"
      );
    }
  }

  if (!sequence.size) {
    sequence.size = isProtein
      ? sequence.proteinSequence.length * 3
      : sequence.sequence.length;
  }
  // only an explicit false / "false" / -1 counts as "explicitly linear";
  // a missing value is treated as implicitly linear and may be flipped below
  let circularityExplicitlyDefined;
  if (
    sequence.circular === false ||
    sequence.circular === "false" ||
    sequence.circular === -1
  ) {
    sequence.circular = false;
    circularityExplicitlyDefined = true;
  } else if (!sequence.circular) {
    sequence.circular = false;
    circularityExplicitlyDefined = false;
  } else {
    sequence.circular = true;
  }

  if (!sequence.features || !Array.isArray(sequence.features)) {
    response.messages.push("No valid features detected");
    sequence.features = [];
  }
  //tnr: maybe this should be wrapped in its own function (in case we want to use it elsewhere)
  sequence.features = sequence.features.filter(function (feature) {
    if (!feature || typeof feature !== "object") {
      response.messages.push("Invalid feature detected and removed");
      return false;
    }
    feature.start = parseInt(feature.start, 10);
    feature.end = parseInt(feature.end, 10);

    if (!feature.name || typeof feature.name !== "string") {
      response.messages.push(
        'Unable to detect valid name for feature, setting name to "Untitled Feature"'
      );
      feature.name = "Untitled Feature";
    }
    if (
      !allowOverflowAnnotations &&
      (!areNonNegativeIntegers([feature.start]) ||
        feature.start > sequence.size - (inclusive1BasedStart ? 0 : 1))
    ) {
      response.messages.push(
        "Invalid feature start: " +
          feature.start +
          " detected for " +
          feature.name +
          " and set to 1"
      ); //setting it to 0 internally, but users will see it as 1
      feature.start = 0;
    }
    if (
      !allowOverflowAnnotations &&
      (!areNonNegativeIntegers([feature.end]) ||
        feature.end > sequence.size - (inclusive1BasedEnd ? 0 : 1))
    ) {
      // remember the offending value so the message reports it rather than the corrected one
      const invalidEnd = feature.end;
      feature.end = Math.max(sequence.size - 1, inclusive1BasedEnd ? 0 : 1);
      response.messages.push(
        "Invalid feature end: " +
          invalidEnd +
          " detected for " +
          feature.name +
          " and set to " +
          (feature.end + 1)
      );
    }

    if (
      feature.start - (inclusive1BasedStart ? 0 : 1) >
        feature.end - (inclusive1BasedEnd ? 0 : 1) &&
      sequence.circular === false
    ) {
      if (circularityExplicitlyDefined) {
        response.messages.push(
          "Invalid circular feature detected in explicitly linear sequence. " +
            feature.name +
            ". start set to 1"
        ); //setting it to 0 internally, but users will see it as 1
        feature.start = 0;
      } else {
        response.messages.push(
          "Circular feature detected in implicitly linear sequence. Setting sequence to be circular."
        );
        sequence.circular = true;
      }
    }

    // Inspect the raw strand BEFORE numeric parsing: false, "false" and "-"
    // all parse to NaN and would otherwise be mistaken for the forward strand.
    const rawStrand = feature.strand;
    const isReverseStrand =
      parseInt(rawStrand, 10) === -1 ||
      rawStrand === false ||
      rawStrand === "false" ||
      rawStrand === "-";
    feature.strand = isReverseStrand ? -1 : 1;

    let invalidFeatureType;
    if (
      feature.type &&
      typeof feature.type === "string" &&
      feature.type.toLowerCase() === "primer"
    ) {
      feature.type = "primer_bind";
    }
    if (
      !feature.type ||
      typeof feature.type !== "string" ||
      !getFeatureTypes({ includeHidden: true }).some(function (featureType) {
        if (featureType.toLowerCase() === feature.type.toLowerCase()) {
          feature.type = featureType; //this makes sure the feature.type is being set to the exact value of the accepted featureType
          return true;
        }
        return false;
      })
    ) {
      // unrecognized types are only overwritten when coercion is requested
      // (or when there is no type at all)
      if (coerceFeatureTypes || !feature.type) {
        response.messages.push(
          'Invalid feature type detected:  "' +
            feature.type +
            '" within ' +
            feature.name +
            ". set type to misc_feature"
        );
        if (typeof feature.type === "string") {
          invalidFeatureType = feature.type;
        }
        feature.type = "misc_feature";
      }
    }
    if (!feature.notes) {
      feature.notes = {};
    }
    //if the original feature type was invalid, push it onto the notes object under featureType
    if (invalidFeatureType) {
      if (!feature.notes.featureType) {
        feature.notes.featureType = [];
      }
      feature.notes.featureType.push(invalidFeatureType);
    }
    if (feature.notes.label) {
      //we've already used the label as the name by default if both gene and label were present
      delete feature.notes.label;
    } else if (feature.notes.gene) {
      //gene was used for name (if it existed)
      delete feature.notes.gene;
    } else if (feature.notes.name) {
      //name was used for name (if it existed)
      delete feature.notes.name;
    }
    if (feature.notes.color) {
      feature.color = feature.notes.color[0] || feature.color;
      delete feature.notes.color;
    }
    if (feature.notes.labelColor) {
      feature.labelColor = feature.notes.labelColor[0] || feature.labelColor;
      delete feature.notes.labelColor;
    }
    if (
      feature.notes.pragma &&
      some(feature.notes.pragma, (p) => p === "overlapsSelf")
    ) {
      feature.overlapsSelf = true;
      feature.notes.pragma = filter(
        feature.notes.pragma,
        (p) => p !== "overlapsSelf"
      );
    }
    if (feature.notes.note) {
      // the first note containing "sequence:" supplies feature.bases
      some(feature.notes.note, (n) => {
        if (
          n &&
          typeof n === "string" &&
          n.toLowerCase().includes("sequence:")
        ) {
          //remove it after we've parsed it out
          feature.notes.note = filter(
            feature.notes.note,
            (p) =>
              p &&
              // non-string notes cannot carry a "sequence:" marker; keep them
              (typeof p !== "string" ||
                !p.toLowerCase().includes("sequence:"))
          );
          if (feature.notes.note.length === 0) {
            delete feature.notes.note;
          }
          const match = n.match(/sequence:[ \r\n.]*[a-zA-Z]*/i);
          if (match && match[0]) {
            // strip the marker case-insensitively, matching how it was detected
            feature.bases = match[0]
              .replace(/\s/g, "")
              .replace(/sequence:/i, "");
          }

          return true;
        }
        return false;
      });
    }

    if (feature.notes.primerBindsOn) {
      // callback never returns true, so every entry is visited and the last
      // truthy one wins
      some(feature.notes.primerBindsOn, (n) => {
        if (n) {
          feature.primerBindsOn = n;
          delete feature.notes.primerBindsOn;
        }
      });
    }

    for (const { pragma, type } of pragmasAndTypes) {
      if (
        options[`accept${upperFirst(type)}`] !== false && //acceptParts, acceptWarnings,
        feature.notes.pragma &&
        some(feature.notes.pragma, (p) => p === pragma)
      ) {
        if (!sequence[type]) {
          sequence[type] = []; //initialize an empty array if necessary
        }
        feature.type = type.slice(0, -1); //set the type before pushing it onto the array
        delete feature.notes.pragma;
        sequence[type].push(feature);
        return false; //don't include the features
      }
    }
    forEach(feature.notes, (noteArray, key) => {
      feature.notes[key] = map(noteArray, (note) => {
        return unmangleUrls(note);
      });
    });
    return true;
  });
  response.validatedAndCleanedSequence = sequence;
  return response;
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import validateSequence from './validateSequence.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Run `validateSequence` over every successful parsing result and merge the
 * validation messages back into each result.
 *
 * Results are updated in place. A single (non-array) result is wrapped in an array.
 *
 * @param {object|object[]} parsingResultArray One parsing result or a list of them.
 *   Entries that are null/undefined or whose `success` is falsy are left untouched.
 * @param {object} [options] Forwarded unchanged to `validateSequence`.
 * @returns {object[]|*} The array of results, or the input itself when it is falsy.
 */
export default function validateSequenceArray(parsingResultArray, options) {
  if (!parsingResultArray) return parsingResultArray;

  //wrap the parsingResult into an array if it isn't one already
  const results = Array.isArray(parsingResultArray)
    ? parsingResultArray
    : [parsingResultArray];

  //should convert the old data type to the new data type (flattened sequence)
  results.forEach(function (parsingResult) {
    if (!parsingResult || !parsingResult.success) return;
    const res = validateSequence(parsingResult.parsedSequence, options);
    //add any validation error messages to the parsed sequence results messages
    // [].concat tolerates a missing messages list (and a lone string message)
    parsingResult.messages = [].concat(
      parsingResult.messages || [],
      res.messages
    );
    parsingResult.parsedSequence = res.validatedAndCleanedSequence;
  });
  return results;
}
|