@teselagen/bio-parsers 0.1.26 → 0.1.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,423 @@
1
+ /* eslint-disable no-var*/
2
+ import { get, cloneDeep, map, each, isObject, flatMap } from "lodash";
3
+ import color from "color";
4
+
5
+ import pragmasAndTypes from "./utils/pragmasAndTypes.js";
6
+ import { mangleOrStripUrls } from "./utils/unmangleUrls.js";
7
+ import { reformatName } from "./utils/NameUtils.js";
8
+ import { getFeatureToColorMap } from "@teselagen/sequence-utils";
9
/**
 * Small string helpers used when laying out fixed-width GenBank columns.
 */
const StringUtil = {
  /** Trims white space at the beginning and end of a string
   * @param {string} line
   * @returns {string} line without leading/trailing white space
   */
  trim: function (line) {
    return line.replace(/^\s+|\s+$/g, "");
  },

  /** Trims white space at the beginning of a string
   * @param {string} line
   * @returns {string} line without leading white space
   */
  ltrim: function (line) {
    return line.replace(/^\s+/, "");
  },

  /** Trims white space at the end of a string
   * @param {string} line
   * @returns {string} line without trailing white space
   */
  rtrim: function (line) {
    return line.replace(/\s+$/, "");
  },

  /** Prepends padString to line until it is at least `length` characters long.
   * The result may exceed `length` when padString is longer than one character.
   * @param {string} line
   * @param {string} padString text to prepend repeatedly
   * @param {number} length minimum length of the result
   * @returns {string} the padded line
   */
  lpad: function (line, padString, length) {
    let str = line;
    // An empty pad can never make the string longer; bail out instead of looping forever.
    if (String(padString) === "") return str;
    while (str.length < length) str = padString + str;
    return str;
  },

  /** Appends padString to line until it is at least `length` characters long.
   * The result may exceed `length` when padString is longer than one character.
   * @param {string} line
   * @param {string} padString text to append repeatedly
   * @param {number} length minimum length of the result
   * @returns {string} the padded line
   */
  rpad: function (line, padString, length) {
    let str = line;
    // An empty pad can never make the string longer; bail out instead of looping forever.
    if (String(padString) === "") return str;
    while (str.length < length) str = str + padString;
    return str;
  },
};
54
+
55
// Maps a lookup path on a digest part annotation (left) to the note key
// under which that value is exported (right).
const DIGEST_PART_EXPORT_FIELD_MAP = {
  // general digest flags
  "isDigestPart": "isDigestPart",
  "isDigestValid": "isDigestValid",

  // 5' enzyme
  "re5Prime.name": "re5PrimeName",
  "re5Prime.recognitionRegex": "re5PrimePattern",
  "re5PrimeOverhang": "re5PrimeOverhang",
  "re5PrimeOverhangStrand": "re5PrimeOverhangStrand",
  "re5PrimeRecognitionTypeCode": "re5PrimeRecognitionTypeCode",

  // 3' enzyme
  "re3Prime.name": "re3PrimeName",
  "re3Prime.recognitionRegex": "re3PrimePattern",
  "re3PrimeOverhang": "re3PrimeOverhang",
  "re3PrimeOverhangStrand": "re3PrimeOverhangStrand",
  "re3PrimeRecognitionTypeCode": "re3PrimeRecognitionTypeCode",
};
69
+
70
/**
 * Returns the elements of an array between start (inclusive) and end
 * (exclusive), concatenated into a single string.
 * @param {Array} val
 * @param {number} start
 * @param {number} end
 * @returns {string}
 */
function cutUpArray(val, start, end) {
  const chunk = val.slice(start, end);
  return chunk.join("");
}
73
+
74
/**
 * Returns the part of a string between start (inclusive) and end (exclusive).
 * @param {string} val
 * @param {number} start
 * @param {number} end
 * @returns {string}
 */
function cutUpStr(val, start, end) {
  const chunk = val.slice(start, end);
  return chunk;
}
77
+
78
/**
 * Serializes a sequence JSON object into GenBank text.
 *
 * The input is deep-cloned before any processing, and the options object is
 * shallow-copied, so neither argument is modified by this function.
 *
 * @param {object} _serSeq sequence data (name, sequence, features, ...)
 * @param {object} [options] export options; `reformatSeqName` defaults to
 *   true unless explicitly set to false. The options are also forwarded to the
 *   locus, feature and URL-mangling helpers.
 * @returns {string|false} the GenBank text (lines joined with "\r\n"), or
 *   false when no sequence was given or an error occurred while serializing
 */
export default function (_serSeq, options) {
  // Copy the options: the defaults and the isProtein flag set below must not
  // leak into the caller's object, otherwise a reused options object would
  // make a later DNA export be treated as protein.
  const opts = { ...options };
  opts.reformatSeqName = opts.reformatSeqName !== false;
  const serSeq = cloneDeep(_serSeq);
  if (!serSeq) return false;

  try {
    if (serSeq.isProtein || serSeq.type === "protein" || serSeq.type === "AA") {
      serSeq.isProtein = true;
      serSeq.sequence = serSeq.proteinSequence || serSeq.sequence;
      opts.isProtein = true;
    }
    // Chosen before the locus is built: a non-string sequence is sliced as an array.
    const cutUp = typeof serSeq.sequence === "string" ? cutUpStr : cutUpArray;
    if (!serSeq.sequence) serSeq.sequence = "";

    let lines = [];
    lines.push(createGenbankLocus(serSeq, opts));
    if (serSeq.definition || serSeq.description) {
      lines.push(
        "DEFINITION " +
          mangleOrStripUrls(serSeq.definition || serSeq.description, opts)
      );
    }

    if (serSeq.accession) {
      lines.push("ACCESSION " + serSeq.accession);
    }

    if (serSeq.version) {
      lines.push("VERSION " + serSeq.version);
    }

    if (serSeq.extraLines) {
      lines = lines.concat(serSeq.extraLines);
    }
    if (serSeq.comments) {
      serSeq.comments.forEach(function (comment) {
        lines.push("COMMENT " + comment);
      });
    }
    if (serSeq.teselagen_unique_id) {
      lines.push(
        "COMMENT teselagen_unique_id: " + serSeq.teselagen_unique_id
      );
    }
    if (serSeq.library) {
      lines.push("COMMENT library: " + serSeq.library);
    }

    let longestFeatureTypeLength = 15;

    // Fold every annotation collection listed in pragmasAndTypes into the
    // feature list, tagging each annotation with its pragma note when one is defined.
    serSeq.features = map(serSeq.features).concat(
      flatMap(pragmasAndTypes, ({ pragma, type }) => {
        return flatMap(serSeq[type], (ann) => {
          if (!isObject(ann)) {
            return [];
          }
          if (type === "primers") {
            ann.type = "primer_bind";
          }
          if (type === "parts" && ann.isDigestPart) {
            addDigestPartFieldsToNotes(ann);
          }
          ann.notes = pragma
            ? {
                ...ann.notes,
                pragma: [pragma],
              }
            : ann.notes;
          return ann;
        });
      })
    );
    serSeq.features.forEach(({ type }) => {
      if (type && type.length > longestFeatureTypeLength) {
        longestFeatureTypeLength = type.length;
      }
    });

    // The FEATURES header is only emitted when there is at least one feature.
    let printedFeatureHeader;
    each(serSeq.features, function (feat) {
      if (!printedFeatureHeader) {
        printedFeatureHeader = true;
        lines.push("FEATURES Location/Qualifiers");
      }
      lines.push(
        featureToGenbankString(feat, {
          ...opts,
          featurePadLength: longestFeatureTypeLength + 1,
        })
      );
    });

    // Sequence body: 60 symbols per line, in groups of 10, prefixed by the
    // 1-based position of the first symbol on the line.
    lines.push("ORIGIN ");
    for (let i = 0; i < serSeq.sequence.length; i = i + 60) {
      const line = [];
      const ind = StringUtil.lpad("" + (i + 1), " ", 9);
      line.push(ind);

      for (let j = i; j < i + 60; j = j + 10) {
        line.push(cutUp(serSeq.sequence, j, j + 10));
      }
      lines.push(line.join(" "));
    }

    lines.push("//");

    return lines.join("\r\n");
  } catch (e) {
    console.warn("Error processing sequence << Check jsonToGenbank.js");
    console.warn(serSeq);
    console.warn(e.stack);
    return false;
  }
}
197
+
198
/**
 * Builds the LOCUS header line for a sequence.
 *
 * Side effect: when `serSeq.sequence` is an object with a `symbols` string,
 * it is replaced in place by the array of its characters.
 *
 * @param {object} serSeq sequence data (name, sequence, type, circular, ...)
 * @param {object} options `reformatSeqName` controls whether the name is
 *   passed through reformatName
 * @returns {string} the LOCUS line (without a trailing line break)
 */
function createGenbankLocus(serSeq, options) {
  if (serSeq.sequence.symbols) {
    serSeq.sequence = serSeq.sequence.symbols.split("");
  }

  // Molecule type column: empty for proteins, otherwise derived from the
  // strandedness flag, falling back to the type recorded from the locus.
  let moleculeType;
  if (serSeq.isProtein) {
    moleculeType = "";
  } else {
    const isRna = serSeq.type === "RNA";
    if (serSeq?.doubleStranded) {
      moleculeType = isRna ? "RNA" : "DNA";
    } else {
      moleculeType = serSeq?.sequenceTypeFromLocus ?? (isRna ? "ss-RNA" : "DNA");
    }
  }
  const dateString = getCurrentDateString();

  const rawName = serSeq.name || "Untitled_Sequence";
  const locusName = options.reformatSeqName ? reformatName(rawName) : rawName;

  const isLinear = !serSeq.circular || serSeq.circular === "0";
  const topology = isLinear ? "linear " : "circular";

  return (
    StringUtil.rpad("LOCUS", " ", 12) +
    StringUtil.rpad(locusName, " ", 16) +
    " " +
    StringUtil.lpad(String(serSeq.sequence.length), " ", 11) +
    (serSeq.isProtein ? " aa " : " bp ") +
    // strand prefix column is always left blank
    StringUtil.lpad("", " ", 3) +
    StringUtil.rpad(moleculeType, " ", 6) +
    " " +
    topology +
    " " +
    StringUtil.rpad(serSeq.gbDivision || "SYN", " ", 1) +
    " " +
    dateString
  );
}
249
+
250
/**
 * Formats the current local date as DAY-MONTH-YEAR, where the month is the
 * upper-cased abbreviation taken from `Date.prototype.toString()`.
 * @returns {string}
 */
function getCurrentDateString() {
  // toString() yields "<weekday> <month> <day> <year> ..."; the weekday is unused.
  const [, monthAbbrev, dayOfMonth, fullYear] = new Date().toString().split(" ");
  return [dayOfMonth, monthAbbrev.toUpperCase(), fullYear].join("-");
}
258
+
259
/**
 * Formats one feature qualifier line of the form /name="value".
 * The value is passed through mangleOrStripUrls before being written.
 * @param {string} name qualifier name
 * @param {*} value qualifier value
 * @param {object} options forwarded to mangleOrStripUrls
 * @returns {string}
 */
function featureNoteInDataToGenbankString(name, value, options) {
  const qualifierPrefix = StringUtil.lpad("/", " ", 22);
  const qualifierValue = mangleOrStripUrls(value, options);
  return qualifierPrefix + name + '="' + qualifierValue + '"';
}
268
+
269
/**
 * Serializes one feature into its GenBank lines: the type/location line,
 * a label qualifier, one qualifier per note value, and optional color
 * qualifiers.
 *
 * Side effects on `feat`: a "primer" type is rewritten to "primer_bind",
 * extra notes may be appended, and `color`/`labelColor` are reassigned from
 * the notes when present there.
 *
 * @param {object} feat the feature to serialize
 * @param {object} options reads featurePadLength, inclusive1BasedStart,
 *   inclusive1BasedEnd and isProtein; also forwarded to the qualifier helper
 * @returns {string} the feature's lines joined with "\r\n"
 */
function featureToGenbankString(feat, options) {
  const startShift = options.inclusive1BasedStart ? 0 : 1;
  const endShift = options.inclusive1BasedEnd ? 0 : 1;

  // Renders a single start..end range, converting to protein coordinates when requested.
  const rangeToString = ({ start, end }) =>
    getProteinStart(parseInt(start, 10) + startShift, options.isProtein) +
    ".." +
    getProteinEnd(parseInt(end, 10) + endShift, options.isProtein);

  if (feat.type === "primer") {
    feat.type = "primer_bind";
  }
  const typeColumn =
    " " +
    StringUtil.rpad(feat.type || "misc_feature", " ", options.featurePadLength);

  // Several locations are written as join(a..b,c..d); otherwise the feature's own range is used.
  const hasSeveralLocations = feat.locations && feat.locations.length > 1;
  let locStr = hasSeveralLocations
    ? "join(" + feat.locations.map((loc) => rangeToString(loc)).join(",") + ")"
    : rangeToString(feat);

  if (feat.strand === -1) {
    locStr = "complement(" + locStr + ")";
  }

  const lines = [
    typeColumn + locStr,
    featureNoteInDataToGenbankString(
      "label",
      feat.name || "Untitled Feature",
      options
    ),
  ];

  // Properties that are persisted as notes.
  const isPrimerBind = feat.type === "primer_bind";
  if (feat.bases && feat.bases.length && isPrimerBind) {
    addToNotes(feat, "note", `sequence: ${feat.bases}`);
  }
  if (feat.primerBindsOn && isPrimerBind) {
    addToNotes(feat, "primerBindsOn", feat.primerBindsOn);
  }
  if (feat.overlapsSelf) {
    addToNotes(feat, "pragma", "overlapsSelf");
  }
  if (feat.arrowheadType) {
    const arrowhead = feat.arrowheadType.toUpperCase();
    if (arrowhead === "BOTH" || arrowhead === "NONE") {
      addToNotes(feat, "direction", arrowhead);
    }
  }

  let notes = feat.notes;
  if (notes) {
    try {
      if (typeof notes === "string") {
        try {
          notes = JSON.parse(notes);
        } catch (e) {
          console.warn("Warning: Note incorrectly sent as a string.");
          notes = {}; // fall back to a blank notes object
        }
      }
      for (const key of Object.keys(notes)) {
        // color notes are written separately at the end
        if (key === "color" || key === "labelColor") continue;
        const values = notes[key];
        if (!(values instanceof Array)) {
          console.warn("Warning: Note object expected array values");
          console.warn(notes);
          continue;
        }
        values.forEach(function (value) {
          lines.push(featureNoteInDataToGenbankString(key, value, options));
        });
      }
    } catch (e) {
      console.warn("Warning: Note cannot be processed");
    }
  }

  feat.color = (feat.notes && feat.notes.color) || feat.color;
  feat.labelColor = (feat.notes && feat.notes.labelColor) || feat.labelColor;

  if (feat.color) {
    const chosenColor = color.rgb(feat.color).string();
    const defaultColor = color
      .rgb(getFeatureToColorMap({ includeHidden: true })[feat.type])
      .string();
    // don't save a color note if the color is already the same as our defaults
    if (chosenColor !== defaultColor) {
      lines.push(featureNoteInDataToGenbankString("color", feat.color, options));
    }
  }
  if (feat.labelColor) {
    lines.push(
      featureNoteInDataToGenbankString("labelColor", feat.labelColor, options)
    );
  }

  return lines.join("\r\n");
}
397
+
398
/**
 * Returns a start position unchanged, or, for proteins, the result of
 * floor((val + 2) / 3).
 * @param {number} val start position
 * @param {boolean} isProtein whether to convert the position
 * @returns {number}
 */
function getProteinStart(val, isProtein) {
  return isProtein ? Math.floor((val + 2) / 3) : val;
}
402
/**
 * Returns an end position unchanged, or, for proteins, the result of
 * floor(val / 3).
 * @param {number} val end position
 * @param {boolean} isProtein whether to convert the position
 * @returns {number}
 */
function getProteinEnd(val, isProtein) {
  return isProtein ? Math.floor(val / 3) : val;
}
406
+
407
/**
 * Appends a value to the list stored under `key` in an annotation's notes,
 * creating the notes object and the list when they are missing.
 * @param {object} ann annotation whose notes are modified in place
 * @param {string} key note name
 * @param {*} val value to append
 */
function addToNotes(ann, key, val) {
  const noteMap = ann.notes ? ann.notes : (ann.notes = {});
  const noteValues = noteMap[key] ? noteMap[key] : (noteMap[key] = []);
  noteValues.push(val);
}
416
+
417
/**
 * Copies every field listed in DIGEST_PART_EXPORT_FIELD_MAP from the
 * annotation into its notes, stringifying each value with String().
 * @param {object} ann annotation whose notes are modified in place
 */
function addDigestPartFieldsToNotes(ann) {
  for (const [sourcePath, noteKey] of Object.entries(
    DIGEST_PART_EXPORT_FIELD_MAP
  )) {
    const rawValue = get(ann, sourcePath);
    addToNotes(ann, noteKey, String(rawValue));
  }
}
@@ -0,0 +1,26 @@
1
+ import { omit } from "lodash";
2
+
3
+ import cleanUpTeselagenJsonForExport from "./utils/cleanUpTeselagenJsonForExport"
4
+ import { tidyUpSequenceData } from "@teselagen/sequence-utils";
5
+
6
/**
 * Tidies up a sequence JSON, cleans it for export and serializes it to a
 * JSON string with extraneous fields omitted.
 * @param {*} incomingJson
 * @returns {string} the cleaned-up sequence as a JSON string
 */
export default function jsonToJsonString(incomingJson) {
  const tidied = tidyUpSequenceData(incomingJson, {
    annotationsAsObjects: false,
  });
  const exportable = cleanUpTeselagenJsonForExport(tidied);
  // fields that are left out of the serialized output
  const omittedFields = [
    "sequenceFragments",
    "sequenceFeatures",
    "cutsites",
    "orfs",
    "filteredParts",
    "filteredFeatures",
  ];
  return JSON.stringify(omit(exportable, omittedFields));
}
@@ -0,0 +1,135 @@
1
+ /* eslint-disable no-var*/
2
+ import validateSequenceArray from "./utils/validateSequenceArray";
3
+ import searchWholeObjByName, {
4
+ searchWholeObjByNameSimple,
5
+ searchWholeObjByNameSimpleArray,
6
+ } from "./utils/searchWholeObjByName";
7
+
8
+ import { XMLParser } from "fast-xml-parser";
9
+ import { flatMap, get } from "lodash";
10
+
11
/**
 * Parses an SBOL XML string into sequence JSON.
 *
 * Every DnaComponent found in the document yields one result entry of the
 * shape { parsedSequence, messages, success }. The collected entries are
 * passed through validateSequenceArray before being returned.
 *
 * @param {string} string the XML text
 * @param {object} [options] forwarded to parseSbolJson and validateSequenceArray
 * @returns {Promise<*>} whatever validateSequenceArray returns for the results
 */
async function sbolXmlToJson(string, options) {
  options = options || {};
  // Results are always validated before being handed back to the caller.
  const onFileParsed = function (sequences) {
    return validateSequenceArray(sequences, options);
  };
  try {
    const result = new XMLParser({
      isArray: () => true,
      ignoreAttributes: false,
    }).parse(string);
    const sbolJsonMatches = searchWholeObjByName("DnaComponent", result);
    if (!sbolJsonMatches[0]) {
      return onFileParsed({
        success: false,
        messages: ["Error: XML is not valid Jbei or Sbol format"],
      });
    }

    const resultArray = [];
    for (let i = 0; i < sbolJsonMatches[0].value.length; i++) {
      const response = {
        parsedSequence: null,
        messages: [],
        success: true,
      };
      try {
        response.parsedSequence = parseSbolJson(
          sbolJsonMatches[0].value[i],
          options
        );
      } catch (e) {
        console.error("error:", e);
        console.error("error.stack: ", e.stack);
        resultArray.push({
          success: false,
          messages: ["Error while parsing Sbol format"],
        });
        // Nothing was parsed for this component: move on to the next one
        // instead of dereferencing the null parsedSequence below, which
        // would abort the whole file and drop all collected results.
        continue;
      }
      if (response.parsedSequence.features.length > 0) {
        response.messages.push(
          "SBOL feature types are stored in feature notes"
        );
      }
      resultArray.push(response);
    }
    return onFileParsed(resultArray);
  } catch (e) {
    return onFileParsed({
      success: false,
      messages: ["Error parsing XML to JSON"],
    });
  }
}
76
// Converts SBOL formats.
// * Specifications for SBOL can be found at http://www.sbolstandard.org/specification/core-data-model
// *
// * The hierarchy of the components in an SBOL object is:
// *
// * Collection -> DnaComponent -> DnaSequence
// *
// * Check for each level and parse downward from there.
// tnrtodo: this should be tested with a wider variety of sbol file types!
/**
 * Converts one parsed DnaComponent node into a sequence JSON object.
 *
 * @param {object} sbolJson a DnaComponent node as produced by the XML parser
 * @param {object} options `inclusive1BasedStart` / `inclusive1BasedEnd`
 *   decide whether 1 is subtracted from bioStart / bioEnd
 * @returns {{circular: boolean, sequence: *, name: *, features: object[]}}
 */
function parseSbolJson(sbolJson, options) {
  // Prefer the explicit name and fall back to the display id.
  const name = get(sbolJson, "name[0]") || get(sbolJson, "displayId[0]");

  return {
    // tnrtodo: circularity is not read from the document yet
    circular: false,
    sequence: get(sbolJson, "dnaSequence[0].DnaSequence[0].nucleotides"),
    name: name,
    features: flatMap(sbolJson.annotation, function (annotation) {
      const feature = get(annotation, "SequenceAnnotation[0]");
      if (!feature) {
        // Return an empty array so flatMap contributes nothing; an implicit
        // undefined return would be kept as an undefined feature entry.
        return [];
      }
      const notes = searchWholeObjByNameSimpleArray("@_ns2:about", feature);
      const otherNotes = searchWholeObjByNameSimpleArray(
        "@_ns2:resource",
        feature
      );
      // Collect every truthy about/resource value under a single "about" note.
      const aboutValues = [...notes, ...otherNotes].filter(Boolean);
      const newNotes = aboutValues.length > 0 ? { about: aboutValues } : {};
      const featureName =
        searchWholeObjByNameSimple("name", feature) ||
        searchWholeObjByNameSimple("displayId", feature);
      return {
        name: featureName,
        notes: newNotes,
        type: "misc_feature", // sbol represents the feature type in what we are parsing as notes as the URL is difficult to follow
        start: parseInt(
          get(feature, "bioStart[0]") - (options.inclusive1BasedStart ? 0 : 1),
          10
        ),
        end: parseInt(
          get(feature, "bioEnd[0]") - (options.inclusive1BasedEnd ? 0 : 1),
          10
        ),
        strand: get(feature, "strand[0]"), //+ or -
      };
    }),
  };
}
134
+
135
+ export default sbolXmlToJson;