@teselagen/bio-parsers 0.1.27 → 0.1.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,34 @@
1
/**
 * Restores URLs that were previously wrapped as `%%TG%%_<encoded url>_%%TG%%`.
 *
 * Every token of that shape is replaced by the URI-decoded text found between
 * the markers. Falsy values and non-strings are returned unchanged.
 *
 * @param {*} str value that may contain mangled URLs
 * @returns {*} the string with mangled URLs decoded, or the input itself when
 *   it is falsy or not a string
 */
export function unmangleUrls(str) {
  if (!str) return str;
  if (typeof str !== "string") return str;

  const urlRegex = /%%TG%%_(.*?)_%%TG%%/g;
  return str.replace(urlRegex, function (outer, innerUrl) {
    if (!innerUrl) return outer;
    try {
      return decodeURIComponent(innerUrl);
    } catch (err) {
      // decodeURIComponent throws URIError on malformed percent-escapes;
      // leave such a token untouched instead of failing the whole string.
      if (err instanceof URIError) return outer;
      throw err;
    }
  });
}
13
+
14
/**
 * Mangles, strips, or preserves URLs found in a string.
 *
 * A URL is any whitespace-delimited run starting with `http://`, `https://`
 * or `www.`.
 *
 * - `doNotMangleOrStripUrls: true` returns the string untouched (this wins
 *   over `mangleUrls`).
 * - `mangleUrls: true` replaces each URL with
 *   `%%TG%%_<encodeURIComponent(url)>_%%TG%%`, the form unmangleUrls reverses.
 * - with neither option set, every URL is removed from the string.
 *
 * Falsy values and non-strings are returned unchanged.
 *
 * @param {*} str value that may contain URLs
 * @param {object} [options]
 * @param {boolean} [options.mangleUrls]
 * @param {boolean} [options.doNotMangleOrStripUrls]
 * @returns {*} the processed string, or the input itself when it is falsy or
 *   not a string
 */
export function mangleOrStripUrls(
  str,
  { mangleUrls, doNotMangleOrStripUrls } = {}
) {
  if (!str) return str;
  // same guard as unmangleUrls: only strings have URLs to process
  if (typeof str !== "string") return str;
  if (doNotMangleOrStripUrls) return str;

  const urlRegex = /(((https?:\/\/)|(www\.))[^\s]+)/g;
  if (mangleUrls) {
    return str.replace(
      urlRegex,
      (url) => `%%TG%%_${encodeURIComponent(url)}_%%TG%%`
    );
  }
  //if no options passed, strip all URLs from the string
  return str.replace(urlRegex, () => ``);
}
@@ -0,0 +1,349 @@
1
+ import areNonNegativeIntegers from "validate.io-nonnegative-integer-array";
2
+ import { getFeatureTypes } from "@teselagen/sequence-utils";
3
+ import {
4
+ filterAminoAcidSequenceString,
5
+ filterSequenceString,
6
+ guessIfSequenceIsDnaAndNotProtein,
7
+ } from "@teselagen/sequence-utils";
8
+ import { filter, some, upperFirst } from "lodash";
9
+ import pragmasAndTypes from "./pragmasAndTypes.js";
10
+ import { forEach } from "lodash";
11
+ import { map } from "lodash";
12
+ import { unmangleUrls } from "./unmangleUrls";
13
+ import { reformatName } from "./NameUtils.js";
14
+
15
+ //validation checking
16
/**
 * validation and sanitizing of our teselagen sequence data type
 *
 * The passed-in `sequence` object is mutated in place and is also returned as
 * `validatedAndCleanedSequence`. Every correction made along the way is
 * reported as a human readable string in `messages`.
 *
 * @param {object} sequence Our teselagen sequence data type
 * @param {object} [options]
 * @param {boolean} [options.isProtein] treat the sequence as amino acids
 * @param {boolean} [options.isOligo] when truthy, u/U bases are not converted to t/T
 * @param {boolean} [options.guessIfProtein] when isProtein is undefined, derive it
 *   from guessIfSequenceIsDnaAndNotProtein
 * @param {object} [options.guessIfProteinOptions] forwarded to
 *   guessIfSequenceIsDnaAndNotProtein
 * @param {boolean} [options.reformatSeqName] run sequence.name through reformatName
 * @param {boolean} [options.inclusive1BasedStart] when truthy, feature starts may be
 *   as large as sequence.size (instead of sequence.size - 1)
 * @param {boolean} [options.inclusive1BasedEnd] when truthy, feature ends may be
 *   as large as sequence.size (instead of sequence.size - 1)
 * @param {*} [options.additionalValidChars] forwarded to filterSequenceString
 * @param {boolean} [options.allowOverflowAnnotations] skip the start/end range checks
 * @param {boolean} [options.coerceFeatureTypes] replace unrecognized feature types
 *   with misc_feature
 * Additionally, for every `{ pragma, type }` entry of pragmasAndTypes the option
 * `accept<Type>` (e.g. acceptParts) may be set to false to keep matching
 * features in sequence.features instead of moving them to sequence[type].
 * @return response {
    validatedAndCleanedSequence: {},
    messages: [],
  };
 * @throws {Error} when sequence is falsy or not an object
 */
export default function validateSequence(sequence, options = {}) {
  const {
    isOligo,
    guessIfProtein,
    guessIfProteinOptions,
    reformatSeqName,
    inclusive1BasedStart,
    inclusive1BasedEnd,
    additionalValidChars,
    allowOverflowAnnotations,
    coerceFeatureTypes,
  } = options;
  // isProtein may be filled in below by the guessing logic
  let { isProtein } = options;
  const response = {
    validatedAndCleanedSequence: {},
    messages: [],
  };
  if (!sequence || typeof sequence !== "object") {
    throw new Error("Invalid sequence");
  }
  if (!sequence.name) {
    //we'll handle transferring the file name outside of this function
    //for now just set it to a blank string
    sequence.name = "";
  }
  if (!sequence.extraLines) {
    sequence.extraLines = [];
  }
  if (!sequence.comments) {
    sequence.comments = [];
  }
  if (sequence.description) {
    sequence.description = unmangleUrls(sequence.description);
  }
  const oldName = sequence.name;
  if (reformatSeqName) {
    sequence.name = reformatName(sequence.name);
  }
  if (oldName !== sequence.name) {
    response.messages.push(
      "Name (" + oldName + ") reformatted to " + sequence.name
    );
  }

  if (Array.isArray(sequence.sequence)) {
    sequence.sequence = sequence.sequence.join("");
  }
  if (!sequence.sequence) {
    response.messages.push("No sequence detected");
    sequence.sequence = "";
  }
  let validChars;
  if (isProtein === undefined && guessIfProtein) {
    isProtein = !guessIfSequenceIsDnaAndNotProtein(
      sequence.sequence,
      guessIfProteinOptions
    );
  }
  if (isProtein) {
    //tnr: add code to strip invalid protein data..
    validChars = filterAminoAcidSequenceString(sequence.sequence);
    if (validChars !== sequence.sequence) {
      sequence.sequence = validChars;
      response.messages.push(
        "Import Error: Illegal character(s) detected and removed from amino acid sequence. Allowed characters are: xtgalmfwkqespvicyhrndu"
      );
    }
    sequence.type = "PROTEIN";
    sequence.isProtein = true;
    if (!sequence.proteinSequence) {
      sequence.proteinSequence = sequence.sequence;
    }
    sequence.proteinSize = sequence.proteinSequence.length;
  } else {
    //todo: this logic won't catch every case of RNA, so we should probably handle RNA conversion at another level..
    const temp = sequence.sequence;
    if (!isOligo) {
      sequence.sequence = sequence.sequence.replace(/u/gi, (u) =>
        u === "U" ? "T" : "t"
      );
    }
    // any u -> t replacement means the input was RNA
    if (temp !== sequence.sequence) {
      sequence.type = "RNA";
    } else {
      sequence.type = "DNA";
    }

    validChars = filterSequenceString(sequence.sequence, additionalValidChars);
    if (validChars !== sequence.sequence) {
      sequence.sequence = validChars;
      response.messages.push(
        "Import Error: Illegal character(s) detected and removed from sequence. Allowed characters are: atgcyrswkmbvdhn"
      );
    }
  }

  if (!sequence.size) {
    sequence.size = isProtein
      ? sequence.proteinSequence.length * 3
      : sequence.sequence.length;
  }
  // true: linear was stated explicitly, false: circular was simply missing/falsy,
  // undefined: the sequence was given as circular
  let circularityExplicitlyDefined;
  if (
    sequence.circular === false ||
    sequence.circular === "false" ||
    sequence.circular === -1
  ) {
    sequence.circular = false;
    circularityExplicitlyDefined = true;
  } else if (!sequence.circular) {
    sequence.circular = false;
    circularityExplicitlyDefined = false;
  } else {
    sequence.circular = true;
  }

  if (!sequence.features || !Array.isArray(sequence.features)) {
    response.messages.push("No valid features detected");
    sequence.features = [];
  }
  //tnr: maybe this should be wrapped in its own function (in case we want to use it elsewhere)
  sequence.features = sequence.features.filter(function (feature) {
    if (!feature || typeof feature !== "object") {
      response.messages.push("Invalid feature detected and removed");
      return false;
    }
    feature.start = parseInt(feature.start, 10);
    feature.end = parseInt(feature.end, 10);

    if (!feature.name || typeof feature.name !== "string") {
      response.messages.push(
        'Unable to detect valid name for feature, setting name to "Untitled Feature"'
      );
      feature.name = "Untitled Feature";
    }
    if (
      !allowOverflowAnnotations &&
      (!areNonNegativeIntegers([feature.start]) ||
        feature.start > sequence.size - (inclusive1BasedStart ? 0 : 1))
    ) {
      response.messages.push(
        "Invalid feature start: " +
          feature.start +
          " detected for " +
          feature.name +
          " and set to 1"
      ); //setting it to 0 internally, but users will see it as 1
      feature.start = 0;
    }
    if (
      !allowOverflowAnnotations &&
      (!areNonNegativeIntegers([feature.end]) ||
        feature.end > sequence.size - (inclusive1BasedEnd ? 0 : 1))
    ) {
      // remember the offending value so the message reports what was actually
      // wrong rather than the corrected value
      const invalidEnd = feature.end;
      feature.end = Math.max(sequence.size - 1, inclusive1BasedEnd ? 0 : 1);
      response.messages.push(
        "Invalid feature end: " +
          invalidEnd +
          " detected for " +
          feature.name +
          " and set to " +
          (feature.end + 1)
      );
    }

    if (
      feature.start - (inclusive1BasedStart ? 0 : 1) >
        feature.end - (inclusive1BasedEnd ? 0 : 1) &&
      sequence.circular === false
    ) {
      if (circularityExplicitlyDefined) {
        response.messages.push(
          "Invalid circular feature detected in explicitly linear sequence. " +
            feature.name +
            ". start set to 1"
        ); //setting it to 0 internally, but users will see it as 1
        feature.start = 0;
      } else {
        response.messages.push(
          "Circular feature detected in implicitly linear sequence. Setting sequence to be circular."
        );
        sequence.circular = true;
      }
    }

    // Check the raw strand value before numeric coercion: parseInt turns
    // false, "false" and "-" into NaN, which would wrongly end up as strand 1.
    const rawStrand = feature.strand;
    if (
      rawStrand === false ||
      rawStrand === "false" ||
      rawStrand === "-" ||
      parseInt(rawStrand, 10) === -1
    ) {
      feature.strand = -1;
    } else {
      feature.strand = 1;
    }
    let invalidFeatureType;
    if (
      feature.type &&
      typeof feature.type === "string" &&
      feature.type.toLowerCase() === "primer"
    ) {
      feature.type = "primer_bind";
    }
    if (
      !feature.type ||
      typeof feature.type !== "string" ||
      !getFeatureTypes({ includeHidden: true }).some(function (featureType) {
        if (featureType.toLowerCase() === feature.type.toLowerCase()) {
          feature.type = featureType; //this makes sure the feature.type is being set to the exact value of the accepted featureType
          return true;
        }
        return false;
      })
    ) {
      // unrecognized types are only replaced when coercion was requested or
      // when no type was given at all
      if (coerceFeatureTypes || !feature.type) {
        response.messages.push(
          'Invalid feature type detected:  "' +
            feature.type +
            '" within ' +
            feature.name +
            ". set type to misc_feature"
        );
        if (typeof feature.type === "string") {
          invalidFeatureType = feature.type;
        }
        feature.type = "misc_feature";
      }
    }
    if (!feature.notes) {
      feature.notes = {};
    }
    //if the original feature type was invalid, push it onto the notes object under featureType
    if (invalidFeatureType) {
      if (!feature.notes.featureType) {
        feature.notes.featureType = [];
      }
      feature.notes.featureType.push(invalidFeatureType);
    }
    if (feature.notes.label) {
      //we've already used the label as the name by default if both gene and label were present
      delete feature.notes.label;
    } else if (feature.notes.gene) {
      //gene was used for name (if it existed)
      delete feature.notes.gene;
    } else if (feature.notes.name) {
      //name was used for name (if it existed)
      delete feature.notes.name;
    }
    if (feature.notes.color) {
      feature.color = feature.notes.color[0] || feature.color;
      delete feature.notes.color;
    }
    if (feature.notes.labelColor) {
      feature.labelColor = feature.notes.labelColor[0] || feature.labelColor;
      delete feature.notes.labelColor;
    }
    if (
      feature.notes.pragma &&
      some(feature.notes.pragma, (p) => p === "overlapsSelf")
    ) {
      feature.overlapsSelf = true;
      feature.notes.pragma = filter(
        feature.notes.pragma,
        (p) => p !== "overlapsSelf"
      );
    }
    if (feature.notes.note) {
      // the first note containing "sequence:" supplies feature.bases; some()
      // stops iterating once the callback returns true
      some(feature.notes.note, (n) => {
        if (
          n &&
          typeof n === "string" &&
          n.toLowerCase().includes("sequence:")
        ) {
          //remove it after we've parsed it out
          feature.notes.note = filter(
            feature.notes.note,
            (p) =>
              p &&
              // non-string notes cannot carry a "sequence:" entry; keep them
              // without calling string methods on them
              !(typeof p === "string" && p.toLowerCase().includes("sequence:"))
          );
          if (feature.notes.note.length === 0) {
            delete feature.notes.note;
          }
          const match = n.match(/sequence:[ \r\n.]*[a-zA-Z]*/i);
          if (match && match[0]) {
            // strip the prefix case-insensitively, mirroring the
            // case-insensitive detection and match above
            feature.bases = match[0]
              .replace(/\s/g, "")
              .replace(/sequence:/i, "");
          }

          return true;
        }
      });
    }

    if (feature.notes.primerBindsOn) {
      // the callback never returns true, so every entry is visited and the
      // last truthy one ends up on the feature
      some(feature.notes.primerBindsOn, (n) => {
        if (n) {
          feature.primerBindsOn = n;
          delete feature.notes.primerBindsOn;
        }
      });
    }

    for (const { pragma, type } of pragmasAndTypes) {
      if (
        options[`accept${upperFirst(type)}`] !== false && //acceptParts, acceptWarnings,
        feature.notes.pragma &&
        some(feature.notes.pragma, (p) => p === pragma)
      ) {
        if (!sequence[type]) {
          sequence[type] = []; //initialize an empty array if necessary
        }
        feature.type = type.slice(0, -1); //set the type before pushing it onto the array
        delete feature.notes.pragma;
        sequence[type].push(feature);
        return false; //don't include the features
      }
    }
    // restore any URLs that were mangled inside the remaining notes
    forEach(feature.notes, (noteArray, key) => {
      feature.notes[key] = map(noteArray, (note) => {
        return unmangleUrls(note);
      });
    });
    return true;
  });
  response.validatedAndCleanedSequence = sequence;
  return response;
}
@@ -0,0 +1,20 @@
1
+ import validateSequence from './validateSequence.js';
2
+
3
/**
 * Runs validateSequence over every successful parsing result.
 *
 * A single (non-array) parsing result is wrapped into an array first. For each
 * entry whose `success` flag is truthy, `parsedSequence` is replaced by the
 * validated sequence and the validation messages are appended to the entry's
 * `messages`. Entries are mutated in place; unsuccessful or missing entries
 * are left untouched.
 *
 * @param {object|object[]} parsingResultArray one parsing result or an array of them
 * @param {object} [options] forwarded unchanged to validateSequence
 * @returns {object[]|*} the (possibly newly wrapped) array, or the input itself
 *   when it is falsy
 */
export default function validateSequenceArray(parsingResultArray, options) {
  if (parsingResultArray) {
    if (!Array.isArray(parsingResultArray)) {
      //wrap the parsingResult into an array if it isn't one already
      parsingResultArray = [parsingResultArray];
    }
    //should convert the old data type to the new data type (flattened sequence)
    parsingResultArray.forEach(function (parsingResult) {
      // tolerate holes / null entries instead of throwing on `.success`
      if (!parsingResult || !parsingResult.success) return;
      const res = validateSequence(parsingResult.parsedSequence, options);
      //add any validation error messages to the parsed sequence results messages
      //(a result without a messages array starts from an empty one)
      parsingResult.messages = (parsingResult.messages || []).concat(
        res.messages
      );
      parsingResult.parsedSequence = res.validatedAndCleanedSequence;
    });
  }
  return parsingResultArray;
}