gff-nostream 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/util.ts ADDED
@@ -0,0 +1,425 @@
1
+ // Fast, low-level functions for parsing and formatting GFF3.
2
+ // JavaScript port of Robert Buels's Bio::GFF3::LowLevel Perl module.
3
+
4
/**
 * Decode the percent-escapes in a GFF3 value.
 *
 * Each `%XX` sequence (two hexadecimal digits, either case) is replaced by the
 * single character whose code unit is that hexadecimal number. Escapes are
 * decoded one at a time and independently of each other. Anything else,
 * including a `%` that is not followed by two hex digits, is left untouched.
 *
 * @param stringVal - Escaped GFF3 string value
 * @returns The value with its percent-escapes decoded
 */
export function unescape(stringVal: string): string {
  const decodeHexPair = (_escapeSeq: string, hexDigits: string): string => {
    const codeUnit = parseInt(hexDigits, 16)
    return String.fromCharCode(codeUnit)
  }
  return stringVal.replace(/%([0-9A-Fa-f]{2})/g, decodeHexPair)
}
15
+
16
/**
 * Percent-encode every part of a value that the given pattern matches.
 *
 * The value is first converted with `String()`. Each match is then replaced by
 * `%` followed by the upper-case hexadecimal code unit of the match's first
 * character, left-padded with `0` to at least two digits.
 *
 * @param regex - Pattern selecting the text to escape
 * @param s - Value to escape
 * @returns The escaped string
 */
function _escape(regex: RegExp, s: string | number) {
  const toPercentCode = (matched: string) => {
    const codeUnit = matched.charCodeAt(0)
    const hexDigits = codeUnit.toString(16).toUpperCase().padStart(2, '0')
    return '%' + hexDigits
  }
  return String(s).replace(regex, toPercentCode)
}
22
+
23
/**
 * Percent-encode a raw value for use inside the GFF3 attributes column.
 *
 * The characters escaped are newline, carriage return, tab, `;`, `=`, `%`,
 * `&`, `,`, the control characters U+0000 to U+001F, and U+007F to U+00FF.
 *
 * @param rawVal - Raw GFF3 attribute value
 * @returns An escaped string value
 */
export function escape(rawVal: string | number): string {
  const reservedInAttribute = /[\n;\r\t=%&,\u0000-\u001f\u007f-\u00ff]/g
  return _escape(reservedInAttribute, rawVal)
}
32
+
33
/**
 * Percent-encode a raw value for use in one of the GFF3 columns.
 *
 * The characters escaped are newline, carriage return, tab, `%`, the control
 * characters U+0000 to U+001F, and U+007F to U+00FF. Unlike `escape`, the
 * characters `;`, `=`, `&` and `,` are left as they are.
 *
 * @param rawVal - Raw GFF3 column value
 * @returns An escaped column value
 */
export function escapeColumn(rawVal: string | number): string {
  const reservedInColumn = /[\n\r\t%\u0000-\u001f\u007f-\u00ff]/g
  return _escape(reservedInColumn, rawVal)
}
42
+
43
/**
 * Parse the 9th column (attributes) of a GFF3 feature line.
 *
 * The column is a `;`-separated list of `tag=value[,value...]` pairs. One
 * trailing line ending is removed before parsing. Pairs without a `=` or with
 * an empty value part are ignored. Tags are trimmed; each comma-separated
 * value is trimmed and then percent-decoded. A tag that occurs more than once
 * accumulates all of its values in order of appearance.
 *
 * Only the first `=` of a pair separates the tag from its values, so a value
 * that contains an unescaped `=` is kept whole.
 *
 * @param attrString - String of GFF3 9th column
 * @returns Parsed attributes; an empty object for `''` or `'.'`
 */
export function parseAttributes(attrString: string): GFF3Attributes {
  const attrs: GFF3Attributes = {}
  if (!attrString.length || attrString === '.') {
    return attrs
  }

  const hasOwn = Object.prototype.hasOwnProperty

  for (const pair of attrString.replace(/\r?\n$/, '').split(';')) {
    // Split on the FIRST '=' only. `split('=', 2)` would throw away anything
    // after a second '=' instead of keeping it as part of the value.
    const eq = pair.indexOf('=')
    if (eq === -1) {
      continue
    }
    const rawValues = pair.slice(eq + 1)
    if (!rawValues.length) {
      continue
    }

    const tag = pair.slice(0, eq).trim()

    // Look only at own properties: a tag such as "toString" or "constructor"
    // must not pick up the member inherited from Object.prototype.
    let values = hasOwn.call(attrs, tag) ? attrs[tag] : undefined
    if (!values) {
      values = []
      // defineProperty creates a plain own data property for every tag name,
      // including "__proto__", which ordinary assignment would treat as a
      // request to change the object's prototype.
      Object.defineProperty(attrs, tag, {
        value: values,
        enumerable: true,
        writable: true,
        configurable: true,
      })
    }

    for (const rawValue of rawValues.split(',')) {
      values.push(unescape(rawValue.trim()))
    }
  }
  return attrs
}
81
+
82
/**
 * Parse a GFF3 feature line.
 *
 * The line is split on tabs into up to nine columns. A column that is `.`,
 * empty, or missing altogether (the line has fewer than nine columns) is
 * reported as `null`. Only the seq_id, source and type columns are
 * percent-decoded. start and end are read with `parseInt` (base 10) and score
 * with `parseFloat`, so non-numeric text yields `NaN`. strand and phase are
 * returned as the raw column text.
 *
 * @param line - GFF3 feature line
 * @returns The parsed feature
 */
export function parseFeature(line: string): GFF3FeatureLine {
  const columns = line.split('\t')

  // Normalise "no value" to null. The `undefined` case covers short lines,
  // which would otherwise leak `undefined`/`NaN` into the result and make
  // parseAttributes throw on a missing 9th column.
  const column = (index: number): string | null => {
    const raw: string | undefined = columns[index]
    return raw === undefined || raw === '' || raw === '.' ? null : raw
  }
  const decodedColumn = (index: number): string | null => {
    const raw = column(index)
    return raw === null ? null : unescape(raw)
  }

  const start = column(3)
  const end = column(4)
  const score = column(5)
  const attributes = column(8)

  const parsed: GFF3FeatureLine = {
    seq_id: decodedColumn(0),
    source: decodedColumn(1),
    type: decodedColumn(2),
    start: start === null ? null : parseInt(start, 10),
    end: end === null ? null : parseInt(end, 10),
    score: score === null ? null : parseFloat(score),
    strand: column(6),
    phase: column(7),
    attributes: attributes === null ? null : parseAttributes(attributes),
  }
  return parsed
}
106
+
107
/**
 * Parse a GFF3 directive line.
 *
 * A directive is `##` followed by a name (a run of non-whitespace) and
 * optional text; whitespace is allowed before `##`, after it, and between the
 * name and the text. The text, when non-empty, is stored as `value`.
 *
 * Two directives get extra fields derived from the whitespace-separated
 * tokens of the text:
 * - `sequence-region`: `seq_id` is the first token; `start` and `end` are the
 *   second and third tokens with every non-digit character removed
 *   (`undefined` when the token is absent).
 * - `genome-build`: `source` and `buildName` are the first two tokens.
 *
 * @param line - GFF3 directive line
 * @returns The parsed directive, or `null` when the line is not a directive
 */
export function parseDirective(
  line: string,
):
  | GFF3Directive
  | GFF3SequenceRegionDirective
  | GFF3GenomeBuildDirective
  | null {
  const found = /^\s*##\s*(\S+)\s*(.*)/.exec(line)
  if (found === null) {
    return null
  }

  const directiveName = found[1]
  const hasText = found[2].length > 0
  const text = hasText ? found[2].replace(/\r?\n$/, '') : found[2]

  const base: GFF3Directive = { directive: directiveName }
  if (hasText) {
    base.value = text
  }

  const digitsOnly = (token: string | undefined) => token?.replace(/\D/g, '')

  switch (directiveName) {
    case 'sequence-region': {
      const [seqId, startToken, endToken] = text.split(/\s+/, 3)
      return {
        ...base,
        seq_id: seqId,
        start: digitsOnly(startToken),
        end: digitsOnly(endToken),
      } as GFF3SequenceRegionDirective
    }
    case 'genome-build': {
      const [source, buildName] = text.split(/\s+/, 2)
      return {
        ...base,
        source,
        buildName,
      } as GFF3GenomeBuildDirective
    }
    default:
      return base
  }
}
154
+
155
/**
 * Serialise an attributes object into the text of the GFF3 9th column.
 *
 * Entries are written in `Object.entries` order as `tag=value` pairs joined by
 * `;`. Entries whose value is falsy are skipped. Array values are escaped
 * element by element and joined with `,`; tags are escaped as well. When no
 * pair is produced the result is `.`.
 *
 * @param attrs - Attributes
 * @returns GFF3 9th column string
 */
export function formatAttributes(attrs: GFF3Attributes): string {
  const pairs: string[] = []

  for (const [tag, val] of Object.entries(attrs)) {
    if (!val) {
      continue
    }

    let formattedValue: string
    if (val.hasOwnProperty('toString')) {
      // value supplies its own toString: use that text as a single value
      formattedValue = escape(val.toString())
    } else if (Array.isArray(val)) {
      formattedValue = val.map(escape).join(',')
    } else {
      formattedValue = escape(val)
    }

    pairs.push(`${escape(tag)}=${formattedValue}`)
  }

  if (!pairs.length) {
    return '.'
  }
  return pairs.join(';')
}
181
+
182
/**
 * Format one feature line as a tab-separated, newline-terminated GFF3 line.
 *
 * Columns that are `null` or `undefined` are written as `.`; all other column
 * values are escaped with `escapeColumn`, and the attributes are serialised
 * with `formatAttributes`.
 *
 * Identical lines are emitted only once per `seenFeature` record: when the
 * formatted text has already been recorded there, an empty string is
 * returned, otherwise the text is recorded and returned.
 *
 * @param f - The feature line to format
 * @param seenFeature - Record of lines already emitted; updated by this call
 * @returns The formatted line, or `''` if it was already emitted
 */
function _formatSingleFeature(
  f: GFF3FeatureLine | GFF3FeatureLineWithRefs,
  seenFeature: Record<string, boolean | undefined>,
) {
  // `== null` matches both null and undefined, so a column that is absent from
  // the object is written as '.' rather than as the text "undefined".
  const column = (value: string | number | null | undefined): string =>
    value == null ? '.' : escapeColumn(value)

  const fields = [
    column(f.seq_id),
    column(f.source),
    column(f.type),
    column(f.start),
    column(f.end),
    column(f.score),
    column(f.strand),
    column(f.phase),
    f.attributes == null ? '.' : formatAttributes(f.attributes),
  ]

  const formattedString = `${fields.join('\t')}\n`

  // if we have already output this exact feature, skip it
  if (seenFeature[formattedString]) {
    return ''
  }

  seenFeature[formattedString] = true
  return formattedString
}
213
+
214
/**
 * Recursively format a feature line, or an array of them, as GFF3 text.
 *
 * For a single line the line itself is written first, followed by everything
 * under its `child_features` and then everything under its
 * `derived_features` (when the line carries those arrays). Arrays are
 * formatted element by element in order. Duplicate lines are suppressed
 * through the shared `seenFeature` record.
 *
 * @param feature - A feature line or (possibly nested) array of feature lines
 * @param seenFeature - Record of lines already emitted; updated while formatting
 * @returns The concatenated GFF3 lines
 */
function _formatFeature(
  feature:
    | GFF3FeatureLine
    | GFF3FeatureLineWithRefs
    | (GFF3FeatureLine | GFF3FeatureLineWithRefs)[],
  seenFeature: Record<string, boolean | undefined>,
): string {
  if (Array.isArray(feature)) {
    return feature.reduce(
      (text, member) => text + _formatFeature(member, seenFeature),
      '',
    )
  }

  let output = _formatSingleFeature(feature, seenFeature)
  if (!_isFeatureLineWithRefs(feature)) {
    return output
  }

  // children first, then derived features, each group in array order
  output = feature.child_features.reduce(
    (text, child) => text + _formatFeature(child, seenFeature),
    output,
  )
  output = feature.derived_features.reduce(
    (text, derived) => text + _formatFeature(derived, seenFeature),
    output,
  )
  return output
}
234
+
235
/**
 * Format a feature object or array of feature objects into one or more lines
 * of GFF3.
 *
 * Each call starts with a fresh record of emitted lines, so a line that would
 * be written more than once within the same call appears only the first time.
 *
 * @param featureOrFeatures - A feature object or array of feature objects
 * @returns A string of one or more GFF3 lines
 */
export function formatFeature(
  featureOrFeatures:
    | GFF3FeatureLine
    | GFF3FeatureLineWithRefs
    | (GFF3FeatureLine | GFF3FeatureLineWithRefs)[],
): string {
  const alreadyEmitted: Record<string, boolean | undefined> = {}
  const gff3Text = _formatFeature(featureOrFeatures, alreadyEmitted)
  return gff3Text
}
251
+
252
/**
 * Format a directive into a line of GFF3.
 *
 * The line is `##` plus the directive name, then a space and the value when
 * the value is non-empty, then a newline.
 *
 * @param directive - A directive object
 * @returns A directive line string
 */
export function formatDirective(directive: GFF3Directive): string {
  const valuePart = directive.value ? ` ${directive.value}` : ''
  return `##${directive.directive}${valuePart}\n`
}
266
+
267
/**
 * Format a comment into a GFF3 comment line: `# `, the comment text, and a
 * newline.
 *
 * @param comment - A comment object
 * @returns A comment line string
 */
export function formatComment(comment: GFF3Comment): string {
  const { comment: text } = comment
  return `# ${text}\n`
}
277
+
278
/**
 * Format a sequence object as a single FASTA record.
 *
 * The header line is `>` plus the id, followed by a space and the description
 * when the description is non-empty. The sequence follows on the next line
 * exactly as stored, and the record ends with a newline.
 *
 * @param seq - A sequence object
 * @returns Formatted single FASTA sequence string
 */
export function formatSequence(seq: GFF3Sequence): string {
  const { id, description, sequence } = seq
  const header = description ? `>${id} ${description}` : `>${id}`
  return `${header}\n${sequence}\n`
}
289
+
290
/**
 * Format a directive, comment, sequence, or feature, or an array of such
 * items, into GFF3 text.
 *
 * The kind of each item is decided by which property it has, checked in this
 * order: `attributes` (feature), `directive`, `sequence`, `comment`. An item
 * with none of these is rendered as a comment line saying it was invalid.
 *
 * @param itemOrItems - A comment, sequence, or feature, or array of such items
 * @returns One formatted string for a single item, or an array with one
 *   formatted string per element when an array is given
 */
export function formatItem(
  itemOrItems:
    | GFF3FeatureLineWithRefs
    | GFF3Directive
    | GFF3Comment
    | GFF3Sequence
    | (GFF3FeatureLineWithRefs | GFF3Directive | GFF3Comment | GFF3Sequence)[],
): string | string[] {
  const formatOne = (
    item: GFF3FeatureLineWithRefs | GFF3Directive | GFF3Comment | GFF3Sequence,
  ): string => {
    if ('attributes' in item) {
      return formatFeature(item)
    } else if ('directive' in item) {
      return formatDirective(item)
    } else if ('sequence' in item) {
      return formatSequence(item)
    } else if ('comment' in item) {
      return formatComment(item)
    }
    return '# (invalid item found during format)\n'
  }

  return Array.isArray(itemOrItems)
    ? itemOrItems.map(item => formatOne(item))
    : formatOne(itemOrItems)
}
328
+
329
/**
 * A record of GFF3 attribute identifiers and the values of those identifiers.
 *
 * Each tag maps to an array because one tag can carry several values:
 * `parseAttributes` splits a value on `,` and appends to the same array when a
 * tag is repeated. `formatAttributes` skips entries whose value is falsy.
 */
export type GFF3Attributes = Record<string, string[] | undefined>
331
+
332
/**
 * A representation of a single line of a GFF3 file.
 *
 * In objects produced by `parseFeature`, a field is `null` when its column is
 * `.` or empty.
 */
export interface GFF3FeatureLine {
  /** The ID of the landmark used to establish the coordinate system for the current feature */
  seq_id: string | null
  /** A free text qualifier intended to describe the algorithm or operating procedure that generated this feature */
  source: string | null
  /** The type of the feature */
  type: string | null
  /** The start coordinates of the feature (`parseFeature` reads this column with `parseInt`, base 10) */
  start: number | null
  /** The end coordinates of the feature (`parseFeature` reads this column with `parseInt`, base 10) */
  end: number | null
  /** The score of the feature (`parseFeature` reads this column with `parseFloat`) */
  score: number | null
  /** The strand of the feature, kept as the raw column text */
  strand: string | null
  /** For features of type "CDS", the phase indicates where the next codon begins relative to the 5' end of the current CDS feature; kept as the raw column text */
  phase: string | null
  /** Feature attributes */
  attributes: GFF3Attributes | null
}
353
+
354
/**
 * A GFF3 Feature line that includes references to other features defined in
 * their "Parent" or "Derives_from" attributes.
 *
 * When such a line is formatted, the line itself is written first, then its
 * child features, then its derived features.
 */
export interface GFF3FeatureLineWithRefs extends GFF3FeatureLine {
  /** An array of child features; each element is itself an array of feature lines */
  child_features: GFF3Feature[]
  /** An array of features derived from this feature; each element is itself an array of feature lines */
  derived_features: GFF3Feature[]
}
364
+
365
/**
 * Type guard telling whether a feature line carries reference arrays.
 *
 * @param featureLine - The feature line to inspect
 * @returns `true` when neither `child_features` nor `derived_features` is
 *   `undefined`
 */
function _isFeatureLineWithRefs(
  featureLine: GFF3FeatureLine | GFF3FeatureLineWithRefs,
): featureLine is GFF3FeatureLineWithRefs {
  const { child_features: children, derived_features: derived } =
    featureLine as GFF3FeatureLineWithRefs
  if (children === undefined) {
    return false
  }
  return derived !== undefined
}
373
+
374
/**
 * A GFF3 feature, which may include multiple individual feature lines.
 *
 * It is represented as an array with one entry per feature line.
 */
export type GFF3Feature = GFF3FeatureLineWithRefs[]
378
+
379
/**
 * A GFF3 directive, i.e. a line of the form `##name value`.
 */
export interface GFF3Directive {
  /** The name of the directive (the text right after `##`) */
  directive: string
  /** The string value of the directive; `parseDirective` leaves it unset when the directive has no text after its name */
  value?: string
}
386
+
387
/**
 * A GFF3 sequence-region directive.
 *
 * NOTE(review): `parseDirective` builds this shape with a type assertion, so
 * for a malformed directive with fewer than three tokens `start`/`end` (and
 * for an empty one `value`) can be `undefined` at runtime despite the types.
 */
export interface GFF3SequenceRegionDirective extends GFF3Directive {
  /** The string value of the directive */
  value: string
  /** The sequence ID parsed from the directive (first whitespace-separated token of the value) */
  seq_id: string
  /** The sequence start parsed from the directive: the second token with non-digit characters removed, kept as a string */
  start: string
  /** The sequence end parsed from the directive: the third token with non-digit characters removed, kept as a string */
  end: string
}
398
+
399
/**
 * A GFF3 genome-build directive.
 *
 * NOTE(review): `parseDirective` builds this shape with a type assertion, so
 * for a directive with fewer than two tokens `buildName` can be `undefined`
 * at runtime despite the types.
 */
export interface GFF3GenomeBuildDirective extends GFF3Directive {
  /** The string value of the directive */
  value: string
  /** The genome build source parsed from the directive (first whitespace-separated token of the value) */
  source: string
  /** The genome build name parsed from the directive (second whitespace-separated token of the value) */
  buildName: string
}
408
+
409
/** A GFF3 comment */
export interface GFF3Comment {
  /** The text of the comment, without the leading `# ` that `formatComment` adds when writing it */
  comment: string
}
414
+
415
/** A GFF3 FASTA single sequence */
export interface GFF3Sequence {
  /** The ID of the sequence (written right after `>` on the FASTA header line) */
  id: string
  /** The description of the sequence (written after the ID, separated by a space, when non-empty) */
  description?: string
  /** The sequence (written as-is on the line after the header) */
  sequence: string
}
424
+
425
/** Any one item of a GFF3 document: a feature (array of feature lines), a directive, a comment, or a FASTA sequence */
export type GFF3Item = GFF3Feature | GFF3Directive | GFF3Comment | GFF3Sequence