gff-nostream 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +163 -0
- package/dist/api.d.ts +23 -0
- package/dist/api.js +40 -0
- package/dist/api.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.js +6 -0
- package/dist/index.js.map +1 -0
- package/dist/parse.d.ts +55 -0
- package/dist/parse.js +341 -0
- package/dist/parse.js.map +1 -0
- package/dist/util.d.ts +166 -0
- package/dist/util.js +274 -0
- package/dist/util.js.map +1 -0
- package/esm/api.d.ts +23 -0
- package/esm/api.js +42 -0
- package/esm/api.js.map +1 -0
- package/esm/index.d.ts +3 -0
- package/esm/index.js +3 -0
- package/esm/index.js.map +1 -0
- package/esm/parse.d.ts +55 -0
- package/esm/parse.js +317 -0
- package/esm/parse.js.map +1 -0
- package/esm/util.d.ts +166 -0
- package/esm/util.js +268 -0
- package/esm/util.js.map +1 -0
- package/package.json +52 -0
- package/src/api.ts +86 -0
- package/src/index.ts +12 -0
- package/src/parse.ts +400 -0
- package/src/util.ts +425 -0
package/src/util.ts
ADDED
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
// Fast, low-level functions for parsing and formatting GFF3.
|
|
2
|
+
// JavaScript port of Robert Buels's Bio::GFF3::LowLevel Perl module.
|
|
3
|
+
|
|
4
|
+
/**
 * Decode the percent-escapes in a GFF3 column or attribute value.
 *
 * Every `%XX` sequence (two hexadecimal digits, either case) is replaced by
 * the character whose code unit equals that hex value. Anything that is not a
 * well-formed `%XX` sequence is left untouched.
 *
 * @param stringVal - Escaped GFF3 string value
 * @returns The value with every `%XX` escape decoded
 */
export function unescape(stringVal: string): string {
  const decodeHexPair = (_whole: string, hexDigits: string): string => {
    const codeUnit = parseInt(hexDigits, 16)
    return String.fromCharCode(codeUnit)
  }
  return stringVal.replace(/%([0-9A-Fa-f]{2})/g, decodeHexPair)
}
|
|
15
|
+
|
|
16
|
+
/**
 * Percent-encode every part of a value that the given pattern matches.
 *
 * The value is converted with `String()` first. Each match is replaced by `%`
 * followed by the upper-case hexadecimal code of the match's first UTF-16 code
 * unit, left-padded with `0` to at least two digits.
 *
 * @param regex - Pattern selecting what to encode
 * @param s - Value to encode
 * @returns The encoded string
 */
function _escape(regex: RegExp, s: string | number) {
  const toPercentCode = (ch: string) => {
    const code = ch.charCodeAt(0)
    return '%' + code.toString(16).toUpperCase().padStart(2, '0')
  }
  const text = String(s)
  return text.replace(regex, toPercentCode)
}
|
|
22
|
+
|
|
23
|
+
/**
 * Percent-encode a value so it can be written as a GFF3 attribute value.
 *
 * Encoded characters: newline, carriage return, tab, `;`, `=`, `%`, `&`, `,`,
 * the control range U+0000–U+001F, and U+007F–U+00FF.
 *
 * @param rawVal - Raw GFF3 attribute value
 * @returns The escaped string
 */
export function escape(rawVal: string | number): string {
  const attributeUnsafe = /[\n;\r\t=%&,\u0000-\u001f\u007f-\u00ff]/g
  return _escape(attributeUnsafe, rawVal)
}
|
|
32
|
+
|
|
33
|
+
/**
 * Percent-encode a value so it can be written in one of the GFF3 columns.
 *
 * Encoded characters: newline, carriage return, tab, `%`, the control range
 * U+0000–U+001F, and U+007F–U+00FF. Unlike attribute values, `;`, `=`, `&`
 * and `,` are left as they are.
 *
 * @param rawVal - Raw GFF3 column value
 * @returns The escaped column value
 */
export function escapeColumn(rawVal: string | number): string {
  const columnUnsafe = /[\n\r\t%\u0000-\u001f\u007f-\u00ff]/g
  return _escape(columnUnsafe, rawVal)
}
|
|
42
|
+
|
|
43
|
+
/**
 * Parse the 9th column (attributes) of a GFF3 feature line.
 *
 * The column is split on `;` into `tag=value` pairs. Each tag is trimmed; each
 * value is split on `,`, and every piece is trimmed and unescaped. Repeated
 * tags accumulate their values in order of appearance. Pairs without a value
 * (no `=`, or nothing after the first `=`) are skipped. One trailing line
 * terminator (`\n` or `\r\n`) is ignored.
 *
 * @param attrString - String of GFF3 9th column
 * @returns Parsed attributes; an empty object for an empty or `.` column
 */
export function parseAttributes(attrString: string): GFF3Attributes {
  const attrs: GFF3Attributes = {}
  if (!attrString.length || attrString === '.') {
    return attrs
  }

  for (const pair of attrString.replace(/\r?\n$/, '').split(';')) {
    // Split on the FIRST '=' only. `split('=', 2)` truncates in JavaScript and
    // would silently drop everything after a second '=' inside the value.
    const eq = pair.indexOf('=')
    if (eq === -1 || eq === pair.length - 1) {
      continue
    }

    const tag = pair.slice(0, eq).trim()
    const values = pair
      .slice(eq + 1)
      .split(',')
      .map(piece => unescape(piece.trim()))

    // Only own properties count as "already seen": a tag named `toString` or
    // `constructor` must not pick up the member inherited from Object.prototype.
    const existing = Object.prototype.hasOwnProperty.call(attrs, tag)
      ? attrs[tag]
      : undefined
    if (existing) {
      existing.push(...values)
    } else {
      // defineProperty (rather than assignment) so that a tag literally named
      // `__proto__` becomes ordinary data instead of replacing the prototype.
      Object.defineProperty(attrs, tag, {
        value: values,
        writable: true,
        enumerable: true,
        configurable: true,
      })
    }
  }
  return attrs
}
|
|
81
|
+
|
|
82
|
+
/**
 * Parse a GFF3 feature line.
 *
 * The line is split on tabs. A column that is `.`, empty, or missing
 * altogether (the line has fewer than nine columns) becomes `null`. Only the
 * seq_id, source and type columns are unescaped. start and end are parsed as
 * base-10 integers, score as a float, and the ninth column is handed to
 * `parseAttributes`.
 *
 * @param line - GFF3 feature line
 * @returns The parsed feature
 */
export function parseFeature(line: string): GFF3FeatureLine {
  const columns = line.split('\t')

  // Normalise a column to `string | null`. Indexing past the end of a short
  // line yields `undefined`, which must not reach parseInt / parseAttributes.
  const column = (index: number): string | null => {
    const raw: string | undefined = columns[index]
    return raw === undefined || raw === '' || raw === '.' ? null : raw
  }
  const unescapedColumn = (index: number): string | null => {
    const raw = column(index)
    return raw === null ? null : unescape(raw)
  }

  const start = column(3)
  const end = column(4)
  const score = column(5)
  const attributes = column(8)

  return {
    seq_id: unescapedColumn(0),
    source: unescapedColumn(1),
    type: unescapedColumn(2),
    start: start === null ? null : parseInt(start, 10),
    end: end === null ? null : parseInt(end, 10),
    score: score === null ? null : parseFloat(score),
    strand: column(6),
    phase: column(7),
    attributes: attributes === null ? null : parseAttributes(attributes),
  }
}
|
|
106
|
+
|
|
107
|
+
/**
 * Parse a GFF3 directive line (`##name contents`).
 *
 * The first whitespace-delimited token after `##` is the directive name; the
 * rest of the line, if any, becomes `value`. For `sequence-region` the
 * contents are additionally split into `seq_id`, `start` and `end` (with
 * non-digit characters removed from start/end); for `genome-build` they are
 * split into `source` and `buildName`.
 *
 * @param line - GFF3 directive line
 * @returns The parsed directive, or `null` when the line is not a directive
 */
export function parseDirective(
  line: string,
):
  | GFF3Directive
  | GFF3SequenceRegionDirective
  | GFF3GenomeBuildDirective
  | null {
  const match = /^\s*##\s*(\S+)\s*(.*)/.exec(line)
  if (match === null) {
    return null
  }

  const name = match[1]
  let contents = match[2]

  const parsed: GFF3Directive = { directive: name }
  if (contents.length > 0) {
    contents = contents.replace(/\r?\n$/, '')
    parsed.value = contents
  }

  // a few directives get their contents broken out into named fields
  switch (name) {
    case 'sequence-region': {
      const [seqId, start, end] = contents.split(/\s+/, 3)
      return {
        ...parsed,
        seq_id: seqId,
        start: start?.replace(/\D/g, ''),
        end: end?.replace(/\D/g, ''),
      } as GFF3SequenceRegionDirective
    }
    case 'genome-build': {
      const pieces = contents.split(/\s+/, 2)
      return {
        ...parsed,
        source: pieces[0],
        buildName: pieces[1],
      } as GFF3GenomeBuildDirective
    }
    default:
      return parsed
  }
}
|
|
154
|
+
|
|
155
|
+
/**
 * Serialise an attributes object into the text of the 9th GFF3 column.
 *
 * Tags whose value is falsy are left out. Each remaining tag is written as
 * `tag=value`, with the tag escaped; an array value is written as its escaped
 * elements joined by `,`. The pairs are joined by `;`.
 *
 * @param attrs - Attributes
 * @returns GFF3 9th column string, or `.` when nothing was written
 */
export function formatAttributes(attrs: GFF3Attributes): string {
  const pairs: string[] = []
  for (const [tag, val] of Object.entries(attrs)) {
    if (!val) {
      continue
    }
    let encoded: string
    if (val.hasOwnProperty('toString')) {
      // the value carries its own toString: use that text
      encoded = escape(val.toString())
    } else if (Array.isArray(val)) {
      encoded = val.map(escape).join(',')
    } else {
      encoded = escape(val)
    }
    pairs.push(`${escape(tag)}=${encoded}`)
  }
  return pairs.length ? pairs.join(';') : '.'
}
|
|
181
|
+
|
|
182
|
+
/**
 * Format one feature line (without its children) as a line of GFF3.
 *
 * The eight leading columns are written with `escapeColumn`, the attributes
 * with `formatAttributes`. A column whose value is `null` or `undefined` is
 * written as `.`, so an absent field never appears as the text "undefined".
 *
 * @param f - The feature line to format
 * @param seenFeature - Registry of lines already emitted. It is updated by
 *   this call; a line that is already registered is not emitted again.
 * @returns The formatted line ending in a newline, or `''` for a duplicate
 */
function _formatSingleFeature(
  f: GFF3FeatureLine | GFF3FeatureLineWithRefs,
  seenFeature: Record<string, boolean | undefined>,
) {
  // Treat a missing field exactly like an explicit null.
  const column = (value: string | number | null | undefined): string =>
    value === null || value === undefined ? '.' : escapeColumn(value)

  const attrString =
    f.attributes === null || f.attributes === undefined
      ? '.'
      : formatAttributes(f.attributes)

  const fields = [
    column(f.seq_id),
    column(f.source),
    column(f.type),
    column(f.start),
    column(f.end),
    column(f.score),
    column(f.strand),
    column(f.phase),
    attrString,
  ]

  const formattedString = `${fields.join('\t')}\n`

  // if we have already output this exact feature, skip it
  if (seenFeature[formattedString]) {
    return ''
  }

  seenFeature[formattedString] = true
  return formattedString
}
|
|
213
|
+
|
|
214
|
+
/**
 * Format a feature line, or an array of them, together with everything
 * reachable through `child_features` and `derived_features`.
 *
 * For a single line the order is: the line itself, then its children, then
 * its derived features. Arrays are formatted element by element.
 *
 * @param feature - A feature line or an array of feature lines
 * @param seenFeature - Registry of lines already emitted, shared across the
 *   whole traversal
 * @returns The concatenated GFF3 text
 */
function _formatFeature(
  feature:
    | GFF3FeatureLine
    | GFF3FeatureLineWithRefs
    | (GFF3FeatureLine | GFF3FeatureLineWithRefs)[],
  seenFeature: Record<string, boolean | undefined>,
): string {
  if (Array.isArray(feature)) {
    return feature.reduce<string>(
      (text, f) => text + _formatFeature(f, seenFeature),
      '',
    )
  }

  let text = _formatSingleFeature(feature, seenFeature)
  if (_isFeatureLineWithRefs(feature)) {
    text = feature.child_features.reduce<string>(
      (acc, child) => acc + _formatFeature(child, seenFeature),
      text,
    )
    text = feature.derived_features.reduce<string>(
      (acc, derived) => acc + _formatFeature(derived, seenFeature),
      text,
    )
  }
  return text
}
|
|
234
|
+
|
|
235
|
+
/**
 * Format a feature object, or an array of feature objects, as GFF3 text.
 *
 * Each call starts with an empty registry of emitted lines, so a line is only
 * suppressed as a duplicate of another line from the same call.
 *
 * @param featureOrFeatures - A feature object or array of feature objects
 * @returns A string of one or more GFF3 lines
 */
export function formatFeature(
  featureOrFeatures:
    | GFF3FeatureLine
    | GFF3FeatureLineWithRefs
    | (GFF3FeatureLine | GFF3FeatureLineWithRefs)[],
): string {
  return _formatFeature(featureOrFeatures, {})
}
|
|
251
|
+
|
|
252
|
+
/**
 * Format a directive as a `##name` line of GFF3.
 *
 * When the directive has a non-empty `value`, it follows the name after a
 * single space.
 *
 * @param directive - A directive object
 * @returns A directive line string, ending in a newline
 */
export function formatDirective(directive: GFF3Directive): string {
  const { directive: name, value } = directive
  return value ? `##${name} ${value}\n` : `##${name}\n`
}
|
|
266
|
+
|
|
267
|
+
/**
 * Format a comment as a GFF3 comment line: `# `, the comment text, and a
 * newline.
 *
 * @param comment - A comment object
 * @returns A comment line string
 */
export function formatComment(comment: GFF3Comment): string {
  const text = comment.comment
  return '# '.concat(text, '\n')
}
|
|
277
|
+
|
|
278
|
+
/**
 * Format a sequence object as one FASTA record.
 *
 * The header line is `>` plus the id, followed by a space and the description
 * when a non-empty description is present. The sequence follows on the next
 * line exactly as given.
 *
 * @param seq - A sequence object
 * @returns Formatted single FASTA sequence string
 */
export function formatSequence(seq: GFF3Sequence): string {
  const header = seq.description
    ? `>${seq.id} ${seq.description}`
    : `>${seq.id}`
  return `${header}\n${seq.sequence}\n`
}
|
|
289
|
+
|
|
290
|
+
/**
 * Format a directive, comment, sequence, or feature, or an array of such
 * items, as GFF3 text.
 *
 * The kind of each item is decided by which property it has, checked in this
 * order: `attributes` (feature), `directive`, `sequence`, `comment`. An item
 * with none of them is rendered as a placeholder comment line.
 *
 * @param itemOrItems - A directive, comment, sequence, or feature, or array
 *   of such items
 * @returns One formatted string for a single item, or an array with one
 *   formatted string per item when an array was passed
 */
export function formatItem(
  itemOrItems:
    | GFF3FeatureLineWithRefs
    | GFF3Directive
    | GFF3Comment
    | GFF3Sequence
    | (GFF3FeatureLineWithRefs | GFF3Directive | GFF3Comment | GFF3Sequence)[],
): string | string[] {
  const formatOne = (
    item: GFF3FeatureLineWithRefs | GFF3Directive | GFF3Comment | GFF3Sequence,
  ): string => {
    if ('attributes' in item) {
      return formatFeature(item)
    }
    if ('directive' in item) {
      return formatDirective(item)
    }
    if ('sequence' in item) {
      return formatSequence(item)
    }
    if ('comment' in item) {
      return formatComment(item)
    }
    return '# (invalid item found during format)\n'
  }

  return Array.isArray(itemOrItems)
    ? itemOrItems.map(item => formatOne(item))
    : formatOne(itemOrItems)
}
|
|
328
|
+
|
|
329
|
+
/**
 * GFF3 attributes keyed by attribute tag. Each tag maps to the list of values
 * recorded for it, or to `undefined`.
 */
export type GFF3Attributes = Record<string, string[] | undefined>
|
|
331
|
+
|
|
332
|
+
/**
 * One feature line of a GFF3 file, column by column. A column without a value
 * is represented as `null`.
 */
export interface GFF3FeatureLine {
  /** ID of the landmark that establishes the coordinate system for this feature */
  seq_id: string | null
  /** Free-text qualifier describing the algorithm or operating procedure that generated this feature */
  source: string | null
  /** Type of the feature */
  type: string | null
  /** Start coordinate of the feature */
  start: number | null
  /** End coordinate of the feature */
  end: number | null
  /** Score of the feature */
  score: number | null
  /** Strand of the feature */
  strand: string | null
  /** For "CDS" features: where the next codon begins relative to the 5' end of this CDS feature */
  phase: string | null
  /** Attributes of the feature */
  attributes: GFF3Attributes | null
}
|
|
353
|
+
|
|
354
|
+
/**
 * A GFF3 feature line that also carries references to the features that name
 * it in their "Parent" or "Derives_from" attributes.
 */
export interface GFF3FeatureLineWithRefs extends GFF3FeatureLine {
  /** Features that are children of this feature */
  child_features: GFF3Feature[]
  /** Features that are derived from this feature */
  derived_features: GFF3Feature[]
}
|
|
364
|
+
|
|
365
|
+
/**
 * Type guard: does this feature line carry both reference lists?
 *
 * @param featureLine - The feature line to inspect
 * @returns `true` when neither `child_features` nor `derived_features` is
 *   `undefined`
 */
function _isFeatureLineWithRefs(
  featureLine: GFF3FeatureLine | GFF3FeatureLineWithRefs,
): featureLine is GFF3FeatureLineWithRefs {
  const candidate = featureLine as GFF3FeatureLineWithRefs
  if (candidate.child_features === undefined) {
    return false
  }
  return candidate.derived_features !== undefined
}
|
|
373
|
+
|
|
374
|
+
/**
 * A GFF3 feature: the list of feature lines that together make up the
 * feature (there may be more than one line).
 */
export type GFF3Feature = GFF3FeatureLineWithRefs[]
|
|
378
|
+
|
|
379
|
+
/** A GFF3 directive (a `##name value` line) */
export interface GFF3Directive {
  /** Name of the directive, i.e. the token that follows `##` */
  directive: string
  /** Text that follows the directive name, when there is any */
  value?: string
}
|
|
386
|
+
|
|
387
|
+
/** A GFF3 `sequence-region` directive with its contents broken out into fields */
export interface GFF3SequenceRegionDirective extends GFF3Directive {
  /** Text that follows the directive name */
  value: string
  /** Sequence ID taken from the directive */
  seq_id: string
  /** Start of the region taken from the directive, kept as a string */
  start: string
  /** End of the region taken from the directive, kept as a string */
  end: string
}
|
|
398
|
+
|
|
399
|
+
/** A GFF3 `genome-build` directive with its contents broken out into fields */
export interface GFF3GenomeBuildDirective extends GFF3Directive {
  /** Text that follows the directive name */
  value: string
  /** Source of the genome build taken from the directive */
  source: string
  /** Name of the genome build taken from the directive */
  buildName: string
}
|
|
408
|
+
|
|
409
|
+
/** A GFF3 comment */
export interface GFF3Comment {
  /** Text of the comment, without the leading `#` */
  comment: string
}
|
|
414
|
+
|
|
415
|
+
/** A single sequence from the FASTA part of a GFF3 file */
export interface GFF3Sequence {
  /** ID of the sequence */
  id: string
  /** Description of the sequence, when there is one */
  description?: string
  /** The sequence itself */
  sequence: string
}
|
|
424
|
+
|
|
425
|
+
/** Any one item of a GFF3 file: a feature, a directive, a comment, or a sequence */
export type GFF3Item =
  | GFF3Feature
  | GFF3Directive
  | GFF3Comment
  | GFF3Sequence
|