gff-nostream 3.0.11 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -94
- package/dist/api.d.ts +21 -34
- package/dist/api.js +53 -147
- package/dist/api.js.map +1 -1
- package/dist/index.d.ts +2 -2
- package/dist/index.js +1 -3
- package/dist/index.js.map +1 -1
- package/dist/util.d.ts +19 -120
- package/dist/util.js +29 -179
- package/dist/util.js.map +1 -1
- package/esm/api.d.ts +21 -34
- package/esm/api.js +54 -146
- package/esm/api.js.map +1 -1
- package/esm/index.d.ts +2 -2
- package/esm/index.js +1 -1
- package/esm/index.js.map +1 -1
- package/esm/util.d.ts +19 -120
- package/esm/util.js +29 -172
- package/esm/util.js.map +1 -1
- package/package.json +1 -1
- package/src/api.ts +80 -187
- package/src/index.ts +2 -18
- package/src/util.ts +39 -308
package/src/util.ts
CHANGED
|
@@ -1,10 +1,6 @@
|
|
|
1
|
-
// Fast, low-level functions for parsing
|
|
1
|
+
// Fast, low-level functions for parsing GFF3.
|
|
2
2
|
// JavaScript port of Robert Buels's Bio::GFF3::LowLevel Perl module.
|
|
3
3
|
|
|
4
|
-
const directiveRegex = /^\s*##\s*(\S+)\s*(.*)/
|
|
5
|
-
const whitespaceRegex = /\s+/
|
|
6
|
-
const nonDigitRegex = /\D/g
|
|
7
|
-
|
|
8
4
|
const HEX_LOOKUP: Record<string, string | undefined> = {}
|
|
9
5
|
for (let i = 0; i < 256; i++) {
|
|
10
6
|
const hex = i.toString(16).toUpperCase().padStart(2, '0')
|
|
@@ -18,7 +14,6 @@ for (let i = 0; i < 256; i++) {
|
|
|
18
14
|
* @param stringVal - Escaped GFF3 string value
|
|
19
15
|
* @returns An unescaped string value
|
|
20
16
|
*/
|
|
21
|
-
|
|
22
17
|
export function unescape(stringVal: string) {
|
|
23
18
|
const idx = stringVal.indexOf('%')
|
|
24
19
|
if (idx === -1) {
|
|
@@ -30,18 +25,17 @@ export function unescape(stringVal: string) {
|
|
|
30
25
|
let i = idx
|
|
31
26
|
|
|
32
27
|
while (i < stringVal.length) {
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
} else {
|
|
40
|
-
result += stringVal.slice(i, i + 3)
|
|
41
|
-
}
|
|
28
|
+
const char =
|
|
29
|
+
stringVal[i] === '%' && i + 2 < stringVal.length
|
|
30
|
+
? HEX_LOOKUP[stringVal.slice(i + 1, i + 3)]
|
|
31
|
+
: undefined
|
|
32
|
+
if (char !== undefined) {
|
|
33
|
+
result += stringVal.slice(lastIdx, i) + char
|
|
42
34
|
i += 3
|
|
43
35
|
lastIdx = i
|
|
44
36
|
} else {
|
|
37
|
+
// Not a valid escape: advance one char so a '%' that begins a real
|
|
38
|
+
// escape immediately after isn't swallowed (e.g. the %20 in "a%b%20c").
|
|
45
39
|
i++
|
|
46
40
|
}
|
|
47
41
|
}
|
|
@@ -49,79 +43,6 @@ export function unescape(stringVal: string) {
|
|
|
49
43
|
return result + stringVal.slice(lastIdx)
|
|
50
44
|
}
|
|
51
45
|
|
|
52
|
-
function parseAttributesImpl(
|
|
53
|
-
attrString: string,
|
|
54
|
-
shouldUnescape: boolean,
|
|
55
|
-
): GFF3Attributes {
|
|
56
|
-
if (attrString.length === 0 || attrString === '.') {
|
|
57
|
-
return {}
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
const attrs: GFF3Attributes = {}
|
|
61
|
-
let len = attrString.length
|
|
62
|
-
|
|
63
|
-
if (attrString[len - 1] === '\n') {
|
|
64
|
-
len = attrString[len - 2] === '\r' ? len - 2 : len - 1
|
|
65
|
-
attrString = attrString.slice(0, len)
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
let start = 0
|
|
69
|
-
while (start < len) {
|
|
70
|
-
let semiIdx = attrString.indexOf(';', start)
|
|
71
|
-
if (semiIdx === -1) {
|
|
72
|
-
semiIdx = len
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
if (semiIdx > start) {
|
|
76
|
-
const eqIdx = attrString.indexOf('=', start)
|
|
77
|
-
if (eqIdx !== -1 && eqIdx < semiIdx && eqIdx + 1 < semiIdx) {
|
|
78
|
-
const tag = attrString.slice(start, eqIdx)
|
|
79
|
-
let arec = attrs[tag]
|
|
80
|
-
if (!arec) {
|
|
81
|
-
arec = []
|
|
82
|
-
attrs[tag] = arec
|
|
83
|
-
}
|
|
84
|
-
|
|
85
|
-
let valStart = eqIdx + 1
|
|
86
|
-
while (valStart < semiIdx) {
|
|
87
|
-
let commaIdx = attrString.indexOf(',', valStart)
|
|
88
|
-
if (commaIdx === -1 || commaIdx > semiIdx) {
|
|
89
|
-
commaIdx = semiIdx
|
|
90
|
-
}
|
|
91
|
-
if (commaIdx > valStart) {
|
|
92
|
-
const val = attrString.slice(valStart, commaIdx)
|
|
93
|
-
arec.push(shouldUnescape ? unescape(val) : val)
|
|
94
|
-
}
|
|
95
|
-
valStart = commaIdx + 1
|
|
96
|
-
}
|
|
97
|
-
}
|
|
98
|
-
}
|
|
99
|
-
start = semiIdx + 1
|
|
100
|
-
}
|
|
101
|
-
return attrs
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
/**
|
|
105
|
-
* Parse the 9th column (attributes) of a GFF3 feature line.
|
|
106
|
-
*
|
|
107
|
-
* @param attrString - String of GFF3 9th column
|
|
108
|
-
* @returns Parsed attributes
|
|
109
|
-
*/
|
|
110
|
-
export function parseAttributes(attrString: string): GFF3Attributes {
|
|
111
|
-
return parseAttributesImpl(attrString, true)
|
|
112
|
-
}
|
|
113
|
-
|
|
114
|
-
/**
|
|
115
|
-
* Parse the 9th column (attributes) of a GFF3 feature line without unescaping.
|
|
116
|
-
* Fast path for data known to contain no escaped characters.
|
|
117
|
-
*
|
|
118
|
-
* @param attrString - String of GFF3 9th column
|
|
119
|
-
* @returns Parsed attributes
|
|
120
|
-
*/
|
|
121
|
-
export function parseAttributesNoUnescape(attrString: string): GFF3Attributes {
|
|
122
|
-
return parseAttributesImpl(attrString, false)
|
|
123
|
-
}
|
|
124
|
-
|
|
125
46
|
function isEmpty(s: string) {
|
|
126
47
|
return s.length === 0 || s === '.'
|
|
127
48
|
}
|
|
@@ -134,189 +55,6 @@ function strField<E extends null | ''>(
|
|
|
134
55
|
return isEmpty(s) ? empty : shouldUnescape ? unescape(s) : s
|
|
135
56
|
}
|
|
136
57
|
|
|
137
|
-
function numField(s: string) {
|
|
138
|
-
return isEmpty(s) ? null : +s
|
|
139
|
-
}
|
|
140
|
-
|
|
141
|
-
function parseFeatureImpl(
|
|
142
|
-
line: string,
|
|
143
|
-
shouldUnescape: boolean,
|
|
144
|
-
): GFF3FeatureLine {
|
|
145
|
-
const f = line.split('\t')
|
|
146
|
-
const attrString = f[8]!
|
|
147
|
-
return {
|
|
148
|
-
seq_id: strField(f[0]!, shouldUnescape, null),
|
|
149
|
-
source: strField(f[1]!, shouldUnescape, null),
|
|
150
|
-
type: strField(f[2]!, shouldUnescape, null),
|
|
151
|
-
start: numField(f[3]!),
|
|
152
|
-
end: numField(f[4]!),
|
|
153
|
-
score: numField(f[5]!),
|
|
154
|
-
strand: strField(f[6]!, false, null),
|
|
155
|
-
phase: strField(f[7]!, false, null),
|
|
156
|
-
attributes: isEmpty(attrString)
|
|
157
|
-
? null
|
|
158
|
-
: parseAttributesImpl(attrString, shouldUnescape),
|
|
159
|
-
}
|
|
160
|
-
}
|
|
161
|
-
|
|
162
|
-
/**
|
|
163
|
-
* Parse a GFF3 feature line
|
|
164
|
-
*
|
|
165
|
-
* @param line - GFF3 feature line
|
|
166
|
-
* @returns The parsed feature
|
|
167
|
-
*/
|
|
168
|
-
export function parseFeature(line: string): GFF3FeatureLine {
|
|
169
|
-
return parseFeatureImpl(line, true)
|
|
170
|
-
}
|
|
171
|
-
|
|
172
|
-
/**
|
|
173
|
-
* Parse a GFF3 feature line without unescaping.
|
|
174
|
-
* Fast path for data known to contain no escaped characters.
|
|
175
|
-
*
|
|
176
|
-
* @param line - GFF3 feature line
|
|
177
|
-
* @returns The parsed feature
|
|
178
|
-
*/
|
|
179
|
-
export function parseFeatureNoUnescape(line: string): GFF3FeatureLine {
|
|
180
|
-
return parseFeatureImpl(line, false)
|
|
181
|
-
}
|
|
182
|
-
|
|
183
|
-
/**
|
|
184
|
-
* Parse a GFF3 directive line.
|
|
185
|
-
*
|
|
186
|
-
* @param line - GFF3 directive line
|
|
187
|
-
* @returns The parsed directive
|
|
188
|
-
*/
|
|
189
|
-
export function parseDirective(
|
|
190
|
-
line: string,
|
|
191
|
-
):
|
|
192
|
-
| GFF3Directive
|
|
193
|
-
| GFF3SequenceRegionDirective
|
|
194
|
-
| GFF3GenomeBuildDirective
|
|
195
|
-
| null {
|
|
196
|
-
const match = directiveRegex.exec(line)
|
|
197
|
-
if (!match) {
|
|
198
|
-
return null
|
|
199
|
-
}
|
|
200
|
-
|
|
201
|
-
const name = match[1]!
|
|
202
|
-
const contents = match[2]!
|
|
203
|
-
|
|
204
|
-
const parsed: GFF3Directive = { directive: name }
|
|
205
|
-
if (contents.length) {
|
|
206
|
-
parsed.value = contents.trimEnd()
|
|
207
|
-
}
|
|
208
|
-
|
|
209
|
-
if (name === 'sequence-region') {
|
|
210
|
-
const c = contents.split(whitespaceRegex, 3)
|
|
211
|
-
return {
|
|
212
|
-
...parsed,
|
|
213
|
-
seq_id: c[0]!,
|
|
214
|
-
start: c[1]!.replaceAll(nonDigitRegex, ''),
|
|
215
|
-
end: c[2]!.replaceAll(nonDigitRegex, ''),
|
|
216
|
-
}
|
|
217
|
-
} else if (name === 'genome-build') {
|
|
218
|
-
const [source, buildName] = contents.split(whitespaceRegex, 2)
|
|
219
|
-
return {
|
|
220
|
-
...parsed,
|
|
221
|
-
source: source!,
|
|
222
|
-
buildName: buildName!,
|
|
223
|
-
}
|
|
224
|
-
}
|
|
225
|
-
|
|
226
|
-
return parsed
|
|
227
|
-
}
|
|
228
|
-
|
|
229
|
-
/** A record of GFF3 attribute identifiers and the values of those identifiers */
|
|
230
|
-
export type GFF3Attributes = Record<string, string[] | undefined>
|
|
231
|
-
|
|
232
|
-
/** A representation of a single line of a GFF3 file */
|
|
233
|
-
export interface GFF3FeatureLine {
|
|
234
|
-
/** The ID of the landmark used to establish the coordinate system for the current feature */
|
|
235
|
-
seq_id: string | null
|
|
236
|
-
/** A free text qualifier intended to describe the algorithm or operating procedure that generated this feature */
|
|
237
|
-
source: string | null
|
|
238
|
-
/** The type of the feature */
|
|
239
|
-
type: string | null
|
|
240
|
-
/** The start coordinates of the feature */
|
|
241
|
-
start: number | null
|
|
242
|
-
/** The end coordinates of the feature */
|
|
243
|
-
end: number | null
|
|
244
|
-
/** The score of the feature */
|
|
245
|
-
score: number | null
|
|
246
|
-
/** The strand of the feature */
|
|
247
|
-
strand: string | null
|
|
248
|
-
/** For features of type "CDS", the phase indicates where the next codon begins relative to the 5' end of the current CDS feature */
|
|
249
|
-
phase: string | null
|
|
250
|
-
/** Feature attributes */
|
|
251
|
-
attributes: GFF3Attributes | null
|
|
252
|
-
}
|
|
253
|
-
|
|
254
|
-
/**
|
|
255
|
-
* A GFF3 Feature line that includes references to other features defined in
|
|
256
|
-
* their "Parent" or "Derives_from" attributes
|
|
257
|
-
*/
|
|
258
|
-
export interface GFF3FeatureLineWithRefs extends GFF3FeatureLine {
|
|
259
|
-
/** An array of child features */
|
|
260
|
-
child_features: GFF3Feature[]
|
|
261
|
-
/** An array of features derived from this feature */
|
|
262
|
-
derived_features: GFF3Feature[]
|
|
263
|
-
}
|
|
264
|
-
|
|
265
|
-
/**
|
|
266
|
-
* A GFF3 feature, which may include multiple individual feature lines
|
|
267
|
-
*/
|
|
268
|
-
export type GFF3Feature = GFF3FeatureLineWithRefs[]
|
|
269
|
-
|
|
270
|
-
/** A GFF3 directive */
|
|
271
|
-
export interface GFF3Directive {
|
|
272
|
-
/** The name of the directive */
|
|
273
|
-
directive: string
|
|
274
|
-
/** The string value of the directive */
|
|
275
|
-
value?: string
|
|
276
|
-
}
|
|
277
|
-
|
|
278
|
-
/** A GFF3 sequence-region directive */
|
|
279
|
-
export interface GFF3SequenceRegionDirective extends GFF3Directive {
|
|
280
|
-
/** The string value of the directive */
|
|
281
|
-
value: string
|
|
282
|
-
/** The sequence ID parsed from the directive */
|
|
283
|
-
seq_id: string
|
|
284
|
-
/** The sequence start parsed from the directive */
|
|
285
|
-
start: string
|
|
286
|
-
/** The sequence end parsed from the directive */
|
|
287
|
-
end: string
|
|
288
|
-
}
|
|
289
|
-
|
|
290
|
-
/** A GFF3 genome-build directive */
|
|
291
|
-
export interface GFF3GenomeBuildDirective extends GFF3Directive {
|
|
292
|
-
/** The string value of the directive */
|
|
293
|
-
value: string
|
|
294
|
-
/** The genome build source parsed from the directive */
|
|
295
|
-
source: string
|
|
296
|
-
/** The genome build name parsed from the directive */
|
|
297
|
-
buildName: string
|
|
298
|
-
}
|
|
299
|
-
|
|
300
|
-
/** A GFF3 comment */
|
|
301
|
-
export interface GFF3Comment {
|
|
302
|
-
/** The text of the comment */
|
|
303
|
-
comment: string
|
|
304
|
-
}
|
|
305
|
-
|
|
306
|
-
/** A GFF3 FASTA single sequence */
|
|
307
|
-
export interface GFF3Sequence {
|
|
308
|
-
/** The ID of the sequence */
|
|
309
|
-
id: string
|
|
310
|
-
/** The description of the sequence */
|
|
311
|
-
description?: string
|
|
312
|
-
/** The sequence */
|
|
313
|
-
sequence: string
|
|
314
|
-
}
|
|
315
|
-
|
|
316
|
-
export type GFF3Item = GFF3Feature | GFF3Directive | GFF3Comment | GFF3Sequence
|
|
317
|
-
|
|
318
|
-
// JBrowse format types and parsing functions
|
|
319
|
-
|
|
320
58
|
const JBROWSE_DEFAULT_FIELDS = new Set([
|
|
321
59
|
'start',
|
|
322
60
|
'end',
|
|
@@ -352,7 +90,18 @@ const COMMON_ATTRS: Record<string, string | undefined> = {
|
|
|
352
90
|
gap: 'gap',
|
|
353
91
|
}
|
|
354
92
|
|
|
355
|
-
|
|
93
|
+
const STRAND_MAP: Record<string, number | undefined> = {
|
|
94
|
+
'+': 1,
|
|
95
|
+
'-': -1,
|
|
96
|
+
'.': 0,
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
/**
|
|
100
|
+
* A parsed GFF3 feature: a flat object with 0-based half-open coordinates,
|
|
101
|
+
* numeric strand (`1`/`-1`/`0`), attributes spread as lowercase top-level keys,
|
|
102
|
+
* and child features nested under `subfeatures`.
|
|
103
|
+
*/
|
|
104
|
+
export interface GffFeature {
|
|
356
105
|
start: number
|
|
357
106
|
end: number
|
|
358
107
|
strand?: number
|
|
@@ -361,11 +110,16 @@ export interface JBrowseFeature {
|
|
|
361
110
|
refName: string
|
|
362
111
|
phase?: number
|
|
363
112
|
score?: number
|
|
364
|
-
subfeatures:
|
|
113
|
+
subfeatures: GffFeature[]
|
|
365
114
|
[key: string]: unknown
|
|
366
115
|
}
|
|
367
116
|
|
|
368
|
-
|
|
117
|
+
/**
|
|
118
|
+
* Parse the 9th column (attributes) of a GFF3 feature line into `result`,
|
|
119
|
+
* lowercasing keys and suffixing any that collide with a default field name.
|
|
120
|
+
* Pass shouldUnescape=false as a fast path for data with no escaped characters.
|
|
121
|
+
*/
|
|
122
|
+
export function parseAttributes(
|
|
369
123
|
attrString: string,
|
|
370
124
|
result: Record<string, unknown>,
|
|
371
125
|
shouldUnescape: boolean,
|
|
@@ -420,38 +174,23 @@ function parseAttributesJBrowseImpl(
|
|
|
420
174
|
}
|
|
421
175
|
}
|
|
422
176
|
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
export function
|
|
431
|
-
attrString: string,
|
|
432
|
-
result: Record<string, unknown>,
|
|
433
|
-
) {
|
|
434
|
-
parseAttributesJBrowseImpl(attrString, result, false)
|
|
435
|
-
}
|
|
436
|
-
|
|
437
|
-
const STRAND_MAP: Record<string, number | undefined> = {
|
|
438
|
-
'+': 1,
|
|
439
|
-
'-': -1,
|
|
440
|
-
'.': 0,
|
|
441
|
-
}
|
|
442
|
-
|
|
443
|
-
function parseFeatureJBrowseImpl(
|
|
444
|
-
line: string,
|
|
445
|
-
shouldUnescape: boolean,
|
|
446
|
-
): JBrowseFeature {
|
|
177
|
+
/**
|
|
178
|
+
* Parse a GFF3 feature line. Unescaping is skipped entirely for lines with no
|
|
179
|
+
* '%' character, which is the common case.
|
|
180
|
+
*
|
|
181
|
+
* @param line - GFF3 feature line
|
|
182
|
+
* @returns The parsed feature
|
|
183
|
+
*/
|
|
184
|
+
export function parseFeature(line: string): GffFeature {
|
|
447
185
|
const f = line.split('\t')
|
|
186
|
+
const shouldUnescape = line.includes('%')
|
|
448
187
|
const startStr = f[3]!
|
|
449
188
|
const endStr = f[4]!
|
|
450
189
|
const scoreStr = f[5]!
|
|
451
190
|
const phase = f[7]!
|
|
452
191
|
const attrString = f[8]!
|
|
453
192
|
|
|
454
|
-
const result:
|
|
193
|
+
const result: GffFeature = {
|
|
455
194
|
refName: strField(f[0]!, shouldUnescape, ''),
|
|
456
195
|
source: strField(f[1]!, shouldUnescape, null),
|
|
457
196
|
type: strField(f[2]!, shouldUnescape, null),
|
|
@@ -463,14 +202,6 @@ function parseFeatureJBrowseImpl(
|
|
|
463
202
|
subfeatures: [],
|
|
464
203
|
}
|
|
465
204
|
|
|
466
|
-
|
|
205
|
+
parseAttributes(attrString, result, shouldUnescape)
|
|
467
206
|
return result
|
|
468
207
|
}
|
|
469
|
-
|
|
470
|
-
export function parseFeatureJBrowse(line: string): JBrowseFeature {
|
|
471
|
-
return parseFeatureJBrowseImpl(line, true)
|
|
472
|
-
}
|
|
473
|
-
|
|
474
|
-
export function parseFeatureJBrowseNoUnescape(line: string): JBrowseFeature {
|
|
475
|
-
return parseFeatureJBrowseImpl(line, false)
|
|
476
|
-
}
|