gff-nostream 1.3.9 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api.d.ts +33 -2
- package/dist/api.js +91 -3
- package/dist/api.js.map +1 -1
- package/dist/index.d.ts +3 -3
- package/dist/index.js +6 -3
- package/dist/index.js.map +1 -1
- package/dist/parse.d.ts +3 -16
- package/dist/parse.js +80 -124
- package/dist/parse.js.map +1 -1
- package/dist/util.d.ts +17 -0
- package/dist/util.js +160 -59
- package/dist/util.js.map +1 -1
- package/esm/api.d.ts +33 -2
- package/esm/api.js +87 -2
- package/esm/api.js.map +1 -1
- package/esm/index.d.ts +3 -3
- package/esm/index.js +2 -2
- package/esm/index.js.map +1 -1
- package/esm/parse.d.ts +3 -16
- package/esm/parse.js +103 -142
- package/esm/parse.js.map +1 -1
- package/esm/util.d.ts +17 -0
- package/esm/util.js +148 -58
- package/esm/util.js.map +1 -1
- package/package.json +27 -16
- package/src/api.ts +113 -12
- package/src/index.ts +6 -5
- package/src/parse.ts +80 -146
- package/src/util.ts +165 -62
package/src/util.ts
CHANGED
|
@@ -1,14 +1,28 @@
|
|
|
1
1
|
// Fast, low-level functions for parsing and formatting GFF3.
|
|
2
2
|
// JavaScript port of Robert Buels's Bio::GFF3::LowLevel Perl module.
|
|
3
3
|
|
|
4
|
+
const escapeRegex = /%([0-9A-Fa-f]{2})/g
|
|
5
|
+
const directiveRegex = /^\s*##\s*(\S+)\s*(.*)/
|
|
6
|
+
const lineEndRegex = /\r?\n$/
|
|
7
|
+
const whitespaceRegex = /\s+/
|
|
8
|
+
const nonDigitRegex = /\D/g
|
|
9
|
+
// eslint-disable-next-line no-control-regex
|
|
10
|
+
const attrEscapeRegex = /[\n;\r\t=%&,\u0000-\u001f\u007f-\u00ff]/g
|
|
11
|
+
// eslint-disable-next-line no-control-regex
|
|
12
|
+
const columnEscapeRegex = /[\n\r\t%\u0000-\u001f\u007f-\u00ff]/g
|
|
13
|
+
|
|
4
14
|
/**
|
|
5
15
|
* Unescape a string value used in a GFF3 attribute.
|
|
6
16
|
*
|
|
7
17
|
* @param stringVal - Escaped GFF3 string value
|
|
8
18
|
* @returns An unescaped string value
|
|
9
19
|
*/
|
|
20
|
+
|
|
10
21
|
export function unescape(stringVal: string): string {
|
|
11
|
-
|
|
22
|
+
if (!stringVal.includes('%')) {
|
|
23
|
+
return stringVal
|
|
24
|
+
}
|
|
25
|
+
return stringVal.replaceAll(escapeRegex, (_match, seq) =>
|
|
12
26
|
String.fromCharCode(parseInt(seq, 16)),
|
|
13
27
|
)
|
|
14
28
|
}
|
|
@@ -27,7 +41,7 @@ function _escape(regex: RegExp, s: string | number) {
|
|
|
27
41
|
* @returns An escaped string value
|
|
28
42
|
*/
|
|
29
43
|
export function escape(rawVal: string | number): string {
|
|
30
|
-
return _escape(
|
|
44
|
+
return _escape(attrEscapeRegex, rawVal)
|
|
31
45
|
}
|
|
32
46
|
|
|
33
47
|
/**
|
|
@@ -37,7 +51,7 @@ export function escape(rawVal: string | number): string {
|
|
|
37
51
|
* @returns An escaped column value
|
|
38
52
|
*/
|
|
39
53
|
export function escapeColumn(rawVal: string | number): string {
|
|
40
|
-
return _escape(
|
|
54
|
+
return _escape(columnEscapeRegex, rawVal)
|
|
41
55
|
}
|
|
42
56
|
|
|
43
57
|
/**
|
|
@@ -53,29 +67,75 @@ export function parseAttributes(attrString: string): GFF3Attributes {
|
|
|
53
67
|
|
|
54
68
|
const attrs: GFF3Attributes = {}
|
|
55
69
|
|
|
56
|
-
attrString
|
|
57
|
-
|
|
58
|
-
.
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
70
|
+
let str = attrString
|
|
71
|
+
if (str.endsWith('\n')) {
|
|
72
|
+
str = str.slice(0, str.endsWith('\r\n') ? -2 : -1)
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
for (const a of str.split(';')) {
|
|
76
|
+
const eqIdx = a.indexOf('=')
|
|
77
|
+
if (eqIdx === -1) {
|
|
78
|
+
continue
|
|
79
|
+
}
|
|
80
|
+
const value = a.slice(eqIdx + 1)
|
|
81
|
+
if (!value.length) {
|
|
82
|
+
continue
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
const tag = a.slice(0, eqIdx).trim()
|
|
86
|
+
let arec = attrs[tag]
|
|
87
|
+
if (!arec) {
|
|
88
|
+
arec = []
|
|
89
|
+
attrs[tag] = arec
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
for (const s of value.split(',')) {
|
|
93
|
+
arec.push(unescape(s.trim()))
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
return attrs
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
/**
|
|
100
|
+
* Parse the 9th column (attributes) of a GFF3 feature line without unescaping.
|
|
101
|
+
* Fast path for data known to contain no escaped characters.
|
|
102
|
+
*
|
|
103
|
+
* @param attrString - String of GFF3 9th column
|
|
104
|
+
* @returns Parsed attributes
|
|
105
|
+
*/
|
|
106
|
+
export function parseAttributesNoUnescape(attrString: string): GFF3Attributes {
|
|
107
|
+
if (!attrString.length || attrString === '.') {
|
|
108
|
+
return {}
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
const attrs: GFF3Attributes = {}
|
|
112
|
+
|
|
113
|
+
let str = attrString
|
|
114
|
+
if (str.endsWith('\n')) {
|
|
115
|
+
str = str.slice(0, str.endsWith('\r\n') ? -2 : -1)
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
for (const a of str.split(';')) {
|
|
119
|
+
const eqIdx = a.indexOf('=')
|
|
120
|
+
if (eqIdx === -1) {
|
|
121
|
+
continue
|
|
122
|
+
}
|
|
123
|
+
const value = a.slice(eqIdx + 1)
|
|
124
|
+
if (!value.length) {
|
|
125
|
+
continue
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
const tag = a.slice(0, eqIdx).trim()
|
|
129
|
+
let arec = attrs[tag]
|
|
130
|
+
if (!arec) {
|
|
131
|
+
arec = []
|
|
132
|
+
attrs[tag] = arec
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
for (const s of value.split(',')) {
|
|
136
|
+
arec.push(s.trim())
|
|
137
|
+
}
|
|
138
|
+
}
|
|
79
139
|
return attrs
|
|
80
140
|
}
|
|
81
141
|
|
|
@@ -86,22 +146,72 @@ export function parseAttributes(attrString: string): GFF3Attributes {
|
|
|
86
146
|
* @returns The parsed feature
|
|
87
147
|
*/
|
|
88
148
|
export function parseFeature(line: string): GFF3FeatureLine {
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
149
|
+
return parseFieldsArray(line.split('\t'))
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
/**
|
|
153
|
+
* Parse a GFF3 feature from a pre-split fields array
|
|
154
|
+
*
|
|
155
|
+
* @param f - Array of 9 GFF3 column values (null, undefined, '' or '.' are all treated as an empty value)
|
|
156
|
+
* @returns The parsed feature
|
|
157
|
+
*/
|
|
158
|
+
function norm(a: string | null | undefined) {
|
|
159
|
+
return a === '.' || a === '' || a === undefined ? null : a
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
export function parseFieldsArray(f: (string | null | undefined)[]): GFF3FeatureLine {
|
|
163
|
+
const seq_id = norm(f[0])
|
|
164
|
+
const source = norm(f[1])
|
|
165
|
+
const type = norm(f[2])
|
|
166
|
+
const start = norm(f[3])
|
|
167
|
+
const end = norm(f[4])
|
|
168
|
+
const score = norm(f[5])
|
|
169
|
+
const strand = norm(f[6])
|
|
170
|
+
const phase = norm(f[7])
|
|
171
|
+
const attrString = norm(f[8])
|
|
172
|
+
|
|
173
|
+
return {
|
|
174
|
+
seq_id: seq_id ? unescape(seq_id) : null,
|
|
175
|
+
source: source ? unescape(source) : null,
|
|
176
|
+
type: type ? unescape(type) : null,
|
|
177
|
+
start: start === null ? null : parseInt(start, 10),
|
|
178
|
+
end: end === null ? null : parseInt(end, 10),
|
|
179
|
+
score: score === null ? null : parseFloat(score),
|
|
180
|
+
strand,
|
|
181
|
+
phase,
|
|
182
|
+
attributes: attrString === null ? null : parseAttributes(attrString),
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
/**
|
|
187
|
+
* Parse a GFF3 feature from a pre-split fields array without unescaping.
|
|
188
|
+
* Fast path for data known to contain no escaped characters.
|
|
189
|
+
*
|
|
190
|
+
* @param f - Array of 9 GFF3 column values (null, undefined, '' or '.' are all treated as an empty value)
|
|
191
|
+
* @returns The parsed feature
|
|
192
|
+
*/
|
|
193
|
+
export function parseFieldsArrayNoUnescape(f: (string | null | undefined)[]): GFF3FeatureLine {
|
|
194
|
+
const seq_id = norm(f[0])
|
|
195
|
+
const source = norm(f[1])
|
|
196
|
+
const type = norm(f[2])
|
|
197
|
+
const start = norm(f[3])
|
|
198
|
+
const end = norm(f[4])
|
|
199
|
+
const score = norm(f[5])
|
|
200
|
+
const strand = norm(f[6])
|
|
201
|
+
const phase = norm(f[7])
|
|
202
|
+
const attrString = norm(f[8])
|
|
203
|
+
|
|
204
|
+
return {
|
|
205
|
+
seq_id,
|
|
206
|
+
source,
|
|
207
|
+
type,
|
|
208
|
+
start: start === null ? null : parseInt(start, 10),
|
|
209
|
+
end: end === null ? null : parseInt(end, 10),
|
|
210
|
+
score: score === null ? null : parseFloat(score),
|
|
211
|
+
strand,
|
|
212
|
+
phase,
|
|
213
|
+
attributes: attrString === null ? null : parseAttributesNoUnescape(attrString),
|
|
103
214
|
}
|
|
104
|
-
return parsed
|
|
105
215
|
}
|
|
106
216
|
|
|
107
217
|
/**
|
|
@@ -117,7 +227,7 @@ export function parseDirective(
|
|
|
117
227
|
| GFF3SequenceRegionDirective
|
|
118
228
|
| GFF3GenomeBuildDirective
|
|
119
229
|
| null {
|
|
120
|
-
const match =
|
|
230
|
+
const match = directiveRegex.exec(line)
|
|
121
231
|
if (!match) {
|
|
122
232
|
return null
|
|
123
233
|
}
|
|
@@ -126,22 +236,22 @@ export function parseDirective(
|
|
|
126
236
|
let [, , contents] = match
|
|
127
237
|
|
|
128
238
|
const parsed: GFF3Directive = { directive: name }
|
|
129
|
-
if (contents
|
|
130
|
-
contents = contents
|
|
239
|
+
if (contents!.length) {
|
|
240
|
+
contents = contents!.replace(lineEndRegex, '')
|
|
131
241
|
parsed.value = contents
|
|
132
242
|
}
|
|
133
243
|
|
|
134
244
|
// do a little additional parsing for sequence-region and genome-build directives
|
|
135
245
|
if (name === 'sequence-region') {
|
|
136
|
-
const c = contents
|
|
246
|
+
const c = contents!.split(whitespaceRegex, 3)
|
|
137
247
|
return {
|
|
138
248
|
...parsed,
|
|
139
249
|
seq_id: c[0],
|
|
140
|
-
start: c[1]?.replaceAll(
|
|
141
|
-
end: c[2]?.replaceAll(
|
|
250
|
+
start: c[1]?.replaceAll(nonDigitRegex, ''),
|
|
251
|
+
end: c[2]?.replaceAll(nonDigitRegex, ''),
|
|
142
252
|
} as GFF3SequenceRegionDirective
|
|
143
253
|
} else if (name === 'genome-build') {
|
|
144
|
-
const [source, buildName] = contents
|
|
254
|
+
const [source, buildName] = contents!.split(whitespaceRegex, 2)
|
|
145
255
|
return {
|
|
146
256
|
...parsed,
|
|
147
257
|
source,
|
|
@@ -160,22 +270,12 @@ export function parseDirective(
|
|
|
160
270
|
*/
|
|
161
271
|
export function formatAttributes(attrs: GFF3Attributes): string {
|
|
162
272
|
const attrOrder: string[] = []
|
|
163
|
-
|
|
273
|
+
for (const [tag, val] of Object.entries(attrs)) {
|
|
164
274
|
if (!val) {
|
|
165
|
-
|
|
275
|
+
continue
|
|
166
276
|
}
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
valstring = escape(val.toString())
|
|
170
|
-
// } else if (Array.isArray(val.values)) {
|
|
171
|
-
// valstring = val.values.map(escape).join(',')
|
|
172
|
-
} else if (Array.isArray(val)) {
|
|
173
|
-
valstring = val.map(escape).join(',')
|
|
174
|
-
} else {
|
|
175
|
-
valstring = escape(val)
|
|
176
|
-
}
|
|
177
|
-
attrOrder.push(`${escape(tag)}=${valstring}`)
|
|
178
|
-
})
|
|
277
|
+
attrOrder.push(`${escape(tag)}=${val.map(escape).join(',')}`)
|
|
278
|
+
}
|
|
179
279
|
return attrOrder.length ? attrOrder.join(';') : '.'
|
|
180
280
|
}
|
|
181
281
|
|
|
@@ -184,6 +284,7 @@ function _formatSingleFeature(
|
|
|
184
284
|
seenFeature: Record<string, boolean | undefined>,
|
|
185
285
|
) {
|
|
186
286
|
const attrString =
|
|
287
|
+
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
|
|
187
288
|
f.attributes === null || f.attributes === undefined
|
|
188
289
|
? '.'
|
|
189
290
|
: formatAttributes(f.attributes)
|
|
@@ -366,7 +467,9 @@ function _isFeatureLineWithRefs(
|
|
|
366
467
|
featureLine: GFF3FeatureLine | GFF3FeatureLineWithRefs,
|
|
367
468
|
): featureLine is GFF3FeatureLineWithRefs {
|
|
368
469
|
return (
|
|
470
|
+
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
|
|
369
471
|
(featureLine as GFF3FeatureLineWithRefs).child_features !== undefined &&
|
|
472
|
+
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
|
|
370
473
|
(featureLine as GFF3FeatureLineWithRefs).derived_features !== undefined
|
|
371
474
|
)
|
|
372
475
|
}
|