gff-nostream 1.3.9 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/util.ts CHANGED
@@ -1,14 +1,28 @@
1
1
  // Fast, low-level functions for parsing and formatting GFF3.
2
2
  // JavaScript port of Robert Buels's Bio::GFF3::LowLevel Perl module.
3
3
 
4
+ const escapeRegex = /%([0-9A-Fa-f]{2})/g
5
+ const directiveRegex = /^\s*##\s*(\S+)\s*(.*)/
6
+ const lineEndRegex = /\r?\n$/
7
+ const whitespaceRegex = /\s+/
8
+ const nonDigitRegex = /\D/g
9
+ // eslint-disable-next-line no-control-regex
10
+ const attrEscapeRegex = /[\n;\r\t=%&,\u0000-\u001f\u007f-\u00ff]/g
11
+ // eslint-disable-next-line no-control-regex
12
+ const columnEscapeRegex = /[\n\r\t%\u0000-\u001f\u007f-\u00ff]/g
13
+
4
14
  /**
5
15
  * Unescape a string value used in a GFF3 attribute.
6
16
  *
7
17
  * @param stringVal - Escaped GFF3 string value
8
18
  * @returns An unescaped string value
9
19
  */
20
+
10
21
  export function unescape(stringVal: string): string {
11
- return stringVal.replaceAll(/%([0-9A-Fa-f]{2})/g, (_match, seq) =>
22
+ if (!stringVal.includes('%')) {
23
+ return stringVal
24
+ }
25
+ return stringVal.replaceAll(escapeRegex, (_match, seq) =>
12
26
  String.fromCharCode(parseInt(seq, 16)),
13
27
  )
14
28
  }
@@ -27,7 +41,7 @@ function _escape(regex: RegExp, s: string | number) {
27
41
  * @returns An escaped string value
28
42
  */
29
43
  export function escape(rawVal: string | number): string {
30
- return _escape(/[\n;\r\t=%&,\u0000-\u001f\u007f-\u00ff]/g, rawVal)
44
+ return _escape(attrEscapeRegex, rawVal)
31
45
  }
32
46
 
33
47
  /**
@@ -37,7 +51,7 @@ export function escape(rawVal: string | number): string {
37
51
  * @returns An escaped column value
38
52
  */
39
53
  export function escapeColumn(rawVal: string | number): string {
40
- return _escape(/[\n\r\t%\u0000-\u001f\u007f-\u00ff]/g, rawVal)
54
+ return _escape(columnEscapeRegex, rawVal)
41
55
  }
42
56
 
43
57
  /**
@@ -53,29 +67,75 @@ export function parseAttributes(attrString: string): GFF3Attributes {
53
67
 
54
68
  const attrs: GFF3Attributes = {}
55
69
 
56
- attrString
57
- .replace(/\r?\n$/, '')
58
- .split(';')
59
- .forEach(a => {
60
- const nv = a.split('=', 2)
61
- if (!nv[1]?.length) {
62
- return
63
- }
64
-
65
- nv[0] = nv[0].trim()
66
- let arec = attrs[nv[0].trim()]
67
- if (!arec) {
68
- arec = []
69
- attrs[nv[0]] = arec
70
- }
71
-
72
- arec.push(
73
- ...nv[1]
74
- .split(',')
75
- .map(s => s.trim())
76
- .map(unescape),
77
- )
78
- })
70
+ let str = attrString
71
+ if (str.endsWith('\n')) {
72
+ str = str.slice(0, str.endsWith('\r\n') ? -2 : -1)
73
+ }
74
+
75
+ for (const a of str.split(';')) {
76
+ const eqIdx = a.indexOf('=')
77
+ if (eqIdx === -1) {
78
+ continue
79
+ }
80
+ const value = a.slice(eqIdx + 1)
81
+ if (!value.length) {
82
+ continue
83
+ }
84
+
85
+ const tag = a.slice(0, eqIdx).trim()
86
+ let arec = attrs[tag]
87
+ if (!arec) {
88
+ arec = []
89
+ attrs[tag] = arec
90
+ }
91
+
92
+ for (const s of value.split(',')) {
93
+ arec.push(unescape(s.trim()))
94
+ }
95
+ }
96
+ return attrs
97
+ }
98
+
99
/**
 * Parse the 9th column (attributes) of a GFF3 feature line without unescaping.
 * Fast path for data known to contain no escaped characters.
 *
 * Tags are looked up and stored strictly as own properties of the result, so
 * a tag that shares its name with an `Object.prototype` member
 * (`constructor`, `toString`, `__proto__`, ...) is parsed like any other tag
 * instead of throwing or altering the result object's prototype.
 *
 * @param attrString - String of GFF3 9th column
 * @returns Parsed attributes
 */
export function parseAttributesNoUnescape(attrString: string): GFF3Attributes {
  if (!attrString.length || attrString === '.') {
    return {}
  }

  const attrs: GFF3Attributes = {}

  // strip a single trailing line terminator (\n or \r\n)
  let str = attrString
  if (str.endsWith('\n')) {
    str = str.slice(0, str.endsWith('\r\n') ? -2 : -1)
  }

  for (const a of str.split(';')) {
    const eqIdx = a.indexOf('=')
    if (eqIdx === -1) {
      continue
    }
    // everything after the first '=' is the value, further '=' included
    const value = a.slice(eqIdx + 1)
    if (!value.length) {
      continue
    }

    const tag = a.slice(0, eqIdx).trim()
    // own-property check: a plain `attrs[tag]` would also find inherited
    // members such as Object.prototype.constructor
    let arec = Object.prototype.hasOwnProperty.call(attrs, tag)
      ? attrs[tag]
      : undefined
    if (!arec) {
      arec = []
      // defineProperty rather than assignment so that even '__proto__'
      // becomes an ordinary enumerable own property
      Object.defineProperty(attrs, tag, {
        value: arec,
        writable: true,
        enumerable: true,
        configurable: true,
      })
    }

    for (const s of value.split(',')) {
      arec.push(s.trim())
    }
  }
  return attrs
}
81
141
 
@@ -86,22 +146,72 @@ export function parseAttributes(attrString: string): GFF3Attributes {
86
146
  * @returns The parsed feature
87
147
  */
88
148
  export function parseFeature(line: string): GFF3FeatureLine {
89
- // split the line into columns and replace '.' with null in each column
90
- const f = line.split('\t').map(a => (a === '.' || a === '' ? null : a))
91
-
92
- // unescape only the ref, source, and type columns
93
- const parsed: GFF3FeatureLine = {
94
- seq_id: f[0] && unescape(f[0]),
95
- source: f[1] && unescape(f[1]),
96
- type: f[2] && unescape(f[2]),
97
- start: f[3] === null ? null : parseInt(f[3], 10),
98
- end: f[4] === null ? null : parseInt(f[4], 10),
99
- score: f[5] === null ? null : parseFloat(f[5]),
100
- strand: f[6],
101
- phase: f[7],
102
- attributes: f[8] === null ? null : parseAttributes(f[8]),
149
+ return parseFieldsArray(line.split('\t'))
150
+ }
151
+
152
/**
 * Normalize a raw GFF3 column value: '.', the empty string and undefined
 * all become null; any other value is returned unchanged.
 *
 * @param a - Raw column value
 * @returns The column value, or null when the column is empty
 */
function norm(a: string | null | undefined) {
  return a === '.' || a === '' || a === undefined ? null : a
}

/**
 * Parse a GFF3 feature from a pre-split fields array
 *
 * The seq_id, source and type columns are unescaped; start and end are parsed
 * as base-10 integers and score as a float; strand and phase are passed
 * through as-is; the attributes column is parsed with parseAttributes.
 *
 * @param f - Array of 9 GFF3 column values (use null or '.' for empty values)
 * @returns The parsed feature
 */
export function parseFieldsArray(f: (string | null | undefined)[]): GFF3FeatureLine {
  const seq_id = norm(f[0])
  const source = norm(f[1])
  const type = norm(f[2])
  const start = norm(f[3])
  const end = norm(f[4])
  const score = norm(f[5])
  const strand = norm(f[6])
  const phase = norm(f[7])
  const attrString = norm(f[8])

  return {
    seq_id: seq_id ? unescape(seq_id) : null,
    source: source ? unescape(source) : null,
    type: type ? unescape(type) : null,
    start: start === null ? null : parseInt(start, 10),
    end: end === null ? null : parseInt(end, 10),
    score: score === null ? null : parseFloat(score),
    strand,
    phase,
    attributes: attrString === null ? null : parseAttributes(attrString),
  }
}
185
+
186
+ /**
187
+ * Parse a GFF3 feature from a pre-split fields array without unescaping.
188
+ * Fast path for data known to contain no escaped characters.
189
+ *
190
+ * @param f - Array of 9 GFF3 column values (use null or '.' for empty values)
191
+ * @returns The parsed feature
192
+ */
193
+ export function parseFieldsArrayNoUnescape(f: (string | null | undefined)[]): GFF3FeatureLine {
194
+ const seq_id = norm(f[0])
195
+ const source = norm(f[1])
196
+ const type = norm(f[2])
197
+ const start = norm(f[3])
198
+ const end = norm(f[4])
199
+ const score = norm(f[5])
200
+ const strand = norm(f[6])
201
+ const phase = norm(f[7])
202
+ const attrString = norm(f[8])
203
+
204
+ return {
205
+ seq_id,
206
+ source,
207
+ type,
208
+ start: start === null ? null : parseInt(start, 10),
209
+ end: end === null ? null : parseInt(end, 10),
210
+ score: score === null ? null : parseFloat(score),
211
+ strand,
212
+ phase,
213
+ attributes: attrString === null ? null : parseAttributesNoUnescape(attrString),
103
214
  }
104
- return parsed
105
215
  }
106
216
 
107
217
  /**
@@ -117,7 +227,7 @@ export function parseDirective(
117
227
  | GFF3SequenceRegionDirective
118
228
  | GFF3GenomeBuildDirective
119
229
  | null {
120
- const match = /^\s*##\s*(\S+)\s*(.*)/.exec(line)
230
+ const match = directiveRegex.exec(line)
121
231
  if (!match) {
122
232
  return null
123
233
  }
@@ -126,22 +236,22 @@ export function parseDirective(
126
236
  let [, , contents] = match
127
237
 
128
238
  const parsed: GFF3Directive = { directive: name }
129
- if (contents.length) {
130
- contents = contents.replace(/\r?\n$/, '')
239
+ if (contents!.length) {
240
+ contents = contents!.replace(lineEndRegex, '')
131
241
  parsed.value = contents
132
242
  }
133
243
 
134
244
  // do a little additional parsing for sequence-region and genome-build directives
135
245
  if (name === 'sequence-region') {
136
- const c = contents.split(/\s+/, 3)
246
+ const c = contents!.split(whitespaceRegex, 3)
137
247
  return {
138
248
  ...parsed,
139
249
  seq_id: c[0],
140
- start: c[1]?.replaceAll(/\D/g, ''),
141
- end: c[2]?.replaceAll(/\D/g, ''),
250
+ start: c[1]?.replaceAll(nonDigitRegex, ''),
251
+ end: c[2]?.replaceAll(nonDigitRegex, ''),
142
252
  } as GFF3SequenceRegionDirective
143
253
  } else if (name === 'genome-build') {
144
- const [source, buildName] = contents.split(/\s+/, 2)
254
+ const [source, buildName] = contents!.split(whitespaceRegex, 2)
145
255
  return {
146
256
  ...parsed,
147
257
  source,
@@ -160,22 +270,12 @@ export function parseDirective(
160
270
  */
161
271
  export function formatAttributes(attrs: GFF3Attributes): string {
162
272
  const attrOrder: string[] = []
163
- Object.entries(attrs).forEach(([tag, val]) => {
273
+ for (const [tag, val] of Object.entries(attrs)) {
164
274
  if (!val) {
165
- return
275
+ continue
166
276
  }
167
- let valstring
168
- if (val.hasOwnProperty('toString')) {
169
- valstring = escape(val.toString())
170
- // } else if (Array.isArray(val.values)) {
171
- // valstring = val.values.map(escape).join(',')
172
- } else if (Array.isArray(val)) {
173
- valstring = val.map(escape).join(',')
174
- } else {
175
- valstring = escape(val)
176
- }
177
- attrOrder.push(`${escape(tag)}=${valstring}`)
178
- })
277
+ attrOrder.push(`${escape(tag)}=${val.map(escape).join(',')}`)
278
+ }
179
279
  return attrOrder.length ? attrOrder.join(';') : '.'
180
280
  }
181
281
 
@@ -184,6 +284,7 @@ function _formatSingleFeature(
184
284
  seenFeature: Record<string, boolean | undefined>,
185
285
  ) {
186
286
  const attrString =
287
+ // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
187
288
  f.attributes === null || f.attributes === undefined
188
289
  ? '.'
189
290
  : formatAttributes(f.attributes)
@@ -366,7 +467,9 @@ function _isFeatureLineWithRefs(
366
467
  featureLine: GFF3FeatureLine | GFF3FeatureLineWithRefs,
367
468
  ): featureLine is GFF3FeatureLineWithRefs {
368
469
  return (
470
+ // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
369
471
  (featureLine as GFF3FeatureLineWithRefs).child_features !== undefined &&
472
+ // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
370
473
  (featureLine as GFF3FeatureLineWithRefs).derived_features !== undefined
371
474
  )
372
475
  }