gff-nostream 1.3.9 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/util.ts CHANGED
@@ -1,14 +1,28 @@
1
1
  // Fast, low-level functions for parsing and formatting GFF3.
2
2
  // JavaScript port of Robert Buels's Bio::GFF3::LowLevel Perl module.
3
3
 
4
+ const escapeRegex = /%([0-9A-Fa-f]{2})/g
5
+ const directiveRegex = /^\s*##\s*(\S+)\s*(.*)/
6
+ const lineEndRegex = /\r?\n$/
7
+ const whitespaceRegex = /\s+/
8
+ const nonDigitRegex = /\D/g
9
+ // eslint-disable-next-line no-control-regex
10
+ const attrEscapeRegex = /[\n;\r\t=%&,\u0000-\u001f\u007f-\u00ff]/g
11
+ // eslint-disable-next-line no-control-regex
12
+ const columnEscapeRegex = /[\n\r\t%\u0000-\u001f\u007f-\u00ff]/g
13
+
4
14
  /**
5
15
  * Unescape a percent-encoded (%XX) string value used in a GFF3 column or attribute.
6
16
  *
7
17
  * @param stringVal - Escaped GFF3 string value
8
18
  * @returns An unescaped string value
9
19
  */
20
+
10
21
  export function unescape(stringVal: string): string {
11
- return stringVal.replaceAll(/%([0-9A-Fa-f]{2})/g, (_match, seq) =>
22
+ if (!stringVal.includes('%')) {
23
+ return stringVal
24
+ }
25
+ return stringVal.replaceAll(escapeRegex, (_match, seq) =>
12
26
  String.fromCharCode(parseInt(seq, 16)),
13
27
  )
14
28
  }
@@ -27,7 +41,7 @@ function _escape(regex: RegExp, s: string | number) {
27
41
  * @returns An escaped string value
28
42
  */
29
43
  export function escape(rawVal: string | number): string {
30
- return _escape(/[\n;\r\t=%&,\u0000-\u001f\u007f-\u00ff]/g, rawVal)
44
+ return _escape(attrEscapeRegex, rawVal)
31
45
  }
32
46
 
33
47
  /**
@@ -37,7 +51,7 @@ export function escape(rawVal: string | number): string {
37
51
  * @returns An escaped column value
38
52
  */
39
53
  export function escapeColumn(rawVal: string | number): string {
40
- return _escape(/[\n\r\t%\u0000-\u001f\u007f-\u00ff]/g, rawVal)
54
+ return _escape(columnEscapeRegex, rawVal)
41
55
  }
42
56
 
43
57
  /**
@@ -53,29 +67,32 @@ export function parseAttributes(attrString: string): GFF3Attributes {
53
67
 
54
68
  const attrs: GFF3Attributes = {}
55
69
 
56
- attrString
57
- .replace(/\r?\n$/, '')
58
- .split(';')
59
- .forEach(a => {
60
- const nv = a.split('=', 2)
61
- if (!nv[1]?.length) {
62
- return
63
- }
64
-
65
- nv[0] = nv[0].trim()
66
- let arec = attrs[nv[0].trim()]
67
- if (!arec) {
68
- arec = []
69
- attrs[nv[0]] = arec
70
- }
71
-
72
- arec.push(
73
- ...nv[1]
74
- .split(',')
75
- .map(s => s.trim())
76
- .map(unescape),
77
- )
78
- })
70
+ let str = attrString
71
+ if (str.endsWith('\n')) {
72
+ str = str.slice(0, str.endsWith('\r\n') ? -2 : -1)
73
+ }
74
+
75
+ for (const a of str.split(';')) {
76
+ const eqIdx = a.indexOf('=')
77
+ if (eqIdx === -1) {
78
+ continue
79
+ }
80
+ const value = a.slice(eqIdx + 1)
81
+ if (!value.length) {
82
+ continue
83
+ }
84
+
85
+ const tag = a.slice(0, eqIdx).trim()
86
+ let arec = attrs[tag]
87
+ if (!arec) {
88
+ arec = []
89
+ attrs[tag] = arec
90
+ }
91
+
92
+ for (const s of value.split(',')) {
93
+ arec.push(unescape(s.trim()))
94
+ }
95
+ }
79
96
  return attrs
80
97
  }
81
98
 
@@ -86,22 +103,41 @@ export function parseAttributes(attrString: string): GFF3Attributes {
86
103
  * @returns The parsed feature
87
104
  */
88
105
  export function parseFeature(line: string): GFF3FeatureLine {
89
- // split the line into columns and replace '.' with null in each column
90
- const f = line.split('\t').map(a => (a === '.' || a === '' ? null : a))
91
-
92
- // unescape only the ref, source, and type columns
93
- const parsed: GFF3FeatureLine = {
94
- seq_id: f[0] && unescape(f[0]),
95
- source: f[1] && unescape(f[1]),
96
- type: f[2] && unescape(f[2]),
97
- start: f[3] === null ? null : parseInt(f[3], 10),
98
- end: f[4] === null ? null : parseInt(f[4], 10),
99
- score: f[5] === null ? null : parseFloat(f[5]),
100
- strand: f[6],
101
- phase: f[7],
102
- attributes: f[8] === null ? null : parseAttributes(f[8]),
106
+ return parseFieldsArray(line.split('\t'))
107
+ }
108
+
109
+ /**
110
+ * Parse a GFF3 feature from a pre-split fields array
111
+ *
112
+ * @param f - Array of 9 GFF3 column values ('.', '', null, or undefined are all treated as empty values)
113
+ * @returns The parsed feature
114
+ */
115
+ function norm(a: string | null | undefined) {
116
+ return a === '.' || a === '' || a === undefined ? null : a
117
+ }
118
+
119
+ export function parseFieldsArray(f: (string | null | undefined)[]): GFF3FeatureLine {
120
+ const seq_id = norm(f[0])
121
+ const source = norm(f[1])
122
+ const type = norm(f[2])
123
+ const start = norm(f[3])
124
+ const end = norm(f[4])
125
+ const score = norm(f[5])
126
+ const strand = norm(f[6])
127
+ const phase = norm(f[7])
128
+ const attrString = norm(f[8])
129
+
130
+ return {
131
+ seq_id: seq_id ? unescape(seq_id) : null,
132
+ source: source ? unescape(source) : null,
133
+ type: type ? unescape(type) : null,
134
+ start: start === null ? null : parseInt(start, 10),
135
+ end: end === null ? null : parseInt(end, 10),
136
+ score: score === null ? null : parseFloat(score),
137
+ strand,
138
+ phase,
139
+ attributes: attrString === null ? null : parseAttributes(attrString),
103
140
  }
104
- return parsed
105
141
  }
106
142
 
107
143
  /**
@@ -117,7 +153,7 @@ export function parseDirective(
117
153
  | GFF3SequenceRegionDirective
118
154
  | GFF3GenomeBuildDirective
119
155
  | null {
120
- const match = /^\s*##\s*(\S+)\s*(.*)/.exec(line)
156
+ const match = directiveRegex.exec(line)
121
157
  if (!match) {
122
158
  return null
123
159
  }
@@ -126,22 +162,22 @@ export function parseDirective(
126
162
  let [, , contents] = match
127
163
 
128
164
  const parsed: GFF3Directive = { directive: name }
129
- if (contents.length) {
130
- contents = contents.replace(/\r?\n$/, '')
165
+ if (contents!.length) {
166
+ contents = contents!.replace(lineEndRegex, '')
131
167
  parsed.value = contents
132
168
  }
133
169
 
134
170
  // do a little additional parsing for sequence-region and genome-build directives
135
171
  if (name === 'sequence-region') {
136
- const c = contents.split(/\s+/, 3)
172
+ const c = contents!.split(whitespaceRegex, 3)
137
173
  return {
138
174
  ...parsed,
139
175
  seq_id: c[0],
140
- start: c[1]?.replaceAll(/\D/g, ''),
141
- end: c[2]?.replaceAll(/\D/g, ''),
176
+ start: c[1]?.replaceAll(nonDigitRegex, ''),
177
+ end: c[2]?.replaceAll(nonDigitRegex, ''),
142
178
  } as GFF3SequenceRegionDirective
143
179
  } else if (name === 'genome-build') {
144
- const [source, buildName] = contents.split(/\s+/, 2)
180
+ const [source, buildName] = contents!.split(whitespaceRegex, 2)
145
181
  return {
146
182
  ...parsed,
147
183
  source,
@@ -160,22 +196,12 @@ export function parseDirective(
160
196
  */
161
197
  export function formatAttributes(attrs: GFF3Attributes): string {
162
198
  const attrOrder: string[] = []
163
- Object.entries(attrs).forEach(([tag, val]) => {
199
+ for (const [tag, val] of Object.entries(attrs)) {
164
200
  if (!val) {
165
- return
201
+ continue
166
202
  }
167
- let valstring
168
- if (val.hasOwnProperty('toString')) {
169
- valstring = escape(val.toString())
170
- // } else if (Array.isArray(val.values)) {
171
- // valstring = val.values.map(escape).join(',')
172
- } else if (Array.isArray(val)) {
173
- valstring = val.map(escape).join(',')
174
- } else {
175
- valstring = escape(val)
176
- }
177
- attrOrder.push(`${escape(tag)}=${valstring}`)
178
- })
203
+ attrOrder.push(`${escape(tag)}=${val.map(escape).join(',')}`)
204
+ }
179
205
  return attrOrder.length ? attrOrder.join(';') : '.'
180
206
  }
181
207
 
@@ -184,6 +210,7 @@ function _formatSingleFeature(
184
210
  seenFeature: Record<string, boolean | undefined>,
185
211
  ) {
186
212
  const attrString =
213
+ // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
187
214
  f.attributes === null || f.attributes === undefined
188
215
  ? '.'
189
216
  : formatAttributes(f.attributes)
@@ -366,7 +393,9 @@ function _isFeatureLineWithRefs(
366
393
  featureLine: GFF3FeatureLine | GFF3FeatureLineWithRefs,
367
394
  ): featureLine is GFF3FeatureLineWithRefs {
368
395
  return (
396
+ // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
369
397
  (featureLine as GFF3FeatureLineWithRefs).child_features !== undefined &&
398
+ // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
370
399
  (featureLine as GFF3FeatureLineWithRefs).derived_features !== undefined
371
400
  )
372
401
  }