gff-nostream 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/parse.ts ADDED
@@ -0,0 +1,400 @@
1
+ import * as GFF3 from './util'
2
+
3
// For each reference attribute, the name of the property on the referenced
// feature line that collects the features pointing at it.
const containerAttributes = {
  Parent: 'child_features',
  Derives_from: 'derived_features',
} as const
7
+
8
/**
 * Incremental parser for FASTA-formatted sequence lines.
 *
 * Feed lines one at a time with `addLine`. A record is handed to
 * `seqCallback` when the next `>` definition line arrives or when `finish`
 * is called. Each record is emitted exactly once.
 */
export class FASTAParser {
  /** Receives every completed sequence record. */
  seqCallback: (sequence: GFF3.GFF3Sequence) => void
  /** The record currently being accumulated, if a definition line was seen. */
  currentSequence:
    | { id: string; sequence: string; description?: string }
    | undefined

  /**
   * @param seqCallback - invoked once for every completed sequence record
   */
  constructor(seqCallback: (sequence: GFF3.GFF3Sequence) => void) {
    this.seqCallback = seqCallback
    this.currentSequence = undefined
  }

  /**
   * Consume one line of FASTA text.
   *
   * A line starting with `>` closes the record in progress and opens a new
   * one; the first whitespace-delimited token is the id and the trimmed
   * remainder (if any) is the description. Any other non-blank line has its
   * whitespace removed and is appended to the current record. Lines seen
   * before the first definition line are ignored.
   *
   * @param line - a single line of input, with or without its line ending
   */
  addLine(line: string): void {
    const defMatch = /^>\s*(\S+)\s*(.*)/.exec(line)
    if (defMatch) {
      this._flush()
      const record: { id: string; sequence: string; description?: string } = {
        id: defMatch[1],
        sequence: '',
      }
      if (defMatch[2]) {
        record.description = defMatch[2].trim()
      }
      this.currentSequence = record
    } else if (this.currentSequence && /\S/.test(line)) {
      // global-regex replace strips every whitespace character, including
      // embedded spaces and the trailing line ending
      this.currentSequence.sequence += line.replace(/\s/g, '')
    }
  }

  /**
   * Emit the record in progress, if there is one, and forget it so that a
   * later flush cannot hand the same record to the callback again.
   */
  private _flush() {
    const completed = this.currentSequence
    if (completed) {
      // clear before calling out so the state is consistent even if the
      // callback throws
      this.currentSequence = undefined
      this.seqCallback(completed)
    }
  }

  /**
   * Signal end of input: emits the final record, if any. Calling this more
   * than once is harmless.
   */
  finish(): void {
    this._flush()
  }
}
42
+
43
/**
 * Options accepted by the `Parser` constructor. Every member is optional;
 * callbacks that are not supplied are replaced by a no-op.
 */
interface ParserArgs {
  /** Number of lines to buffer. The parser uses 1000 when this is undefined. */
  bufferSize?: number
  /** When true, `Derives_from` attributes are not treated as references. */
  disableDerivesFromReferences?: boolean

  /** Receives each emitted feature. */
  featureCallback?(feature: GFF3.GFF3Feature): void
  /** Receives each emitted directive (other than the FASTA switch-over). */
  directiveCallback?(directive: GFF3.GFF3Directive): void
  /** Receives each emitted comment. */
  commentCallback?(comment: GFF3.GFF3Comment): void
  /** Receives each sequence record read from a FASTA section. */
  sequenceCallback?(sequence: GFF3.GFF3Sequence): void
  /** Receives non-fatal parse errors, formatted as `<lineNumber>: <message>`. */
  errorCallback?(error: string): void
  /** Called at the end of `Parser.finish()`. */
  endCallback?(): void
}
53
+
54
/**
 * Features that refer to an ID the parser has not seen yet, grouped by the
 * attribute through which they refer to it.
 */
interface References {
  /** Orphans whose `Parent` attribute names the awaited ID. */
  Parent: Array<GFF3.GFF3Feature>
  /** Orphans whose `Derives_from` attribute names the awaited ID. */
  Derives_from: Array<GFF3.GFF3Feature>
}
58
+
59
/**
 * Incremental, line-oriented GFF3 parser.
 *
 * Lines are pushed in with `addLine`; parsed items come back out through the
 * callbacks given to the constructor. Features that carry an ID or refer to
 * another feature are held back until a `###` sync directive, the start of a
 * FASTA section, `finish()`, or buffer pressure forces them out, so that
 * child and derived features can be attached to them first.
 */
export default class Parser {
  featureCallback: (feature: GFF3.GFF3Feature) => void
  endCallback: () => void
  commentCallback: (comment: GFF3.GFF3Comment) => void
  errorCallback: (error: string) => void
  disableDerivesFromReferences: boolean
  directiveCallback: (directive: GFF3.GFF3Directive) => void
  sequenceCallback: (sequence: GFF3.GFF3Sequence) => void
  bufferSize: number
  // set once the input has switched over to FASTA; from then on every line
  // is delegated to it
  fastaParser: FASTAParser | undefined = undefined
  // if this is true, the parser ignores the rest of the lines in the file.
  // set when the file switches over to FASTA and when a parse error is
  // reported through errorCallback
  eof = false
  lineNumber = 0
  // features that we have to keep on hand for now because they
  // might be referenced by something else
  private _underConstructionTopLevel: GFF3.GFF3Feature[] = []
  // index of the above by ID
  private _underConstructionById = Parser._emptyIndex<GFF3.GFF3Feature>()
  // _completedReferences[id][`${attribute},${toId}`] is true once the
  // reference from `id` to `toId` has been attached
  private _completedReferences =
    Parser._emptyIndex<Record<string, boolean | undefined>>()
  // features that reference something we have not seen yet
  // structured as:
  // {  'some_id' : {
  //      'Parent' : [ orphans that have a Parent attr referencing it ],
  //      'Derives_from' : [ orphans that have a Derives_from attr referencing it ],
  //    }
  // }
  private _underConstructionOrphans = Parser._emptyIndex<References>()

  /**
   * @param args - callbacks and options; may be omitted entirely, in which
   *   case all callbacks are no-ops and 1000 lines are buffered
   */
  constructor(args: ParserArgs = {}) {
    // eslint-disable-next-line @typescript-eslint/no-empty-function
    const nullFunc = () => {}
    // the default only applies when bufferSize is undefined
    const { bufferSize = 1000 } = args

    this.featureCallback = args.featureCallback || nullFunc
    this.endCallback = args.endCallback || nullFunc
    this.commentCallback = args.commentCallback || nullFunc
    this.errorCallback = args.errorCallback || nullFunc
    this.directiveCallback = args.directiveCallback || nullFunc
    this.sequenceCallback = args.sequenceCallback || nullFunc
    this.disableDerivesFromReferences =
      args.disableDerivesFromReferences || false

    // number of lines to buffer
    this.bufferSize = bufferSize
  }

  /**
   * Build an empty string-keyed lookup table with no prototype, so that IDs
   * taken from the input (e.g. `constructor`, `__proto__`) can never collide
   * with members inherited from `Object.prototype`.
   */
  private static _emptyIndex<T>(): Record<string, T | undefined> {
    return Object.create(null) as Record<string, T | undefined>
  }

  /**
   * Consume one line of input.
   *
   * @param line - a single line of the file
   * @throws Error if the line cannot be classified, or if a `###` directive
   *   or the start of a FASTA section is reached while some buffered
   *   features still reference IDs that were never seen
   */
  addLine(line: string): void {
    // if we have transitioned to a fasta section, just delegate to that parser
    if (this.fastaParser) {
      this.fastaParser.addLine(line)
      return
    }
    // otherwise, if we are done, ignore this line
    if (this.eof) {
      return
    }

    this.lineNumber += 1

    if (/^\s*[^#\s>]/.test(line)) {
      // feature line, most common case
      this._bufferLine(line)
      return
    }

    const match = /^\s*(#+)(.*)/.exec(line)
    if (match) {
      // directive or comment, told apart by the number of leading '#'
      const [, hashsigns, contents] = match

      if (hashsigns.length === 3) {
        // sync directive, all forward-references are resolved.
        this._emitAllUnderConstructionFeatures()
      } else if (hashsigns.length === 2) {
        // a '##' line that parseDirective does not recognise is dropped
        const directive = GFF3.parseDirective(line)
        if (directive) {
          if (directive.directive === 'FASTA') {
            this._startFastaSection()
          } else {
            this._emitItem(directive)
          }
        }
      } else {
        // one '#', or four and more: a comment, minus its leading whitespace
        this._emitItem({ comment: contents.replace(/\s*/, '') })
      }
    } else if (/^\s*$/.test(line)) {
      // blank line, do nothing
    } else if (/^\s*>/.test(line)) {
      // implicit beginning of a FASTA section; this line is already its
      // first definition line
      this._startFastaSection().addLine(line)
    } else {
      // it's a parse error
      const errLine = line.replace(/\r?\n?$/g, '')
      throw new Error(`GFF3 parse error. Cannot parse '${errLine}'.`)
    }
  }

  /**
   * Signal end of input: emits everything still buffered, flushes the FASTA
   * parser if one is active, then calls `endCallback`.
   *
   * @throws Error if buffered features reference IDs that were never seen
   */
  finish(): void {
    this._emitAllUnderConstructionFeatures()
    if (this.fastaParser) {
      this.fastaParser.finish()
    }
    this.endCallback()
  }

  /**
   * Emit everything buffered so far, stop treating input as GFF3 and create
   * the FASTA parser that all following lines are delegated to.
   */
  private _startFastaSection(): FASTAParser {
    this._emitAllUnderConstructionFeatures()
    this.eof = true
    const fastaParser = new FASTAParser(this.sequenceCallback)
    this.fastaParser = fastaParser
    return fastaParser
  }

  /**
   * Route an item to the callback for its kind: arrays are features,
   * objects with a `directive` key are directives and objects with a
   * `comment` key are comments.
   */
  private _emitItem(
    i: GFF3.GFF3Feature | GFF3.GFF3Directive | GFF3.GFF3Comment,
  ) {
    if (Array.isArray(i)) {
      this.featureCallback(i)
    } else if ('directive' in i) {
      this.directiveCallback(i)
    } else if ('comment' in i) {
      this.commentCallback(i)
    }
  }

  /**
   * Drop a feature that has been emitted, and everything attached to it,
   * from the by-ID index and the completed-reference bookkeeping.
   */
  private _unbufferItem(item?: GFF3.GFF3Feature) {
    if (item && Array.isArray(item) && item[0].attributes?.ID?.[0]) {
      for (const id of item[0].attributes.ID) {
        delete this._underConstructionById[id]
        delete this._completedReferences[id]
      }
      for (const location of item) {
        if (location.child_features) {
          location.child_features.forEach(c => this._unbufferItem(c))
        }
        if (location.derived_features) {
          location.derived_features.forEach(d => this._unbufferItem(d))
        }
      }
    }
  }

  /**
   * Emit the oldest buffered top-level features until the buffer, plus
   * `additionalItemCount` items about to be added, fits in `bufferSize`.
   */
  private _enforceBufferSizeLimit(additionalItemCount = 0) {
    const queue = this._underConstructionTopLevel
    // stop once the queue is empty: with a bufferSize of 0 the size
    // condition alone stays true forever and nothing is left to shift out
    while (
      queue.length > 0 &&
      queue.length + additionalItemCount > this.bufferSize
    ) {
      const item = queue.shift()
      if (item) {
        this._emitItem(item)
        this._unbufferItem(item)
      }
    }
  }

  /**
   * return all under-construction features, called when we know
   * there will be no additional data to attach to them
   *
   * @throws Error if some features reference IDs that never appeared
   */
  private _emitAllUnderConstructionFeatures() {
    for (const item of this._underConstructionTopLevel) {
      this._emitItem(item)
    }

    this._underConstructionTopLevel = []
    this._underConstructionById = Parser._emptyIndex<GFF3.GFF3Feature>()
    this._completedReferences =
      Parser._emptyIndex<Record<string, boolean | undefined>>()

    // if we have any orphans hanging around still, this is a
    // problem. die with a parse error
    const orphanedIds = Object.keys(this._underConstructionOrphans)
    if (orphanedIds.length) {
      throw new Error(
        `some features reference other features that do not exist in the file (or in the same '###' scope). ${orphanedIds.join(
          ',',
        )}`,
      )
    }
  }

  /**
   * do the right thing with a newly-parsed feature line: emit it right away
   * if nothing can refer to it, otherwise buffer it and wire up references
   * in both directions
   */
  private _bufferLine(line: string) {
    const rawFeatureLine = GFF3.parseFeature(line)
    const featureLine: GFF3.GFF3FeatureLineWithRefs = {
      ...rawFeatureLine,
      child_features: [],
      derived_features: [],
    }
    // featureLine._lineNumber = this.lineNumber //< debugging aid

    // NOTE: a feature is an arrayref of one or more feature lines.
    const ids = featureLine.attributes?.ID || []
    const parents = featureLine.attributes?.Parent || []
    const derives = this.disableDerivesFromReferences
      ? []
      : featureLine.attributes?.Derives_from || []

    if (!ids.length && !parents.length && !derives.length) {
      // if it has no IDs and does not refer to anything, we can just
      // output it
      this._emitItem([featureLine])
      return
    }

    let feature: GFF3.GFF3Feature | undefined
    for (const id of ids) {
      const existing = this._underConstructionById[id]
      if (existing) {
        // another location of the same feature
        const previousType = existing[existing.length - 1].type
        if (previousType !== featureLine.type) {
          // reported through errorCallback; the line is still attached
          this._parseError(
            `multi-line feature "${id}" has inconsistent types: "${featureLine.type}", "${previousType}"`,
          )
        }
        existing.push(featureLine)
        feature = existing
      } else {
        // haven't seen it yet, so buffer it so we can attach
        // child features to it
        feature = [featureLine]

        this._enforceBufferSizeLimit(1)
        if (!parents.length && !derives.length) {
          this._underConstructionTopLevel.push(feature)
        }
        this._underConstructionById[id] = feature

        // see if we have anything buffered that refers to it
        this._resolveReferencesTo(feature, id)
      }
    }

    // try to resolve all its references
    this._resolveReferencesFrom(
      feature ?? [featureLine],
      { Parent: parents, Derives_from: derives },
      ids,
    )
  }

  /**
   * Attach every orphan that was waiting for `id` to each location of the
   * newly seen `feature`, then forget those orphans.
   */
  private _resolveReferencesTo(feature: GFF3.GFF3Feature, id: string) {
    const references = this._underConstructionOrphans[id]
    // references is of the form
    // {
    //   'Parent' : [ orphans that have a Parent attr referencing this feature ],
    //   'Derives_from' : [ orphans that have a Derives_from attr referencing this feature ],
    // }
    if (!references) {
      return
    }
    for (const loc of feature) {
      loc.child_features.push(...references.Parent)
      loc.derived_features.push(...references.Derives_from)
    }
    delete this._underConstructionOrphans[id]
  }

  /**
   * Report a non-fatal problem through `errorCallback`, prefixed with the
   * current line number, and make the parser ignore all further lines.
   */
  private _parseError(message: string) {
    this.eof = true
    this.errorCallback(`${this.lineNumber}: ${message}`)
  }

  /**
   * Record that the reference `key` made by feature `id` has been handled.
   *
   * @returns whether it had already been recorded before this call
   */
  private _markReferenceCompleted(id: string, key: string): boolean {
    let completed = this._completedReferences[id]
    if (!completed) {
      completed = {}
      this._completedReferences[id] = completed
    }
    const alreadyDone = completed[key] || false
    completed[key] = true
    return alreadyDone
  }

  /**
   * Wire `feature` to everything it refers to. A target that is already
   * buffered gets `feature` added to each of its locations (once per
   * feature, however many lines the feature spans); a target that has not
   * been seen yet gets `feature` parked as an orphan under its ID.
   *
   * @param feature - the feature making the references
   * @param references - the IDs named by its Parent / Derives_from attributes
   * @param ids - the feature's own IDs
   */
  private _resolveReferencesFrom(
    feature: GFF3.GFF3Feature,
    references: { Parent: string[]; Derives_from: string[] },
    ids: string[],
  ) {
    // all Parent references are handled before any Derives_from reference
    for (const attrName of ['Parent', 'Derives_from'] as const) {
      const container = containerAttributes[attrName]
      for (const toId of references[attrName]) {
        const otherFeature = this._underConstructionById[toId]
        if (otherFeature) {
          // every id must be marked, so do not stop at the first id that
          // was already marked
          let alreadyAttached = false
          for (const id of ids) {
            if (this._markReferenceCompleted(id, `${attrName},${toId}`)) {
              alreadyAttached = true
            }
          }
          if (!alreadyAttached) {
            for (const location of otherFeature) {
              location[container].push(feature)
            }
          }
        } else {
          let ref = this._underConstructionOrphans[toId]
          if (!ref) {
            ref = {
              Parent: [],
              Derives_from: [],
            }
            this._underConstructionOrphans[toId] = ref
          }
          ref[attrName].push(feature)
        }
      }
    }
  }
}