gff-nostream 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +163 -0
- package/dist/api.d.ts +23 -0
- package/dist/api.js +40 -0
- package/dist/api.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.js +6 -0
- package/dist/index.js.map +1 -0
- package/dist/parse.d.ts +55 -0
- package/dist/parse.js +341 -0
- package/dist/parse.js.map +1 -0
- package/dist/util.d.ts +166 -0
- package/dist/util.js +274 -0
- package/dist/util.js.map +1 -0
- package/esm/api.d.ts +23 -0
- package/esm/api.js +42 -0
- package/esm/api.js.map +1 -0
- package/esm/index.d.ts +3 -0
- package/esm/index.js +3 -0
- package/esm/index.js.map +1 -0
- package/esm/parse.d.ts +55 -0
- package/esm/parse.js +317 -0
- package/esm/parse.js.map +1 -0
- package/esm/util.d.ts +166 -0
- package/esm/util.js +268 -0
- package/esm/util.js.map +1 -0
- package/package.json +52 -0
- package/src/api.ts +86 -0
- package/src/index.ts +12 -0
- package/src/parse.ts +400 -0
- package/src/util.ts +425 -0
package/src/parse.ts
ADDED
|
@@ -0,0 +1,400 @@
|
|
|
1
|
+
import * as GFF3 from './util'
|
|
2
|
+
|
|
3
|
+
// Maps each GFF3 reference attribute name to the property of a feature line
// that collects the features referring to it through that attribute.
const containerAttributes = {
  Parent: 'child_features',
  Derives_from: 'derived_features',
} as const
|
|
7
|
+
|
|
8
|
+
/**
 * Incremental parser for FASTA-formatted sequence records.
 *
 * Lines are supplied one at a time through `addLine`. A record is handed to
 * `seqCallback` when the next definition line arrives or when `finish` is
 * called.
 */
export class FASTAParser {
  /** Receives each completed sequence record. */
  seqCallback: (sequence: GFF3.GFF3Sequence) => void
  /** Record currently being accumulated, if a definition line has been seen. */
  currentSequence:
    | { id: string; sequence: string; description?: string }
    | undefined

  /**
   * @param seqCallback - called with each completed sequence record
   */
  constructor(seqCallback: (sequence: GFF3.GFF3Sequence) => void) {
    this.seqCallback = seqCallback
    this.currentSequence = undefined
  }

  /**
   * Consume one line of FASTA text.
   *
   * A definition line (`>id optional description`) emits the record collected
   * so far and starts a new one. Any other non-blank line is appended to the
   * current record's sequence with all whitespace removed. Lines seen before
   * the first definition line are ignored.
   *
   * @param line - a single line of input
   */
  addLine(line: string): void {
    // Leading whitespace is tolerated before '>': Parser.addLine switches to
    // FASTA mode on /^\s*>/, so an indented definition line must be
    // recognised here too, otherwise the record would be silently lost.
    const defMatch = /^\s*>\s*(\S+)\s*(.*)/.exec(line)
    if (defMatch) {
      this._flush()
      this.currentSequence = { id: defMatch[1], sequence: '' }
      if (defMatch[2]) {
        this.currentSequence.description = defMatch[2].trim()
      }
    } else if (this.currentSequence && /\S/.test(line)) {
      this.currentSequence.sequence += line.replace(/\s/g, '')
    }
  }

  /** Hand the record collected so far (if any) to `seqCallback`. */
  private _flush() {
    if (this.currentSequence) {
      this.seqCallback(this.currentSequence)
    }
  }

  /** Signal end of input: emits the last record, if there is one. */
  finish(): void {
    this._flush()
  }
}
|
|
42
|
+
|
|
43
|
+
/**
 * Options accepted by the `Parser` constructor. Every member is optional;
 * callbacks that are omitted are replaced by a no-op.
 */
interface ParserArgs {
  /** Called with each feature when it is emitted. */
  featureCallback?(feature: GFF3.GFF3Feature): void
  /** Called with each directive other than `FASTA`. */
  directiveCallback?(directive: GFF3.GFF3Directive): void
  /** Called with each comment (leading hash signs and whitespace removed). */
  commentCallback?(comment: GFF3.GFF3Comment): void
  /** Called with each record read from the FASTA section. */
  sequenceCallback?(sequence: GFF3.GFF3Sequence): void
  /** Called with a `<lineNumber>: <message>` string on a reported parse error. */
  errorCallback?(error: string): void
  /** Called at the end of `finish()`. */
  endCallback?(): void
  /**
   * Limit on the number of buffered top-level features; the oldest ones are
   * emitted when the limit would be exceeded. Defaults to 1000.
   */
  bufferSize?: number
  /** When truthy, `Derives_from` attributes are not used to link features. */
  disableDerivesFromReferences?: boolean
}
|
|
53
|
+
|
|
54
|
+
/**
 * Features waiting for a not-yet-seen target feature, grouped by the
 * attribute through which they refer to it.
 */
interface References {
  /** Features whose `Parent` attribute names the target. */
  Parent: GFF3.GFF3Feature[]
  /** Features whose `Derives_from` attribute names the target. */
  Derives_from: GFF3.GFF3Feature[]
}
|
|
58
|
+
|
|
59
|
+
/**
 * Line-oriented GFF3 parser.
 *
 * Feed input with `addLine`, one line per call, then call `finish`. Features,
 * directives, comments and FASTA sequences are reported through the callbacks
 * given to the constructor. Features that carry an `ID` are buffered so that
 * later lines can be attached to them through `Parent` / `Derives_from`
 * attributes; they are emitted on a `###` directive, at the start of a FASTA
 * section, on `finish`, or when the buffer limit is exceeded.
 */
export default class Parser {
  featureCallback: (feature: GFF3.GFF3Feature) => void
  endCallback: () => void
  commentCallback: (comment: GFF3.GFF3Comment) => void
  errorCallback: (error: string) => void
  disableDerivesFromReferences: boolean
  directiveCallback: (directive: GFF3.GFF3Directive) => void
  sequenceCallback: (sequence: GFF3.GFF3Sequence) => void
  bufferSize: number
  // set once the input has switched to its FASTA section; from then on every
  // line is delegated to it
  fastaParser: FASTAParser | undefined = undefined
  // if this is true (and there is no FASTA parser), the parser ignores the
  // rest of the lines. set when the file switches over to FASTA and when a
  // parse error is reported through errorCallback
  eof = false
  // number of lines seen before any FASTA section / end-of-file condition
  lineNumber = 0
  // features that we have to keep on hand for now because they
  // might be referenced by something else
  private _underConstructionTopLevel: GFF3.GFF3Feature[] = []
  // index of buffered features by ID
  private _underConstructionById: Record<string, GFF3.GFF3Feature | undefined> =
    {}
  // per feature ID, the set of `<attribute>,<target ID>` references that
  // have already been attached, so multi-line features are attached once
  private _completedReferences: Record<
    string,
    Record<string, boolean | undefined> | undefined
  > = {}
  // features that reference something we have not seen yet
  // structured as:
  // {  'some_id' : {
  //     'Parent' : [ orphans that have a Parent attr referencing it ],
  //     'Derives_from' : [ orphans that have a Derives_from attr referencing it ],
  //    }
  // }
  private _underConstructionOrphans: Record<string, References | undefined> = {}

  /**
   * @param args - callbacks and options; omitted callbacks default to a
   *   no-op, `bufferSize` defaults to 1000 and
   *   `disableDerivesFromReferences` to false
   */
  constructor(args: ParserArgs) {
    // eslint-disable-next-line @typescript-eslint/no-empty-function
    const nullFunc = () => {}

    this.featureCallback = args.featureCallback || nullFunc
    this.endCallback = args.endCallback || nullFunc
    this.commentCallback = args.commentCallback || nullFunc
    this.errorCallback = args.errorCallback || nullFunc
    this.directiveCallback = args.directiveCallback || nullFunc
    this.sequenceCallback = args.sequenceCallback || nullFunc
    this.disableDerivesFromReferences =
      args.disableDerivesFromReferences || false

    // maximum number of top-level features to keep buffered
    this.bufferSize = args.bufferSize === undefined ? 1000 : args.bufferSize
  }

  /**
   * Consume one line of input.
   *
   * @param line - a single line of the file
   * @throws Error if buffered features reference IDs that were never defined
   *   when a `###` directive or the start of a FASTA section is reached, or
   *   if the line matches none of the recognised forms
   */
  addLine(line: string): void {
    // if we have transitioned to a fasta section, just delegate to that parser
    if (this.fastaParser) {
      this.fastaParser.addLine(line)
      return
    }
    if (this.eof) {
      // otherwise, if we are done, ignore this line
      return
    }

    this.lineNumber += 1

    if (/^\s*[^#\s>]/.test(line)) {
      // feature line, most common case
      this._bufferLine(line)
      return
    }

    const match = /^\s*(#+)(.*)/.exec(line)
    if (match) {
      // directive or comment
      const [, hashsigns, contents] = match

      if (hashsigns.length === 3) {
        // sync directive, all forward-references are resolved.
        this._emitAllUnderConstructionFeatures()
      } else if (hashsigns.length === 2) {
        const directive = GFF3.parseDirective(line)
        if (directive) {
          if (directive.directive === 'FASTA') {
            this._startFastaSection()
          } else {
            this._emitItem(directive)
          }
        }
      } else {
        // one hash sign, or four and more: a comment. strip the whitespace
        // that follows the hash signs
        this._emitItem({ comment: contents.replace(/\s*/, '') })
      }
    } else if (/^\s*$/.test(line)) {
      // blank line, do nothing
    } else if (/^\s*>/.test(line)) {
      // implicit beginning of a FASTA section; this line is already part of it
      this._startFastaSection().addLine(line)
    } else {
      // it's a parse error
      const errLine = line.replace(/\r?\n?$/g, '')
      throw new Error(`GFF3 parse error.  Cannot parse '${errLine}'.`)
    }
  }

  /**
   * Signal end of input: emits every buffered feature, finishes the FASTA
   * parser if one is active, then calls `endCallback`.
   *
   * @throws Error if buffered features reference IDs that were never defined
   */
  finish(): void {
    this._emitAllUnderConstructionFeatures()
    if (this.fastaParser) {
      this.fastaParser.finish()
    }
    this.endCallback()
  }

  /**
   * Switch to FASTA mode: emit everything buffered, mark the GFF3 part as
   * ended and create the FASTA parser that receives all further lines.
   */
  private _startFastaSection(): FASTAParser {
    this._emitAllUnderConstructionFeatures()
    this.eof = true
    const fastaParser = new FASTAParser(this.sequenceCallback)
    this.fastaParser = fastaParser
    return fastaParser
  }

  /** Route an item to the callback matching its kind. */
  private _emitItem(
    i: GFF3.GFF3Feature | GFF3.GFF3Directive | GFF3.GFF3Comment,
  ) {
    if (Array.isArray(i)) {
      // a feature is an array of feature lines
      this.featureCallback(i)
    } else if ('directive' in i) {
      this.directiveCallback(i)
    } else if ('comment' in i) {
      this.commentCallback(i)
    }
  }

  /**
   * Emit the oldest buffered top-level features until the buffer, plus
   * `additionalItemCount` items about to be added, fits within `bufferSize`.
   */
  private _enforceBufferSizeLimit(additionalItemCount = 0) {
    // drop the ID index entries of a feature and, recursively, of everything
    // attached to it
    const _unbufferItem = (item?: GFF3.GFF3Feature) => {
      if (item && Array.isArray(item) && item[0].attributes?.ID?.[0]) {
        const ids = item[0].attributes.ID
        ids.forEach(id => {
          delete this._underConstructionById[id]
          delete this._completedReferences[id]
        })
        item.forEach(i => {
          if (i.child_features) {
            i.child_features.forEach(c => _unbufferItem(c))
          }
          if (i.derived_features) {
            i.derived_features.forEach(d => _unbufferItem(d))
          }
        })
      }
    }

    // The emptiness check is required for termination: once nothing is left
    // to evict the size test can no longer change, so without it a
    // bufferSize smaller than additionalItemCount (e.g. 0) would spin forever.
    while (
      this._underConstructionTopLevel.length > 0 &&
      this._underConstructionTopLevel.length + additionalItemCount >
        this.bufferSize
    ) {
      const item = this._underConstructionTopLevel.shift()
      if (item) {
        this._emitItem(item)
        _unbufferItem(item)
      }
    }
  }

  /**
   * return all under-construction features, called when we know
   * there will be no additional data to attach to them
   *
   * @throws Error if some features still reference IDs that were never seen
   */
  private _emitAllUnderConstructionFeatures() {
    this._underConstructionTopLevel.forEach(this._emitItem.bind(this))

    this._underConstructionTopLevel = []
    this._underConstructionById = {}
    this._completedReferences = {}

    // if we have any orphans hanging around still, this is a
    // problem. die with a parse error
    const orphanIds = Object.keys(this._underConstructionOrphans)
    if (orphanIds.length) {
      throw new Error(
        `some features reference other features that do not exist in the file (or in the same '###' scope). ${orphanIds.join(
          ',',
        )}`,
      )
    }
  }

  // do the right thing with a newly-parsed feature line
  private _bufferLine(line: string) {
    const rawFeatureLine = GFF3.parseFeature(line)
    const featureLine: GFF3.GFF3FeatureLineWithRefs = {
      ...rawFeatureLine,
      child_features: [],
      derived_features: [],
    }
    // featureLine._lineNumber = this.lineNumber //< debugging aid

    // NOTE: a feature is an arrayref of one or more feature lines.
    const ids = featureLine.attributes?.ID || []
    const parents = featureLine.attributes?.Parent || []
    const derives = this.disableDerivesFromReferences
      ? []
      : featureLine.attributes?.Derives_from || []

    if (!ids.length && !parents.length && !derives.length) {
      // if it has no IDs and does not refer to anything, we can just
      // output it
      this._emitItem([featureLine])
      return
    }

    let feature: GFF3.GFF3Feature | undefined = undefined
    ids.forEach(id => {
      const existing = this._underConstructionById[id]
      if (existing) {
        // another location of the same feature
        if (existing[existing.length - 1].type !== featureLine.type) {
          // reported through errorCallback; the line is still appended below
          this._parseError(
            `multi-line feature "${id}" has inconsistent types: "${
              featureLine.type
            }", "${existing[existing.length - 1].type}"`,
          )
        }
        existing.push(featureLine)
        feature = existing
      } else {
        // haven't seen it yet, so buffer it so we can attach
        // child features to it
        feature = [featureLine]

        this._enforceBufferSizeLimit(1)
        if (!parents.length && !derives.length) {
          // only features that refer to nothing are top-level
          this._underConstructionTopLevel.push(feature)
        }
        this._underConstructionById[id] = feature

        // see if we have anything buffered that refers to it
        this._resolveReferencesTo(feature, id)
      }
    })

    // try to resolve all its references
    this._resolveReferencesFrom(
      feature || [featureLine],
      { Parent: parents, Derives_from: derives },
      ids,
    )
  }

  /**
   * Attach to `feature` every orphan that was waiting for a feature with the
   * given ID, then forget those orphans.
   */
  private _resolveReferencesTo(feature: GFF3.GFF3Feature, id: string) {
    const references = this._underConstructionOrphans[id]
    // references is of the form
    // {
    //   'Parent' : [ orphans that have a Parent attr referencing this feature ],
    //   'Derives_from' : [ orphans that have a Derives_from attr referencing this feature ],
    // }
    if (!references) {
      return
    }
    feature.forEach(loc => {
      loc.child_features.push(...references.Parent)
    })
    feature.forEach(loc => {
      loc.derived_features.push(...references.Derives_from)
    })
    delete this._underConstructionOrphans[id]
  }

  /**
   * Report a parse error through `errorCallback`, prefixed with the current
   * line number, and stop accepting further lines.
   */
  private _parseError(message: string) {
    this.eof = true
    this.errorCallback(`${this.lineNumber}: ${message}`)
  }

  /**
   * Attach `feature` to every feature it refers to through `Parent` and
   * `Derives_from`. Targets that are not buffered (yet) get the feature
   * recorded as an orphan under the target ID instead.
   *
   * @param feature - the feature holding the references
   * @param references - target IDs, per reference attribute
   * @param ids - the IDs of `feature` itself
   */
  private _resolveReferencesFrom(
    feature: GFF3.GFF3Feature,
    references: { Parent: string[]; Derives_from: string[] },
    ids: string[],
  ) {
    // mark obj[slot1][slot2] as set and return whether it already was
    function postSet(
      obj: Record<string, Record<string, boolean | undefined> | undefined>,
      slot1: string,
      slot2: string,
    ) {
      let subObj = obj[slot1]
      if (!subObj) {
        subObj = {}
        obj[slot1] = subObj
      }
      const returnVal = subObj[slot2] || false
      subObj[slot2] = true
      return returnVal
    }

    // resolve one reference of the given kind to the feature with ID `toId`
    const resolve = (kind: 'Parent' | 'Derives_from', toId: string) => {
      const otherFeature = this._underConstructionById[toId]
      if (otherFeature) {
        const pname = containerAttributes[kind]
        // filter (rather than some) so the reference is recorded for every
        // one of our IDs, even after one is found to be already recorded
        const alreadyAttached = ids.filter(id =>
          postSet(this._completedReferences, id, `${kind},${toId}`),
        )
        if (!alreadyAttached.length) {
          otherFeature.forEach(location => {
            location[pname].push(feature)
          })
        }
      } else {
        let ref = this._underConstructionOrphans[toId]
        if (!ref) {
          ref = {
            Parent: [],
            Derives_from: [],
          }
          this._underConstructionOrphans[toId] = ref
        }
        ref[kind].push(feature)
      }
    }

    references.Parent.forEach(toId => resolve('Parent', toId))
    references.Derives_from.forEach(toId => resolve('Derives_from', toId))
  }
}
|