msa-parsers 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/gff/gffToInterPro.d.ts +20 -0
- package/dist/gff/gffToInterPro.js +83 -0
- package/dist/gff/gffToInterPro.js.map +1 -0
- package/dist/gff/gffToInterPro.test.d.ts +1 -0
- package/dist/gff/gffToInterPro.test.js +181 -0
- package/dist/gff/gffToInterPro.test.js.map +1 -0
- package/dist/gff/index.d.ts +3 -0
- package/dist/gff/index.js +4 -0
- package/dist/gff/index.js.map +1 -0
- package/dist/gff/interProToGFF.d.ts +9 -0
- package/dist/gff/interProToGFF.js +48 -0
- package/dist/gff/interProToGFF.js.map +1 -0
- package/dist/gff/interProToGFF.test.d.ts +1 -0
- package/dist/gff/interProToGFF.test.js +189 -0
- package/dist/gff/interProToGFF.test.js.map +1 -0
- package/dist/gff/parseGFF.d.ts +2 -0
- package/dist/gff/parseGFF.js +41 -0
- package/dist/gff/parseGFF.js.map +1 -0
- package/dist/gff/parseGFF.test.d.ts +1 -0
- package/dist/gff/parseGFF.test.js +92 -0
- package/dist/gff/parseGFF.test.js.map +1 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.js +9 -0
- package/dist/index.js.map +1 -0
- package/dist/msa/A3mMSA.d.ts +33 -0
- package/dist/msa/A3mMSA.js +280 -0
- package/dist/msa/A3mMSA.js.map +1 -0
- package/dist/msa/A3mMSA.test.d.ts +1 -0
- package/dist/msa/A3mMSA.test.js +155 -0
- package/dist/msa/A3mMSA.test.js.map +1 -0
- package/dist/msa/ClustalMSA.d.ts +30 -0
- package/dist/msa/ClustalMSA.js +53 -0
- package/dist/msa/ClustalMSA.js.map +1 -0
- package/dist/msa/EmfMSA.d.ts +27 -0
- package/dist/msa/EmfMSA.js +53 -0
- package/dist/msa/EmfMSA.js.map +1 -0
- package/dist/msa/FastaMSA.d.ts +19 -0
- package/dist/msa/FastaMSA.js +69 -0
- package/dist/msa/FastaMSA.js.map +1 -0
- package/dist/msa/StockholmMSA.d.ts +54 -0
- package/dist/msa/StockholmMSA.js +113 -0
- package/dist/msa/StockholmMSA.js.map +1 -0
- package/dist/msa/index.d.ts +18 -0
- package/dist/msa/index.js +34 -0
- package/dist/msa/index.js.map +1 -0
- package/dist/msa/index.test.d.ts +1 -0
- package/dist/msa/index.test.js +60 -0
- package/dist/msa/index.test.js.map +1 -0
- package/dist/msa/parseNewick.d.ts +60 -0
- package/dist/msa/parseNewick.js +95 -0
- package/dist/msa/parseNewick.js.map +1 -0
- package/dist/msa/stockholmParser.d.ts +22 -0
- package/dist/msa/stockholmParser.js +141 -0
- package/dist/msa/stockholmParser.js.map +1 -0
- package/dist/msa/stockholmParser.test.d.ts +1 -0
- package/dist/msa/stockholmParser.test.js +111 -0
- package/dist/msa/stockholmParser.test.js.map +1 -0
- package/dist/types.d.ts +66 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/dist/util.d.ts +2 -0
- package/dist/util.js +10 -0
- package/dist/util.js.map +1 -0
- package/package.json +25 -0
- package/src/gff/gffToInterPro.test.ts +202 -0
- package/src/gff/gffToInterPro.ts +113 -0
- package/src/gff/index.ts +3 -0
- package/src/gff/interProToGFF.test.ts +206 -0
- package/src/gff/interProToGFF.ts +59 -0
- package/src/gff/parseGFF.test.ts +106 -0
- package/src/gff/parseGFF.ts +46 -0
- package/src/index.ts +29 -0
- package/src/msa/A3mMSA.test.ts +192 -0
- package/src/msa/A3mMSA.ts +320 -0
- package/src/msa/ClustalMSA.ts +67 -0
- package/src/msa/EmfMSA.ts +67 -0
- package/src/msa/FastaMSA.ts +82 -0
- package/src/msa/StockholmMSA.ts +141 -0
- package/src/msa/index.test.ts +74 -0
- package/src/msa/index.ts +44 -0
- package/src/msa/parseNewick.ts +94 -0
- package/src/msa/stockholmParser.test.ts +123 -0
- package/src/msa/stockholmParser.ts +157 -0
- package/src/types.ts +68 -0
- package/src/util.ts +19 -0
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
import { describe, expect, test } from 'vitest'
|
|
2
|
+
|
|
3
|
+
import { gffToInterProResponse, gffToInterProResults } from './gffToInterPro'
|
|
4
|
+
|
|
5
|
+
import type { GFFRecord } from '../types'
|
|
6
|
+
|
|
7
|
+
describe('gffToInterProResults', () => {
  /**
   * Builds a baseline Pfam `protein_match` record on `seq1` spanning 10-50.
   * Each test passes only the fields that differ from that baseline.
   */
  const feature = (overrides: Partial<GFFRecord> = {}): GFFRecord => ({
    seq_id: 'seq1',
    source: 'Pfam',
    type: 'protein_match',
    start: 10,
    end: 50,
    score: 0,
    strand: '.',
    phase: '.',
    ...overrides,
  })

  test('converts empty array', () => {
    expect(gffToInterProResults([])).toEqual({})
  })

  test('converts single GFF record', () => {
    const converted = gffToInterProResults([
      feature({
        Name: 'PF00001',
        signature_desc: '7tm_1',
        description: 'GPCR family',
      }),
    ])

    expect(converted).toHaveProperty('seq1')
    expect(converted.seq1?.matches).toHaveLength(1)

    const onlyMatch = converted.seq1?.matches[0]
    expect(onlyMatch?.signature.entry).toEqual({
      accession: 'PF00001',
      name: '7tm_1',
      description: 'GPCR family',
    })
    expect(onlyMatch?.locations).toEqual([{ start: 10, end: 50 }])
    expect(converted.seq1?.xref).toEqual([{ id: 'seq1' }])
  })

  test('groups multiple records for same sequence', () => {
    const converted = gffToInterProResults([
      feature({ Name: 'PF00001' }),
      feature({ source: 'SMART', start: 60, end: 100, Name: 'SM00001' }),
    ])

    expect(Object.keys(converted)).toHaveLength(1)
    expect(converted.seq1?.matches).toHaveLength(2)
  })

  test('handles multiple sequences', () => {
    const converted = gffToInterProResults([
      feature({ Name: 'PF00001' }),
      feature({ seq_id: 'seq2', start: 5, end: 40, Name: 'PF00002' }),
    ])

    expect(Object.keys(converted)).toHaveLength(2)
    for (const seqId of ['seq1', 'seq2']) {
      expect(converted).toHaveProperty(seqId)
    }
  })

  test('combines locations for same accession', () => {
    const converted = gffToInterProResults([
      feature({ Name: 'PF00001' }),
      feature({ start: 100, end: 150, Name: 'PF00001' }),
    ])

    expect(converted.seq1?.matches).toHaveLength(1)

    const mergedLocations = converted.seq1?.matches[0]?.locations
    expect(mergedLocations).toHaveLength(2)
    expect(mergedLocations).toEqual([
      { start: 10, end: 50 },
      { start: 100, end: 150 },
    ])
  })

  test('uses ID as fallback for Name', () => {
    const converted = gffToInterProResults([
      feature({ source: 'Source', ID: 'domain_123' }),
    ])

    const accession = converted.seq1?.matches[0]?.signature.entry?.accession
    expect(accession).toBe('domain_123')
  })

  test('generates fallback accession from source and positions', () => {
    // No Name and no ID: the accession is derived from source and coordinates.
    const converted = gffToInterProResults([
      feature({ source: 'CustomSource' }),
    ])

    const accession = converted.seq1?.matches[0]?.signature.entry?.accession
    expect(accession).toBe('CustomSource_10_50')
  })
})
|
|
180
|
+
|
|
181
|
+
describe('gffToInterProResponse', () => {
  test('wraps results in response format', () => {
    // A single Pfam hit on seq1 should surface as one entry under `results`.
    const pfamHit: GFFRecord = {
      seq_id: 'seq1',
      source: 'Pfam',
      type: 'protein_match',
      start: 10,
      end: 50,
      score: 0,
      strand: '.',
      phase: '.',
      Name: 'PF00001',
    }

    const response = gffToInterProResponse([pfamHit])

    expect(response).toHaveProperty('results')
    expect(response.results).toHaveLength(1)

    const firstResult = response.results[0]
    expect(firstResult?.xref[0]?.id).toBe('seq1')
  })
})
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import type {
|
|
2
|
+
GFFRecord,
|
|
3
|
+
InterProScanResponse,
|
|
4
|
+
InterProScanResults,
|
|
5
|
+
} from '../types'
|
|
6
|
+
|
|
7
|
+
/**
 * Convert GFF records to InterProScan format.
 *
 * InterProScan GFF3 output format:
 * - seq_id: sequence identifier
 * - source: database/signature (e.g., "Pfam", "SMART")
 * - type: usually "protein_match"
 * - start/end: domain positions (1-based)
 * - Attributes: Name (accession), signature_desc (name), Dbxref, etc.
 *
 * Records are grouped by `seq_id`; within a sequence, records that resolve to
 * the same accession are merged into one match with several locations.
 *
 * @param gffRecords - parsed GFF records, in any order
 * @returns an object keyed by sequence id (in first-seen order), each value
 *   holding that sequence's matches and an `xref` entry carrying the id
 */
export function gffToInterProResults(
  gffRecords: GFFRecord[],
): Record<string, InterProScanResults> {
  // A Map keeps first-seen order and is safe for arbitrary sequence ids.
  const bySequence = new Map<string, GFFRecord[]>()
  for (const record of gffRecords) {
    const group = bySequence.get(record.seq_id)
    if (group) {
      group.push(record)
    } else {
      bySequence.set(record.seq_id, [record])
    }
  }

  const results: Record<string, InterProScanResults> = {}
  for (const [seqId, records] of bySequence) {
    const result: InterProScanResults = {
      matches: collectSequenceMatches(records),
      xref: [{ id: seqId }],
    }
    // Define an own property instead of `results[seqId] = result`: plain
    // assignment to the key "__proto__" would hit the Object.prototype setter
    // and silently drop that sequence from the output.
    Object.defineProperty(results, seqId, {
      value: result,
      enumerable: true,
      writable: true,
      configurable: true,
    })
  }

  return results
}

/**
 * Collapse the records of one sequence into one match per accession.
 *
 * The accession is `Name`, else `ID`, else `<source>_<start>_<end>`. The
 * name/description of a match come from the first record seen for that
 * accession; later records only contribute their location.
 */
function collectSequenceMatches(records: GFFRecord[]) {
  const byAccession = new Map<
    string,
    {
      signature: {
        entry: { name: string; description: string; accession: string }
      }
      locations: { start: number; end: number }[]
    }
  >()

  for (const record of records) {
    const accession =
      (record.Name as string) ||
      (record.ID as string) ||
      `${record.source}_${record.start}_${record.end}`
    const location = { start: record.start, end: record.end }

    const known = byAccession.get(accession)
    if (known) {
      known.locations.push(location)
      continue
    }

    const name =
      (record.signature_desc as string) ||
      (record.Name as string) ||
      accession
    const description =
      (record.Ontology_term as string) ||
      (record.description as string) ||
      (record.Note as string) ||
      name

    byAccession.set(accession, {
      signature: { entry: { name, description, accession } },
      locations: [location],
    })
  }

  return Array.from(byAccession.values())
}
|
|
91
|
+
|
|
92
|
+
/**
 * Parse a GFF string with the supplied parser and convert the resulting
 * records to InterProScan format in one step.
 *
 * @param gffStr - raw GFF text
 * @param parseGFFfn - parser turning GFF text into records
 * @returns InterProScan results keyed by sequence id
 */
export function parseGFFToInterPro(
  gffStr: string,
  parseGFFfn: (str: string) => GFFRecord[],
): Record<string, InterProScanResults> {
  return gffToInterProResults(parseGFFfn(gffStr))
}
|
|
102
|
+
|
|
103
|
+
/**
 * Build a full InterProScanResponse from GFF records.
 *
 * @param gffRecords - parsed GFF records
 * @returns a response whose `results` array holds one entry per sequence
 */
export function gffToInterProResponse(
  gffRecords: GFFRecord[],
): InterProScanResponse {
  // The per-sequence keys are dropped; each result keeps its id in `xref`.
  const results = Object.values(gffToInterProResults(gffRecords))
  return { results }
}
|
package/src/gff/index.ts
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
import { describe, expect, test } from 'vitest'
|
|
2
|
+
|
|
3
|
+
import { interProResponseToGFF, interProToGFF } from './interProToGFF'
|
|
4
|
+
|
|
5
|
+
import type { InterProScanResults } from '../types'
|
|
6
|
+
|
|
7
|
+
describe('interProToGFF', () => {
  const GFF_HEADER = '##gff-version 3'

  /** Builds the result for one sequence carrying a single signature match. */
  const singleMatch = (
    id: string,
    entry: { accession: string; name: string; description: string },
    locations: { start: number; end: number }[],
  ): InterProScanResults => ({
    matches: [{ signature: { entry }, locations }],
    xref: [{ id }],
  })

  test('converts empty results', () => {
    expect(interProToGFF({})).toBe(GFF_HEADER)
  })

  test('converts single result', () => {
    const results: Record<string, InterProScanResults> = {
      seq1: singleMatch(
        'seq1',
        { accession: 'PF00001', name: '7tm_1', description: 'GPCR family' },
        [{ start: 10, end: 50 }],
      ),
    }

    const [header, feature] = interProToGFF(results).split('\n')

    expect(header).toBe(GFF_HEADER)
    const expectedFragments = [
      'seq1',
      'InterProScan',
      'protein_match',
      '10',
      '50',
      'Name=PF00001',
      'signature_desc=7tm_1',
      'description=GPCR%20family',
    ]
    for (const fragment of expectedFragments) {
      expect(feature).toContain(fragment)
    }
  })

  test('handles multiple locations', () => {
    const results: Record<string, InterProScanResults> = {
      seq1: singleMatch(
        'seq1',
        { accession: 'PF00001', name: 'domain', description: 'test' },
        [
          { start: 10, end: 50 },
          { start: 100, end: 150 },
        ],
      ),
    }

    const rows = interProToGFF(results).split('\n')

    // Header plus one feature line per location.
    expect(rows).toHaveLength(3)
    expect(rows[1]).toContain('10\t50')
    expect(rows[2]).toContain('100\t150')
  })

  test('handles multiple sequences', () => {
    const results: Record<string, InterProScanResults> = {
      seq1: singleMatch(
        'seq1',
        { accession: 'PF00001', name: 'domain1', description: 'test1' },
        [{ start: 10, end: 50 }],
      ),
      seq2: singleMatch(
        'seq2',
        { accession: 'PF00002', name: 'domain2', description: 'test2' },
        [{ start: 5, end: 40 }],
      ),
    }

    const rows = interProToGFF(results).split('\n')

    expect(rows).toHaveLength(3)
    for (const seqId of ['seq1', 'seq2']) {
      expect(rows.some(row => row.includes(seqId))).toBe(true)
    }
  })

  test('skips matches without entry', () => {
    const results: Record<string, InterProScanResults> = {
      seq1: {
        matches: [{ signature: {}, locations: [{ start: 10, end: 50 }] }],
        xref: [{ id: 'seq1' }],
      },
    }

    const rows = interProToGFF(results).split('\n')

    // Only the version header remains.
    expect(rows).toHaveLength(1)
    expect(rows[0]).toBe(GFF_HEADER)
  })

  test('URL-encodes special characters in attributes', () => {
    const results: Record<string, InterProScanResults> = {
      seq1: singleMatch(
        'seq1',
        {
          accession: 'PF00001',
          name: 'test;name=value',
          description: 'description with spaces',
        },
        [{ start: 10, end: 50 }],
      ),
    }

    const gff = interProToGFF(results)

    expect(gff).toContain('signature_desc=test%3Bname%3Dvalue')
    expect(gff).toContain('description=description%20with%20spaces')
  })
})
|
|
158
|
+
|
|
159
|
+
describe('interProResponseToGFF', () => {
  /** Builds one result with a single PF00001 match and the given xref list. */
  const resultWithXref = (xref: { id: string }[]): InterProScanResults => ({
    matches: [
      {
        signature: {
          entry: { accession: 'PF00001', name: 'domain', description: 'test' },
        },
        locations: [{ start: 10, end: 50 }],
      },
    ],
    xref,
  })

  test('converts array of results', () => {
    const gff = interProResponseToGFF([resultWithXref([{ id: 'seq1' }])])

    expect(gff).toContain('##gff-version 3')
    expect(gff).toContain('seq1')
  })

  test('handles results without xref', () => {
    // With no xref there is no sequence id, so only the header is emitted.
    const gff = interProResponseToGFF([resultWithXref([])])

    expect(gff).toBe('##gff-version 3')
  })
})
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import type { InterProScanResults } from '../types'
|
|
2
|
+
|
|
3
|
+
/**
 * Convert InterProScan results to GFF3 format.
 *
 * Emits the `##gff-version 3` header followed by one `protein_match` feature
 * line per location of every match that has a signature entry. Matches
 * without an entry are skipped. Score, strand and phase are always `.`.
 *
 * @param results - InterProScan results keyed by sequence id; the key is
 *   written verbatim into the seqid column
 * @returns newline-joined GFF3 text with no trailing newline
 */
export function interProToGFF(
  results: Record<string, InterProScanResults>,
): string {
  const lines: string[] = ['##gff-version 3']

  for (const [seqId, data] of Object.entries(results)) {
    for (const match of data.matches) {
      const entry = match.signature.entry
      if (!entry) {
        continue
      }

      // The attributes depend only on the entry, so build them once per match
      // rather than once per location.
      const attributes = formatGffAttributes([
        ['Name', entry.accession],
        ['signature_desc', entry.name],
        ['description', entry.description],
      ])

      for (const location of match.locations) {
        lines.push(
          [
            seqId,
            'InterProScan',
            'protein_match',
            location.start,
            location.end,
            '.',
            '.',
            '.',
            attributes,
          ].join('\t'),
        )
      }
    }
  }

  return lines.join('\n')
}

/**
 * Serialise attribute pairs for GFF3 column 9 as `key=value` joined by `;`.
 *
 * Values are percent-encoded with `encodeURIComponent`. Pairs whose value is
 * null or undefined are omitted, so a missing field is never written as the
 * literal text "undefined" or "null". Empty strings are kept. Returns `.`,
 * the GFF3 placeholder for an empty column, when no pair remains.
 */
function formatGffAttributes(
  pairs: [string, string | null | undefined][],
): string {
  const parts: string[] = []
  for (const [key, value] of pairs) {
    if (value != null) {
      parts.push(`${key}=${encodeURIComponent(value)}`)
    }
  }
  return parts.length > 0 ? parts.join(';') : '.'
}
|
|
44
|
+
|
|
45
|
+
/**
 * Convert InterProScan JSON response results to GFF3 format.
 *
 * Each result is keyed by the id of its first `xref` entry. Results with no
 * xref (or an empty id) are skipped. When several results share an id, the
 * last one wins.
 *
 * @param results - the `results` array of an InterProScan response
 * @returns GFF3 text as produced by `interProToGFF`
 */
export function interProResponseToGFF(results: InterProScanResults[]): string {
  const resultsMap: Record<string, InterProScanResults> = {}

  for (const result of results) {
    const seqId = result.xref[0]?.id
    if (!seqId) {
      continue
    }
    // Define an own property instead of `resultsMap[seqId] = result`: plain
    // assignment to the key "__proto__" would hit the Object.prototype setter
    // and the result would never reach the output.
    Object.defineProperty(resultsMap, seqId, {
      value: result,
      enumerable: true,
      writable: true,
      configurable: true,
    })
  }

  return interProToGFF(resultsMap)
}
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import { describe, expect, test } from 'vitest'
|
|
2
|
+
|
|
3
|
+
import { parseGFF } from './parseGFF'
|
|
4
|
+
|
|
5
|
+
describe('parseGFF', () => {
  /** Joins columns with tabs to form one GFF feature line. */
  const row = (...columns: string[]): string => columns.join('\t')
  /** Joins lines with newlines to form a multi-line GFF document. */
  const doc = (...lines: string[]): string => lines.join('\n')

  test('parses empty string', () => {
    expect(parseGFF('')).toEqual([])
    expect(parseGFF(undefined)).toEqual([])
  })

  test('parses basic GFF3 line', () => {
    const parsed = parseGFF(
      row(
        'seq1',
        'InterProScan',
        'protein_match',
        '10',
        '50',
        '.',
        '+',
        '.',
        'Name=PF00001',
      ),
    )

    expect(parsed).toHaveLength(1)
    expect(parsed[0]).toEqual({
      seq_id: 'seq1',
      source: 'InterProScan',
      type: 'protein_match',
      start: 10,
      end: 50,
      score: 0,
      strand: '+',
      phase: '.',
      Name: 'PF00001',
    })
  })

  test('parses multiple attributes', () => {
    const parsed = parseGFF(
      row(
        'seq1',
        'Pfam',
        'protein_match',
        '10',
        '50',
        '1.5',
        '.',
        '.',
        'Name=PF00001;signature_desc=7tm_1;description=GPCR',
      ),
    )

    expect(parsed[0]).toMatchObject({
      Name: 'PF00001',
      signature_desc: '7tm_1',
      description: 'GPCR',
    })
  })

  test('handles URL-encoded attribute values', () => {
    const parsed = parseGFF(
      row(
        'seq1',
        'Source',
        'type',
        '1',
        '10',
        '.',
        '.',
        '.',
        'Note=Hello%20World%3B%3D',
      ),
    )

    expect(parsed[0]?.Note).toBe('Hello World;=')
  })

  test('skips comment lines', () => {
    const parsed = parseGFF(
      doc(
        '##gff-version 3',
        '# This is a comment',
        row('seq1', 'Source', 'type', '1', '10', '.', '.', '.', 'Name=test'),
      ),
    )

    expect(parsed).toHaveLength(1)
    expect(parsed[0]?.seq_id).toBe('seq1')
  })

  test('skips empty lines', () => {
    const parsed = parseGFF(
      doc(
        row('seq1', 'Source', 'type', '1', '10', '.', '.', '.', 'Name=test1'),
        '',
        row('seq2', 'Source', 'type', '20', '30', '.', '.', '.', 'Name=test2'),
      ),
    )

    expect(parsed).toHaveLength(2)
  })

  test('handles missing attributes column', () => {
    const parsed = parseGFF(
      row('seq1', 'Source', 'type', '1', '10', '.', '.', '.'),
    )

    expect(parsed).toHaveLength(1)
    expect(parsed[0]?.seq_id).toBe('seq1')
  })

  test('handles partial GFF lines gracefully', () => {
    // Only three columns: the coordinates are expected to come back as 0.
    const parsed = parseGFF(row('seq1', 'Source', 'type'))

    expect(parsed).toHaveLength(1)
    expect(parsed[0]).toMatchObject({
      seq_id: 'seq1',
      source: 'Source',
      type: 'type',
      start: 0,
      end: 0,
    })
  })

  test('parses numeric score', () => {
    const parsed = parseGFF(
      row('seq1', 'Source', 'type', '1', '10', '45.6', '.', '.', 'Name=test'),
    )

    expect(parsed[0]?.score).toBe(45.6)
  })

  test('handles comma-separated values in attributes', () => {
    const parsed = parseGFF(
      row(
        'seq1',
        'Source',
        'type',
        '1',
        '10',
        '.',
        '.',
        '.',
        'Ontology_term=GO:0001,GO:0002',
      ),
    )

    expect(parsed[0]?.Ontology_term).toBe('GO:0001 GO:0002')
  })

  test('parses multiple lines', () => {
    const parsed = parseGFF(
      doc(
        row(
          'seq1',
          'Pfam',
          'protein_match',
          '10',
          '50',
          '.',
          '.',
          '.',
          'Name=PF00001',
        ),
        row(
          'seq1',
          'SMART',
          'protein_match',
          '60',
          '100',
          '.',
          '.',
          '.',
          'Name=SM00001',
        ),
        row(
          'seq2',
          'Pfam',
          'protein_match',
          '5',
          '40',
          '.',
          '.',
          '.',
          'Name=PF00002',
        ),
      ),
    )

    expect(parsed).toHaveLength(3)
    const expectedIds = ['seq1', 'seq1', 'seq2']
    expectedIds.forEach((seqId, index) => {
      expect(parsed[index]?.seq_id).toBe(seqId)
    })
  })
})
|