fstrozzi-Gmap 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ require File.dirname(__FILE__) + '/gmap/core.rb'
@@ -0,0 +1,269 @@
1
+
2
+ # Author:: Francesco Strozzi
3
+ # Email:: francesco.strozzi@gmail.com
4
+ # Copyright:: 2008 Francesco Strozzi
5
+ # License:: The Ruby License
6
+
7
+
8
+
9
+ module Gmap
10
+
11
+ # This module allows the parsing of the standard output of Gmap (http://www.gene.com/share/gmap/)
12
+ #
13
+ # Example:
14
+ #
15
+ # Gmap::Core.open("output.gmap") do |gmap|
16
+ #
17
+ # gmap.each_sequence do |seq|
18
+ #
19
+ # seq.each do |result|
20
+ #
21
+ # result.query (Query sequence name)
22
+ # result.target (Target sequence name)
23
+ # result.q_start (Start coordinate of the query sequence)
24
+ # result.q_end (End coordinate of the query sequence)
25
+ # result.start (Start coordinate of the target sequence)
26
+ # result.end (End coordinate of the target sequence)
27
+ # result.strand (Strand of the target sequence)
28
+ # result.exons (# exons found)
29
+ # result.coverage (Coverage of the query sequence)
30
+ # result.perc_identity (Percentage of identity from the alignment)
31
+ # result.indels (# insertion or deletions)
32
+ # result.mismatch (# mismatch)
33
+ # result.aa_change (Prediction of AA changes from mismatches and indels found)
34
+ # result.aln (Raw alignment between target and query sequences)
35
+ #
36
+ # ONLY IF GENE MAPS ARE USED WITH GMAP
37
+ #
38
+ # result.gene_start (Start coordinate of the overlapping gene)
39
+ # result.gene_end (End coordinate of the overlapping gene)
40
+ # result.gene_id (ID of the overlapping gene)
41
+ # end
42
+ #
43
+ # end
44
+ #
45
+ # end
46
+ #
47
+
48
# Reader for the plain-text report produced by GMAP.
#
# A Core wraps an already opened IO-like object (anything that responds to
# +each_line+ and +close+) and turns the report into Result objects, one per
# "Path" block, grouped by query sequence.
class Core
  # A query header: ">" in the first column followed by the query name, which
  # runs up to the first whitespace character.
  QUERY_HEADER = /\A>(\S+)/

  # The summary line that opens every reported path ("Path 1: query ...").
  PATH_HEADER = /Path\s\d+/

  # First row of an alignment block: "<target>:<position> <bases>". The
  # character class also admits "|", exactly as the historical pattern did.
  GENOMIC_ROW = /:\d+\s[ATCG|]./

  # The underlying IO object the report is read from.
  attr_reader :io

  # io:: an open IO-like object positioned at the start of a GMAP report.
  def initialize(io)
    @io = io
  end

  # Opens the GMAP output file at +file+ for reading.
  #
  # Without a block the new Core is returned and the caller is responsible
  # for calling #close. With a block the Core is yielded, the file is closed
  # when the block finishes (even if it raises) and the value of the block is
  # returned.
  def self.open(file)
    f = File.open(file)
    return new(f) unless block_given?

    begin
      yield new(f)
    ensure
      # The block may already have closed the stream through Core#close.
      f.close unless f.closed?
    end
  end

  # Closes the IO stream on the GMAP file.
  def close
    @io.close
  end

  # Iterates over every query sequence found in the report, yielding an Array
  # of Result objects, one per path reported for that query. Paths are only
  # collected once an "Accessions" line has given them a target.
  #
  # Every yielded Array is a fresh object, so callers may keep it around.
  #
  # Raises ArgumentError ("Block needed") when called without a block; the
  # check happens before anything is read from the stream.
  def each_sequence
    raise ArgumentError, "Block needed" unless block_given?

    res = Result.new
    paths = []
    query = nil
    @io.each_line do |l|
      if l =~ QUERY_HEADER
        name = Regexp.last_match(1)
        if query
          # A new header closes the previous query: flush what was collected.
          res.query = query
          paths << res.dup if res.target
          yield paths
          paths = []
          res.clear
        end
        query = name
      elsif l =~ PATH_HEADER && res.target
        # A new path starts: store the one that was being filled in.
        res.query = query
        paths << res.dup
        res.clear
      end
      res = parse_line(res, l)
    end
    return unless query

    # Flush the last query of the stream.
    res.query = query
    paths << res.dup if res.target
    yield paths
  end

  private

  # Called from #each_sequence for every line of the report. Stores whatever
  # the line describes into +res+ and returns +res+. The caller's line is not
  # modified.
  #
  # While +res+ is in alignment-search mode the line is handed to #get_aln
  # instead of being matched against the summary patterns.
  def parse_line(res, l)
    line = l.chomp
    return get_aln(res, line) if res.search_aln

    case line
    when /Path \d+:\s+query\s+(\d+)--(\d+)\s+\(\d+ bp\)\s+=>/
      res.q_start = Regexp.last_match(1).to_i
      res.q_end = Regexp.last_match(2).to_i
    when /Genomic pos:.*\((.*)\sstrand\)/
      res.strand = Regexp.last_match(1).include?('+') ? 1 : -1
    when /Accessions:\s+(.*):(.*)--(.*)\s+\(out of.*/
      res.target = Regexp.last_match(1)
      # Coordinates are printed with thousands separators ("11,477").
      res.start = Regexp.last_match(2).delete(',').to_i
      res.end = Regexp.last_match(3).delete(',').to_i
    when /Number of exons: (\d+)/
      res.exons = Regexp.last_match(1).to_i if res.exons.nil?
    when /Trimmed coverage:\s(.*)\s\(trimmed length/
      res.coverage = Regexp.last_match(1).to_f if res.coverage.nil?
    when /Percent identity:\s(.*)\s\(\d+ matches, (\d+) mismatches, (\d+) indels,/
      if res.perc_identity.nil?
        res.perc_identity = Regexp.last_match(1).to_f
        res.mismatch = Regexp.last_match(2).to_i
        res.indels = Regexp.last_match(3).to_i
      end
    when /Amino acid changes: (.*)/
      change = Regexp.last_match(1)
      # An empty list of changes leaves aa_change as nil.
      res.aa_change = change if change =~ /\w/
    when /Alignment for path \d+:/
      # The following lines belong to the alignment: switch to search mode.
      res.set_search
    when /gene_maps\s+\S+:(\d+)\.\.(\d+)\s+(\d+)/
      res.gene_start = Regexp.last_match(1).to_i
      res.gene_end = Regexp.last_match(2).to_i
      res.gene_id = Regexp.last_match(3).to_i
    end
    res
  end

  # Called from #parse_line while +res+ is in alignment-search mode; +l+ is
  # the line without its trailing newline. Returns +res+.
  #
  # The genomic row switches saving on and is appended; the two rows that
  # follow it are appended as well, counted through res.c. Rows tagged
  # "aa.g" / "aa.c" are appended whenever they show up. On the call made
  # after the third counted row the trailing newline is removed from the
  # stored alignment and both search and save modes are switched off.
  def get_aln(res, l)
    row = "#{l}\n"

    if l =~ GENOMIC_ROW
      res.aln << row
      res.set_save
    end

    res.aln << row if res.c >= 1 && res.c < 3
    res.aln << row if l =~ /aa\.g/
    res.aln << row if l =~ /aa\.c/

    if res.c == 3
      res.aln.chomp!
      res.set_search
      res.set_save
    end

    res.count if res.search_aln && res.save_aln
    res
  end
end
197
+
198
# Stores the information of a single GMAP result (one "Path" of a query).
#
# Besides the parsed fields, a Result carries the small amount of state the
# parser uses while it collects the alignment rows: +search_aln+, +save_aln+
# and the row counter +c+.
class Result
  # Parsed fields; all of them are nil until set, except +aln+ which starts
  # out as an empty String.
  attr_accessor :query, :target, :q_start, :q_end, :start, :end, :strand, :exons,
                :coverage, :perc_identity, :indels, :mismatch, :aa_change,
                :gene_start, :gene_end, :gene_id, :aln

  # Parser control state (read-only from outside the class).
  attr_reader :search_aln, :c, :save_aln

  def initialize
    clear
  end

  # Called by #dup and #clone. Gives the copy its own alignment String so
  # that appending to the alignment of one object never changes the other.
  def initialize_copy(source)
    super
    @aln = @aln.dup unless @aln.nil?
  end

  # Resets every attribute of the result to its initial value.
  def clear
    @query = nil
    @target = nil
    @start = nil
    @end = nil
    @strand = nil
    @exons = nil
    @coverage = nil
    @perc_identity = nil
    @indels = nil
    @mismatch = nil
    @aa_change = nil
    @gene_start = nil
    @gene_end = nil
    @gene_id = nil
    @q_start = nil
    @q_end = nil
    # A fresh, mutable buffer: alignment rows are appended to it in place.
    @aln = String.new('')

    # Initialize control attributes
    @maps = false
    @search_aln = false
    @save_aln = false
    @c = 0
  end

  # Toggles the alignment-search flag and returns its new value.
  def set_search
    @search_aln = !@search_aln
  end

  # Toggles the alignment-save flag and returns its new value.
  def set_save
    @save_aln = !@save_aln
  end

  # Increments the alignment row counter and returns its new value.
  def count
    @c += 1
  end

  protected

  attr_writer :search_aln, :c, :path, :maps
end
262
+
263
+
264
+ end
265
+
266
+
267
+
268
+
269
+
@@ -0,0 +1,148 @@
1
+ >FIRST
2
+ Paths (8):
3
+ Path 1: query 14--26 (13 bp) => chr ENSBTAT00000028007:11,489--11,477 (-13 bp)
4
+ cDNA direction: indeterminate
5
+ Genomic pos: cow_trans:38,344,745--38,344,733 (- strand)
6
+ Accessions: ENSBTAT00000028007:11,477--11,489 (out of 16907 bp)
7
+ Number of exons: 1
8
+ Coverage: 36.1 (query length: 36 bp)
9
+ Trimmed coverage: 36.1 (trimmed length: 36 bp, trimmed region: 1..36)
10
+ Percent identity: 100.0 (13 matches, 0 mismatches, 0 indels, 0 unknowns)
11
+ Amino acid changes:
12
+
13
+ Alignments:
14
+ Alignment for path 1:
15
+
16
+ -ENSBTAT00000028007:11489-11477 (14-26) 100%
17
+
18
+ 0 . :
19
+
20
+ -ENSBTAT00000028007:11489 CTTCGTATTGCTG
21
+ |||||||||||||
22
+ 14 CTTCGTATTGCTG
23
+
24
+
25
+ Path 2: query 1--36 (36 bp) => chr ENSBTAT00000042528:264--229 (-36 bp)
26
+ cDNA direction: indeterminate
27
+ Genomic pos: cow_trans:264--229 (- strand)
28
+ Accessions: ENSBTAT00000042528:229--264 (out of 957 bp)
29
+ Number of exons: 1
30
+ Coverage: 100.0 (query length: 36 bp)
31
+ Trimmed coverage: 100.0 (trimmed length: 36 bp, trimmed region: 1..36)
32
+ Percent identity: 97.2 (35 matches, 1 mismatches, 0 indels, 0 unknowns)
33
+ Translation: 2..34 (11 aa)
34
+ Amino acid changes: K10T [28]
35
+
36
+ Alignments:
37
+ Alignment for path 2:
38
+
39
+ -ENSBTAT00000042528:264-229 (1-36) 97%
40
+
41
+ 0 . : . : . : .
42
+ aa.g 1 G I H M V K A R P K A
43
+ -ENSBTAT00000042528:264 GGGAATTCACATGGTTAAGGCTAGGCCTAAAGCTAT
44
+ ||||||||||||||||||||||||||||| ||||||
45
+ 1 GGGAATTCACATGGTTAAGGCTAGGCCTACAGCTAT
46
+ aa.c 1 G I H M V K A R P T A
47
+
48
+ Path 3: query 18--31 (14 bp) => chr ENSBTAT00000044819:611--624 (14 bp)
49
+ cDNA direction: indeterminate
50
+ Genomic pos: cow_trans:30,682,928--30,682,941 (+ strand)
51
+ Accessions: ENSBTAT00000044819:611--624 (out of 1972 bp)
52
+ Number of exons: 1
53
+ Coverage: 38.9 (query length: 36 bp)
54
+ Trimmed coverage: 38.9 (trimmed length: 36 bp, trimmed region: 1..36)
55
+ Percent identity: 100.0 (14 matches, 0 mismatches, 0 indels, 0 unknowns)
56
+ Amino acid changes:
57
+
58
+ Alignments:
59
+ Alignment for path 3:
60
+
61
+ +ENSBTAT00000044819:611-624 (18-31) 100%
62
+
63
+ 0 . :
64
+
65
+ +ENSBTAT00000044819:611 GAAATCTTGACTGA
66
+ ||||||||||||||
67
+ 18 GAAATCTTGACTGA
68
+
69
+ >SECOND
70
+ Paths (37):
71
+ Path 1: query 12--26 (15 bp) => chr chr17:21,154,442--21,154,428 (-15 bp)
72
+ cDNA direction: indeterminate
73
+ Genomic pos: bt3.1:609,940,407--609,940,393 (- strand)
74
+ Accessions: chr17:21,154,428--21,154,442 (out of 70149481 bp)
75
+ Number of exons: 1
76
+ Coverage: 41.7 (query length: 36 bp)
77
+ Trimmed coverage: 41.7 (trimmed length: 36 bp, trimmed region: 1..36)
78
+ Percent identity: 100.0 (15 matches, 0 mismatches, 0 indels, 0 unknowns)
79
+ Amino acid changes:
80
+
81
+ Alignments:
82
+ Alignment for path 1:
83
+
84
+ -chr17:21154442-21154428 (12-26) 100%
85
+
86
+ 0 . : .
87
+
88
+ -chr17:21154442 TTCGTATACCGTATT
89
+ |||||||||||||||
90
+ 12 TTCGTATACCGTATT
91
+
92
+
93
+ Maps:
94
+ Map hits for path 1 (0):
95
+
96
+ Path 2: query 9--27 (19 bp) => chr chr11:99,537,167--99,537,185 (19 bp)
97
+ cDNA direction: indeterminate
98
+ Genomic pos: bt3.1:195,355,821--195,355,839 (+ strand)
99
+ Accessions: chr11:99,537,167--99,537,185 (out of 101635058 bp)
100
+ Number of exons: 1
101
+ Coverage: 52.8 (query length: 36 bp)
102
+ Trimmed coverage: 52.8 (trimmed length: 36 bp, trimmed region: 1..36)
103
+ Percent identity: 94.7 (18 matches, 1 mismatches, 0 indels, 0 unknowns)
104
+ Amino acid changes:
105
+
106
+ Alignments:
107
+ Alignment for path 2:
108
+
109
+ +chr11:99537167-99537185 (9-27) 94%
110
+
111
+ 0 . : .
112
+
113
+ +chr11:99537167 AGGCATGCATGGCCCGAAC
114
+ | |||||||||||||||||
115
+ 9 ACGCATGCATGGCCCGAAC
116
+
117
+
118
+ Maps:
119
+ Map hits for path 2 (1):
120
+ gene_maps chr11:3711585..3721335 788340
121
+
122
+ Path 3: query 1--36 (36 bp) => chr chr22:57,923,909--57,926,444 (36 bp)
123
+ cDNA direction: indeterminate
124
+ Genomic pos: bt3.1:1,120,956,262--1,120,953,727 (- strand)
125
+ Accessions: chr22:57,923,909--57,926,444 (out of 59883977 bp)
126
+ Number of exons: 2
127
+ Coverage: 100.0 (query length: 36 bp)
128
+ Trimmed coverage: 100.0 (trimmed length: 36 bp, trimmed region: 1..36)
129
+ Percent identity: 100.0 (36 matches, 0 mismatches, 0 indels, 0 unknowns)
130
+ Translation: 1..36 (11 aa)
131
+ Amino acid changes:
132
+
133
+ Alignments:
134
+ Alignment for path 3:
135
+
136
+ -chr22:57923925-57923909 (20-36) 100%
137
+
138
+ 0 . : . : . : . : .
139
+ aa.g 1 R A P R R A G E G R G *
140
+ -chr22:57926444 CGCGCACCTCGGCGTGCAGGTG...CAGGTGAAGGGAGAGGATGA
141
+ ||||||||||||||||||||||||||||||||||||
142
+ 1 CGCGCACCTCGGCGTGCAG 2500 GTGAAGGGAGAGGATGA
143
+ aa.c 1 R A P R R A G E G R G *
144
+
145
+ Maps:
146
+ Map hits for path 3 (1):
147
+ gene_maps chr10:57912718..57926714 507939
148
+
@@ -0,0 +1,127 @@
1
+ require 'test/unit'
2
+ require 'lib/gmap'
3
+
4
# Checks the parser against the bundled sample report, which contains two
# query sequences (FIRST and SECOND) with three paths each.
class GmapTest < Test::Unit::TestCase

  def setup
    @data = 'samples/test.gmap'
  end

  # The non-block form must hand back an object that can be closed.
  def test_open
    assert_nothing_raised do
      g = Gmap::Core.open(@data)
      g.close
    end
  end

  # Walks over every sequence of the sample and checks each parsed path.
  def test_result
    queries = []
    Gmap::Core.open(@data) do |g|
      g.each_sequence do |seq|
        assert_instance_of Array, seq
        assert_equal 3, seq.size
        queries << seq[0].query
        check_first(seq) if seq[0].query == 'FIRST'
        check_second(seq) if seq[0].query == 'SECOND'
      end
    end
    # Without this the test would pass even if nothing had been yielded.
    assert_equal ['FIRST', 'SECOND'], queries
  end

  # Expected values for the three paths of the query named FIRST.
  def check_first(seq)
    # first result
    assert_equal 'ENSBTAT00000028007', seq[0].target
    assert_equal 36.1, seq[0].coverage
    assert_equal 100.0, seq[0].perc_identity
    assert_equal 0, seq[0].mismatch
    assert_equal 0, seq[0].indels
    assert_equal 14, seq[0].q_start
    assert_equal 26, seq[0].q_end
    assert_equal(-1, seq[0].strand)
    assert_equal 11477, seq[0].start
    assert_equal 11489, seq[0].end
    assert_equal 1, seq[0].exons
    assert_nil seq[0].aa_change
    assert_match(/CTTCGTATTGCTG/, seq[0].aln)
    # second result
    assert_equal 'ENSBTAT00000042528', seq[1].target
    assert_equal 100.0, seq[1].coverage
    assert_equal 97.2, seq[1].perc_identity
    assert_equal 1, seq[1].mismatch
    assert_equal 0, seq[1].indels
    assert_equal 1, seq[1].q_start
    assert_equal 36, seq[1].q_end
    assert_equal(-1, seq[1].strand)
    assert_equal 229, seq[1].start
    assert_equal 264, seq[1].end
    assert_equal 1, seq[1].exons
    assert_equal 'K10T [28]', seq[1].aa_change
    assert_match(/G I H M V K A R P K A/, seq[1].aln)
    assert_match(/GGGAATTCACATGGTTAAGGCTAGGCCTAAAGCTAT/, seq[1].aln)
    assert_match(/GGGAATTCACATGGTTAAGGCTAGGCCTACAGCTAT/, seq[1].aln)
    assert_match(/G I H M V K A R P T A/, seq[1].aln)
    # third result
    assert_equal 'ENSBTAT00000044819', seq[2].target
    assert_equal 38.9, seq[2].coverage
    assert_equal 100.0, seq[2].perc_identity
    assert_equal 0, seq[2].mismatch
    assert_equal 0, seq[2].indels
    assert_equal 18, seq[2].q_start
    assert_equal 31, seq[2].q_end
    assert_equal 1, seq[2].strand
    assert_equal 611, seq[2].start
    assert_equal 624, seq[2].end
    assert_equal 1, seq[2].exons
    assert_nil seq[2].aa_change
    assert_match(/GAAATCTTGACTGA/, seq[2].aln)
  end

  # Expected values for the three paths of the query named SECOND; the third
  # path also carries gene map information.
  def check_second(seq)
    # first result
    assert_equal 'chr17', seq[0].target
    assert_equal 41.7, seq[0].coverage
    assert_equal 100.0, seq[0].perc_identity
    assert_equal 0, seq[0].mismatch
    assert_equal 0, seq[0].indels
    assert_equal 12, seq[0].q_start
    assert_equal 26, seq[0].q_end
    assert_equal(-1, seq[0].strand)
    assert_equal 21154428, seq[0].start
    assert_equal 21154442, seq[0].end
    assert_equal 1, seq[0].exons
    assert_nil seq[0].aa_change
    assert_match(/TTCGTATACCGTATT/, seq[0].aln)
    # second result
    assert_equal 'chr11', seq[1].target
    assert_equal 52.8, seq[1].coverage
    assert_equal 94.7, seq[1].perc_identity
    assert_equal 1, seq[1].mismatch
    assert_equal 0, seq[1].indels
    assert_equal 9, seq[1].q_start
    assert_equal 27, seq[1].q_end
    assert_equal 1, seq[1].strand
    assert_equal 99537167, seq[1].start
    assert_equal 99537185, seq[1].end
    assert_equal 1, seq[1].exons
    assert_nil seq[1].aa_change
    assert_match(/AGGCATGCATGGCCCGAAC/, seq[1].aln)
    assert_match(/ACGCATGCATGGCCCGAAC/, seq[1].aln)
    # third result
    assert_equal 'chr22', seq[2].target
    assert_equal 100.0, seq[2].coverage
    assert_equal 100.0, seq[2].perc_identity
    assert_equal 0, seq[2].mismatch
    assert_equal 0, seq[2].indels
    assert_equal 1, seq[2].q_start
    assert_equal 36, seq[2].q_end
    assert_equal(-1, seq[2].strand)
    assert_equal 57923909, seq[2].start
    assert_equal 57926444, seq[2].end
    assert_equal 2, seq[2].exons
    assert_nil seq[2].aa_change
    assert_match(/R A P R R A G E G R G \*/, seq[2].aln)
    assert_match(/CGCGCACCTCGGCGTGCAGGTG\.\.\.CAGGTGAAGGGAGAGGATGA/, seq[2].aln)
    assert_match(/CGCGCACCTCGGCGTGCAG 2500 GTGAAGGGAGAGGATGA/, seq[2].aln)
    assert_equal 57912718, seq[2].gene_start
    assert_equal 57926714, seq[2].gene_end
    assert_equal 507939, seq[2].gene_id
  end

end