mgnu 2.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +0 -0
  3. data/README.md +31 -0
  4. data/Rakefile +33 -0
  5. data/lib/mgnu.rb +9 -0
  6. data/lib/mgnu/alignment.rb +143 -0
  7. data/lib/mgnu/common.rb +68 -0
  8. data/lib/mgnu/genbank.rb +117 -0
  9. data/lib/mgnu/genbank/feature.rb +84 -0
  10. data/lib/mgnu/genbank/location.rb +150 -0
  11. data/lib/mgnu/genbank/qualifier.rb +45 -0
  12. data/lib/mgnu/genbank/reference.rb +114 -0
  13. data/lib/mgnu/genbank/source.rb +39 -0
  14. data/lib/mgnu/loggable.rb +61 -0
  15. data/lib/mgnu/parser.rb +50 -0
  16. data/lib/mgnu/parser/blast.rb +87 -0
  17. data/lib/mgnu/parser/blast/format0.rb +290 -0
  18. data/lib/mgnu/parser/blast/format7.rb +121 -0
  19. data/lib/mgnu/parser/blast/format8.rb +120 -0
  20. data/lib/mgnu/parser/blast/hsp.rb +75 -0
  21. data/lib/mgnu/parser/blast/query.rb +45 -0
  22. data/lib/mgnu/parser/blast/sbjct.rb +62 -0
  23. data/lib/mgnu/parser/clustalw.rb +72 -0
  24. data/lib/mgnu/parser/fasta.rb +61 -0
  25. data/lib/mgnu/parser/fasta_header_index.rb +39 -0
  26. data/lib/mgnu/parser/fasta_index.rb +57 -0
  27. data/lib/mgnu/parser/fastq.rb +61 -0
  28. data/lib/mgnu/parser/genbank.rb +187 -0
  29. data/lib/mgnu/parser/gff.rb +56 -0
  30. data/lib/mgnu/parser/iprscan/hit.rb +76 -0
  31. data/lib/mgnu/parser/iprscan_file.rb +39 -0
  32. data/lib/mgnu/parser/kegg_ontology_index.rb +163 -0
  33. data/lib/mgnu/parser/pilercr.rb +102 -0
  34. data/lib/mgnu/parser/prodigal.rb +170 -0
  35. data/lib/mgnu/parser/sam.rb +115 -0
  36. data/lib/mgnu/parser/sam/alignment.rb +22 -0
  37. data/lib/mgnu/parser/sam/header.rb +23 -0
  38. data/lib/mgnu/parser/sam/pair.rb +18 -0
  39. data/lib/mgnu/sequence.rb +207 -0
  40. data/lib/mgnu/sequence/fasta.rb +79 -0
  41. data/lib/mgnu/sequence/fastq.rb +43 -0
  42. data/lib/mgnu/version.rb +16 -0
  43. data/mgnu.gemspec +39 -0
  44. data/spec/mgnu/parser/blast_format0_spec.rb +114 -0
  45. data/spec/mgnu/parser/blast_format7_spec.rb +24 -0
  46. data/spec/mgnu/parser/blast_format8_spec.rb +26 -0
  47. data/spec/mgnu/parser/blast_multihsp_spec.rb +100 -0
  48. data/spec/mgnu/parser/blast_oof_spec.rb +53 -0
  49. data/spec/mgnu/parser/clustalw_spec.rb +90 -0
  50. data/spec/mgnu/parser/fasta_header_index_tc_parser_spec.rb +25 -0
  51. data/spec/mgnu/parser/fasta_index_tc_parser_spec.rb +25 -0
  52. data/spec/mgnu/parser/fasta_parser_spec.rb +53 -0
  53. data/spec/mgnu/parser_spec.rb +22 -0
  54. data/spec/mgnu/sequence/fasta_spec.rb +60 -0
  55. data/spec/mgnu/sequence/fastq_spec.rb +31 -0
  56. data/spec/mgnu/sequence_spec.rb +81 -0
  57. data/spec/mgnu_spec.rb +7 -0
  58. data/spec/spec_helper.rb +53 -0
  59. metadata +376 -0
@@ -0,0 +1,87 @@
1
+ # require 'xml/libxml'
2
+ require 'mgnu/loggable'
3
+ require 'ox'
4
+
5
module MgNu
  module Parser
    # Front end for the BLAST report parsers.
    #
    # Picks one of the format-specific parsers (Format0 for pairwise text,
    # Format7 for XML, Format8 for tabular output) and delegates to it.
    # The input may be the path of a readable file or a String that holds
    # the report itself.
    class Blast
      require 'stringio'
      require_relative 'blast/format7'
      require_relative 'blast/format8'
      require_relative 'blast/format0'

      include Loggable
      include Enumerable

      attr_accessor :format, :input

      # Create a new blast parser.
      #
      # @param input [String] path of a readable report file, or a String
      #   containing the report text
      # @param format [Integer, nil] BLAST output format (0, 7 or 8). When
      #   nil the format is guessed from the file name: names containing
      #   ".xml" select 7 and names ending in "8" select 8. It cannot be
      #   guessed for String input.
      # @raise [RuntimeError] if input is neither a readable file nor a String
      # @note Logs an error and terminates the process with exit status 1
      #   when input is missing or the format cannot be determined.
      def initialize(input = nil, format = nil)
        unless input
          error('MgNu::Parser::Blast.new(): needs a filename or string of Blast data')
          exit(1)
        end

        # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
        if File.exist?(input) && File.readable?(input)
          @file = File.open(input)
          @input_type = File
        elsif input.is_a?(String)
          # Assume a string containing the blast report. Wrapping it in a
          # StringIO gives the stream-based parsers (formats 0 and 7) the
          # same reading interface they get from an open file.
          @file = StringIO.new(input)
          @input_type = String
        else
          raise "\n\n -- No file by that name (#{input}). Exiting\n\n"
        end

        @input = input
        @format = format

        # don't overwrite a format if given one
        @format = detect_format if @format.nil?

        if @format.nil?
          error('Please set the format type!')
          exit(1)
        end

        @parser = build_parser
      end # end initialize

      # Run the selected parser over the whole input.
      #
      # Format 7 is streamed through Ox's SAX parser with the Format7
      # handler; the other formats parse themselves.
      #
      # @return the parser's collected queries
      def parse
        if @format == 7
          Ox.sax_parse(@parser, @file)
        else
          @parser.parse
        end
        @parser.queries
      end

      # Delegate iteration to the underlying format parser.
      # NOTE(review): requires the selected parser to implement #each;
      # the Format0 and Format7 classes do not appear to define one - confirm.
      def each(&b)
        @parser.each(&b)
      end

      private

      # Guess the output format from the file name. Only file input is
      # inspected; a trailing "8" takes precedence over ".xml", matching
      # the order in which the two checks were originally applied.
      def detect_format
        return nil unless @input_type == File

        name = @input.to_s # to_s keeps non-String path objects from breaking =~
        return 8 if name =~ /.*8$/
        return 7 if name =~ /.*\.xml/

        nil
      end

      # Instantiate the parser matching @format, or nil for an
      # unrecognised format number.
      def build_parser
        case @format
        when 7
          Format7.new
        when 8
          # Format8 keeps receiving the raw String for string input.
          Format8.new(@input_type == File ? @file : @input)
        when 0
          Format0.new(@file)
        end
      end
    end # end of MgNu::Parser::Blast class
  end # end of MgNu::Parser module
end # end of MgNu module
@@ -0,0 +1,290 @@
1
+ require 'mgnu/parser/blast/query'
2
+ require 'mgnu/parser/blast/sbjct'
3
+ require 'mgnu/parser/blast/hsp'
4
+
5
module MgNu
  module Parser
    class Blast
      # Parser for BLAST pairwise text reports (output format 0).
      #
      # Handles both the legacy header layout ("(N letters)") and the BLAST+
      # layout ("Length=N"). Results accumulate in #queries as Query objects;
      # each Query collects Sbjct hits, and each Sbjct collects Hsp alignments.
      class Format0
        include MgNu::Parser

        # E-value in an HSP header: plain ("0.0", "10"), exponent ("2e-45"),
        # or the legacy shorthand with no mantissa ("e-100", meaning 1e-100).
        EVALUE_PATTERN =
          /Expect(?:\(\d+\))?\s*=\s*(\d+(?:\.\d+)?(?:e[+-]?\d+)?|e[+-]?\d+)/

        attr_accessor :queries, :blast_type

        # create a new Format0 parser
        #
        # @param file an IO-like object positioned at the start of the report
        def initialize(file)
          @query = nil
          @sbjct = nil
          @sbjct_number = 0
          @blast_type = nil
          @queries = []

          @file = file
        end

        # parse the input blast file
        #
        # The first token of the first line is kept as the blast type; the
        # rest of the input is consumed one "Query=" chunk at a time.
        def parse
          @blast_type = @file.readline.split[0]
          buffer = parse_until(@file, /^Query=/) # get the 1st chunk of the blast report
          while buffer.length > 0
            process_buffer(buffer) if buffer[0] =~ /^Query=/
            buffer = parse_until(@file, /^Query=/)
          end
        end # end parse

        # filter a blast query entry for important parts
        # @param [Array] buffer containing a "Query=" block from
        #   the blast output file
        # @return false for an empty buffer, otherwise the queries list
        def process_buffer(buffer)
          return false if buffer.length == 0

          extract_query(buffer)
          @queries << @query
        end # end process_buffer

        # Build @query from the head of a "Query=" block, then hand each
        # ">"-delimited subject section to #extract_sbjct.
        #
        # @param [Array] buffer lines of one query block (consumed)
        def extract_query(buffer)
          header = read_until_blank_line(buffer)
          header.gsub!(/\s+/, ' ')
          @query = Query.new
          if (legacy = header.match(/^Query= (.+?) (.*) ?\(([0-9,]+) letters\)/))
            assign_query_header(legacy)
          else
            # Blast+ output has query id/definition and length on separate lines
            while (line = buffer.shift)
              header += line.chomp
              break if line =~ /Length=/
            end
            plus = header.match(/^Query=\s([^\s]+)\s?(.*)\s?Length=([,\d]+)/)
            assign_query_header(plus) if plus
          end
          extract_dbinfo(buffer)

          sbjct_lines = []
          while (line = buffer.shift)
            if line =~ /^>/ && !sbjct_lines.empty?
              extract_sbjct(sbjct_lines)
              sbjct_lines.clear
              buffer.unshift(line) # let the new subject start its own buffer
            else
              sbjct_lines << line
            end
          end
          extract_sbjct(sbjct_lines) unless sbjct_lines.empty?
        end # end extract_query

        # Read the "Database:" paragraph into @query and discard the hit
        # summary lines that precede the first subject. A paragraph that does
        # not match is reported on stderr and the database fields stay unset.
        #
        # @param [Array] buffer remaining lines of the query block (consumed
        #   up to, but not including, the first line starting with ">")
        def extract_dbinfo(buffer)
          summary = read_until_blank_line(buffer)
          summary.gsub!(/\s+/, ' ')

          db = summary.match(
            /Database:\s+(.+)\.?\s+([0-9,]+)\s+sequences;\s+([0-9,]+)\s+total\s+letters/
          )
          if db
            @query.database = db[1]
            @query.database_sequence_count = db[2].delete(',').to_i
            @query.database_total_letters = db[3].delete(',').to_i
          else
            $stderr.puts "extract_dbinfo: Database line mismatch!"
            $stderr.puts "database, database_sequence_count and database_total_letters are not set"
            $stderr.puts summary
          end

          # eat up single-line summary cruft until beginning of subjects
          buffer.shift until buffer.empty? || buffer.first =~ /^>/
        end

        # Build @sbjct from one subject section and attach it to @query.
        #
        # @param [Array] buffer lines of a single subject, starting with its
        #   ">" header line (consumed)
        # @note Terminates the process with exit status 1 when the buffer
        #   does not start with a ">" header line.
        def extract_sbjct(buffer)
          if buffer[0] !~ /^>/
            $stderr.puts "can't process subject buffer - missing fasta header line!"
            exit(1)
          end

          header = ''
          # read until blank line to get header, but ensure that we already have Length= line
          while (line = buffer.shift)
            break if line =~ /^\s*$/ && header =~ /Length\s*=/

            header += line.chomp
          end
          header.gsub!(/\s+/, ' ') # shrink spaces
          @sbjct = Sbjct.new
          described = header.match(/>(.+?)\s+(.*)\s*Length\s+=\s+(\d+)/m) ||
                      header.match(/>\s(.+?)\s+(.*)\s*Length=\s?(\d+)/m)
          if described
            @sbjct.number = @query.sbjcts.length + 1
            @sbjct.sbjct_id = described[1]
            @sbjct.definition = described[2].rstrip
            @sbjct.length = described[3].to_i
            @sbjct.query = @query
          end

          hsp_lines = []
          while (line = buffer.shift)
            if line =~ /^>/ && !hsp_lines.empty?
              extract_all_hsps(hsp_lines)
              hsp_lines.clear
              buffer.unshift(line)
              break
            else
              hsp_lines << line
            end
          end
          extract_all_hsps(hsp_lines) unless hsp_lines.empty?

          @query.sbjcts << @sbjct
        end

        # create Hsp objects from the complete alignment section
        #
        # @param [Array] buffer containing all the lines from the
        #   alignment section (consumed)
        # @note Terminates the process with exit status 1 when the buffer
        #   does not start with a "Score =" line.
        def extract_all_hsps(buffer)
          unless buffer[0] =~ /^\s+Score =/
            $stderr.puts "can't process HSP buffer - missing Score = line!"
            exit(1)
          end

          hsp_lines = []
          while (line = buffer.shift)
            if line =~ /^\s+Score =/ && !hsp_lines.empty?
              process_hsp(hsp_lines)
              hsp_lines.clear
              buffer.unshift(line) # the Score line opens the next HSP
            else
              hsp_lines << line
            end
          end
          process_hsp(hsp_lines) unless hsp_lines.empty?
        end

        # Build one Hsp from its header paragraph and alignment rows and
        # append it to the current subject.
        #
        # @param [Array] buffer lines of a single HSP, starting with its
        #   "Score =" line (consumed)
        # @note Terminates the process with exit status 1 when the buffer
        #   does not start with a "Score =" line.
        def process_hsp(buffer)
          unless buffer[0] =~ /^\s+Score =/
            $stderr.puts "can't process HSP buffer - missing Score = line!"
            exit(1)
          end

          hsp = Hsp.new
          # read until blank line to get header
          assign_hsp_statistics(hsp, read_until_blank_line(buffer))
          read_alignment_rows(hsp, buffer)
          @sbjct.hsps << hsp # add this hsp to the sbjct
        end # end process_hsp

        private

        # Shift lines off buffer up to (and including) the first blank line
        # and return the non-blank ones joined without their line endings.
        def read_until_blank_line(buffer)
          pieces = []
          while (line = buffer.shift)
            break if line =~ /^\s*$/ # empty line, break

            pieces << line.chomp
          end
          pieces.join
        end

        # Copy id, definition and length from a matched query header.
        def assign_query_header(match)
          @query.query_id = match[1]
          @query.definition = match[2]
          @query.length = match[3].delete(',').to_i
        end

        # Fill the score/statistics fields of hsp from its header text.
        def assign_hsp_statistics(hsp, header)
          if (m = header.match(/ Score =\s+(\d+(?:\.\d+)?)\s+bits\s+\((\d+)\)/))
            hsp.bit_score = m[1].to_f
            hsp.score = m[2].to_i
          end

          if (m = header.match(EVALUE_PATTERN))
            raw = m[1]
            # "e-100".to_f is 0.0; the shorthand means 1e-100
            raw = "1#{raw}" if raw.start_with?('e')
            hsp.evalue = raw.to_f
          end

          if (m = header.match(/Identities\s+=\s+(\d+)\/(\d+)\s+\((\d+%)\)/))
            hsp.length = m[2].to_i
            # NOTE(review): identity stores the percentage here, whereas
            # positive below stores a count - confirm which is intended.
            hsp.identity = m[3].to_i
          end

          if (m = header.match(/Positives\s+=\s+(\d+)\/(\d+)\s+\((\d+)%\)/))
            hsp.positive = m[1].to_i
          end

          if (m = header.match(/Gaps\s+=\s+(\d+)\/(\d+)\s+\((\d+%)\)/))
            hsp.gap_count = m[1].to_i
          end

          # The two-frame form must be tried first: the single-frame pattern
          # also matches its prefix and would hide the subject frame.
          if (m = header.match(/Frame\s+=\s+([+-]\d)\s+\/\s+([+-]\d)/))
            hsp.query_frame = m[1]
            hsp.sbjct_frame = m[2]
          elsif (m = header.match(/Frame\s+=\s+([+-]\d)/))
            hsp.query_frame = m[1]
          end

          if (m = header.match(/Strand\s+=\s+(Plus|Minus)\s+\/\s+(Plus|Minus)/))
            hsp.query_frame = m[1] == "Plus" ? 1 : -1
            hsp.sbjct_frame = m[2] == "Plus" ? 1 : -1
          end
        end

        # Consume the Query/midline/Sbjct row triplets that follow an HSP
        # header, appending the sequences to hsp and recording coordinates.
        # A malformed row is reported on stderr and ends the alignment.
        def read_alignment_rows(hsp, buffer)
          query_to = nil
          sbjct_to = nil
          # escape interpolated text so names with regex metacharacters are safe
          blast_type_pattern = /#{Regexp.escape(@blast_type.to_s)}/
          database_pattern = /\s+Database:\s+#{Regexp.escape(@query.database.to_s)}/

          while (line = buffer.shift)
            next unless line =~ /Query/

            m_line = buffer.shift
            s_line = buffer.shift

            break if line =~ blast_type_pattern # end of hsps so exit
            break if line =~ database_pattern   # end of report so exit

            # process query line
            q_row = /^Query:?\s+(\d+)\s*(.+?)\s+(\d+)$/.match(line)
            unless q_row
              $stderr.puts "Query line is malformed - skipping alignment"
              $stderr.puts line
              break
            end
            hsp.query_from = q_row[1].to_i if hsp.query_from.nil?
            hsp.query_sequence += q_row[2]
            query_to = q_row[3].to_i

            # process mid line: drop the label/coordinate columns and the
            # line ending, then pad so the columns stay aligned with the
            # query segment even if trailing blanks were trimmed
            leader = line[/^Query:?\s+\d+\s*/].length
            mid = m_line.to_s.chomp
            hsp.midline += (mid[leader, mid.length] || '').ljust(q_row[2].length)

            # process sbjct line
            s_row = /^Sbjct:?\s+(\d+)\s+(.+?)\s+(\d+)$/.match(s_line.to_s)
            unless s_row
              $stderr.puts "Sbjct line is malformed - skipping alignment"
              $stderr.puts s_line
              break
            end
            hsp.sbjct_from = s_row[1].to_i if hsp.sbjct_from.nil?
            hsp.sbjct_sequence += s_row[2]
            sbjct_to = s_row[3].to_i
          end

          hsp.query_to = query_to
          hsp.sbjct_to = sbjct_to
        end
      end # end of MgNu::Parser::Blast::Format0 class
    end # end of MgNu::Parser::Blast class
  end # end of MgNu::Parser module
end # end of MgNu module
@@ -0,0 +1,121 @@
1
+ # require 'xml/libxml'
2
+ require 'ox'
3
+ require 'mgnu/parser/blast/query'
4
+ require 'mgnu/parser/blast/sbjct'
5
+ require 'mgnu/parser/blast/hsp'
6
+
7
+ module MgNu
8
+ module Parser
9
+ class Blast
10
+ class Format7 < ::Ox::Sax
11
+
12
+ attr_accessor :queries
13
+
14
+ # create a new Format7 parser
15
# Start with an empty handler: nothing has been collected yet, no element
# has been seen, and no query, subject or HSP is under construction.
def initialize
  @queries = []
  @current_element = nil
  @hsp = nil
  @sbjct = nil
  @query = nil
end
22
+
23
# SAX callback for an opening tag.
#
# Remembers the element name so that #text knows which field the following
# character data belongs to, and makes sure the record for an Iteration
# (Query), Hit (Sbjct) or Hsp exists and points at its parents.
def start_element(element)
  # consulted later while character data is being handled
  @current_element = element

  if element == :Iteration
    # a new query begins
    @query = Query.new if @query.nil?
  elsif element == :Hit
    # a new subject begins; tie it to the query being built
    @sbjct = Sbjct.new if @sbjct.nil?
    @sbjct.query = @query
  elsif element == :Hsp
    # a new hsp begins; tie it to the subject and query being built
    @hsp = Hsp.new if @hsp.nil?
    @hsp.sbjct = @sbjct
    @hsp.query = @query
  end
end
42
+
43
# SAX callback for character data.
#
# Stores the text in the field that corresponds to the most recently opened
# element (see @current_element). Numeric fields are converted; id,
# definition, accession and sequence fields are appended to, so text that
# arrives in several pieces is concatenated. Text for elements that are not
# listed is ignored.
#
# @param characters [String] text content reported by the SAX parser
def text(characters = '')
  # Blank text is skipped entirely. The whole string is tested, so text
  # that merely contains a blank line is still kept.
  return if characters.strip.empty?

  case @current_element
  when :"Iteration_iter-num"
    @query.number = characters.to_i
  when :"Iteration_query-ID"
    @query.query_id += characters
  when :"Iteration_query-def"
    @query.definition += characters
  when :"Iteration_query-len"
    @query.length = characters.to_i

  when :Hit_num
    @sbjct.number = characters.to_i
  when :Hit_id
    @sbjct.sbjct_id += characters
  when :Hit_def
    @sbjct.definition += characters
  when :Hit_accession
    @sbjct.accession += characters
  when :Hit_len
    @sbjct.length = characters.to_i

  when :Hsp_num
    @hsp.number = characters.to_i
  when :"Hsp_bit-score"
    # bit scores are fractional; to_i would truncate them
    @hsp.bit_score = characters.to_f
  when :Hsp_score
    @hsp.score = characters.to_i
  when :Hsp_evalue
    @hsp.evalue = characters.to_f
  when :"Hsp_query-from"
    @hsp.query_from = characters.to_i
  when :"Hsp_query-to"
    @hsp.query_to = characters.to_i
  when :"Hsp_hit-from"
    @hsp.sbjct_from = characters.to_i
  when :"Hsp_hit-to"
    @hsp.sbjct_to = characters.to_i
  when :"Hsp_query-frame"
    @hsp.query_frame = characters.to_i
  when :"Hsp_hit-frame"
    @hsp.sbjct_frame = characters.to_i
  when :Hsp_identity
    @hsp.identity = characters.to_i
  when :Hsp_positive
    @hsp.positive = characters.to_i
  when :Hsp_gaps
    # the text-report parser fills gap_count too
    @hsp.gap_count = characters.to_i
  when :"Hsp_align-len"
    @hsp.length = characters.to_i
  when :Hsp_qseq
    @hsp.query_sequence += characters
  when :Hsp_hseq
    @hsp.sbjct_sequence += characters
  when :Hsp_midline
    @hsp.midline += characters
  end
end
100
+
101
# SAX callback for a closing tag.
#
# When an Iteration, Hit or Hsp closes, the finished record is filed with
# its parent collection and a fresh, empty record takes its place for the
# next element of that kind. Every other element is ignored.
def end_element(element)
  if element == :Iteration
    # the query is complete
    @queries.push(@query)
    @query = Query.new
  elsif element == :Hit
    # the subject is complete
    @query.sbjcts.push(@sbjct)
    @sbjct = Sbjct.new
  elsif element == :Hsp
    # the hsp is complete
    @sbjct.hsps.push(@hsp)
    @hsp = Hsp.new
  end
end
117
+
118
+ end # end of MgNu::Parser::Blast::Format7 class
119
+ end # end of MgNu::Parser::Blast class
120
+ end # end of MgNu::Parser module
121
+ end # end of MgNu module