mgnu 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +0 -0
  3. data/README.md +31 -0
  4. data/Rakefile +33 -0
  5. data/lib/mgnu.rb +9 -0
  6. data/lib/mgnu/alignment.rb +143 -0
  7. data/lib/mgnu/common.rb +68 -0
  8. data/lib/mgnu/genbank.rb +117 -0
  9. data/lib/mgnu/genbank/feature.rb +84 -0
  10. data/lib/mgnu/genbank/location.rb +150 -0
  11. data/lib/mgnu/genbank/qualifier.rb +45 -0
  12. data/lib/mgnu/genbank/reference.rb +114 -0
  13. data/lib/mgnu/genbank/source.rb +39 -0
  14. data/lib/mgnu/loggable.rb +61 -0
  15. data/lib/mgnu/parser.rb +50 -0
  16. data/lib/mgnu/parser/blast.rb +87 -0
  17. data/lib/mgnu/parser/blast/format0.rb +290 -0
  18. data/lib/mgnu/parser/blast/format7.rb +121 -0
  19. data/lib/mgnu/parser/blast/format8.rb +120 -0
  20. data/lib/mgnu/parser/blast/hsp.rb +75 -0
  21. data/lib/mgnu/parser/blast/query.rb +45 -0
  22. data/lib/mgnu/parser/blast/sbjct.rb +62 -0
  23. data/lib/mgnu/parser/clustalw.rb +72 -0
  24. data/lib/mgnu/parser/fasta.rb +61 -0
  25. data/lib/mgnu/parser/fasta_header_index.rb +39 -0
  26. data/lib/mgnu/parser/fasta_index.rb +57 -0
  27. data/lib/mgnu/parser/fastq.rb +61 -0
  28. data/lib/mgnu/parser/genbank.rb +187 -0
  29. data/lib/mgnu/parser/gff.rb +56 -0
  30. data/lib/mgnu/parser/iprscan/hit.rb +76 -0
  31. data/lib/mgnu/parser/iprscan_file.rb +39 -0
  32. data/lib/mgnu/parser/kegg_ontology_index.rb +163 -0
  33. data/lib/mgnu/parser/pilercr.rb +102 -0
  34. data/lib/mgnu/parser/prodigal.rb +170 -0
  35. data/lib/mgnu/parser/sam.rb +115 -0
  36. data/lib/mgnu/parser/sam/alignment.rb +22 -0
  37. data/lib/mgnu/parser/sam/header.rb +23 -0
  38. data/lib/mgnu/parser/sam/pair.rb +18 -0
  39. data/lib/mgnu/sequence.rb +207 -0
  40. data/lib/mgnu/sequence/fasta.rb +79 -0
  41. data/lib/mgnu/sequence/fastq.rb +43 -0
  42. data/lib/mgnu/version.rb +16 -0
  43. data/mgnu.gemspec +39 -0
  44. data/spec/mgnu/parser/blast_format0_spec.rb +114 -0
  45. data/spec/mgnu/parser/blast_format7_spec.rb +24 -0
  46. data/spec/mgnu/parser/blast_format8_spec.rb +26 -0
  47. data/spec/mgnu/parser/blast_multihsp_spec.rb +100 -0
  48. data/spec/mgnu/parser/blast_oof_spec.rb +53 -0
  49. data/spec/mgnu/parser/clustalw_spec.rb +90 -0
  50. data/spec/mgnu/parser/fasta_header_index_tc_parser_spec.rb +25 -0
  51. data/spec/mgnu/parser/fasta_index_tc_parser_spec.rb +25 -0
  52. data/spec/mgnu/parser/fasta_parser_spec.rb +53 -0
  53. data/spec/mgnu/parser_spec.rb +22 -0
  54. data/spec/mgnu/sequence/fasta_spec.rb +60 -0
  55. data/spec/mgnu/sequence/fastq_spec.rb +31 -0
  56. data/spec/mgnu/sequence_spec.rb +81 -0
  57. data/spec/mgnu_spec.rb +7 -0
  58. data/spec/spec_helper.rb +53 -0
  59. metadata +376 -0
@@ -0,0 +1,87 @@
1
+ # require 'xml/libxml'
2
+ require 'mgnu/loggable'
3
+ require 'ox'
4
+
5
+ module MgNu
6
+ module Parser
7
+ class Blast
8
+ require_relative 'blast/format7'
9
+ require_relative 'blast/format8'
10
+ require_relative 'blast/format0'
11
+
12
+ include Loggable
13
+ include Enumerable
14
+
15
+ attr_accessor :format, :input
16
+
17
# Create a new BLAST parser.
#
# Exits the process (status 1) after logging through Loggable#error when
# +input+ is missing or when no format was given and none could be guessed.
#
# @param input [String] path of a readable BLAST report, or a String that
#   holds the report text itself
# @param format [Integer, nil] BLAST output format (0, 7 or 8); when nil it
#   is guessed from the file name (contains ".xml" -> 7, ends in "8" -> 8)
# @raise [RuntimeError] if +input+ is neither a readable file nor a String
def initialize(input = nil, format = nil)
  unless input
    error("MgNu::Parser::Blast.new(): needs a filename or a string of Blast data")
    exit(1)
  end

  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  if File.exist?(input) && File.readable?(input)
    @file = File.open(input)
    @input_type = File
  elsif input.is_a?(String)
    # assume a string containing the blast report
    @input_type = String
  else
    raise "\n\n -- No file by that name (#{input}). Exiting\n\n"
  end

  @input = input
  @format = format

  # don't overwrite a format if given one; guessing only works on file names
  if @format.nil? && @input_type == File
    @format = 7 if @input =~ /.*\.xml/
    @format = 8 if @input =~ /.*8$/
  end

  if @format.nil?
    error("Please set the format type!")
    exit(1)
  end

  # tabular and pairwise parsers read either the open file or the raw string
  source = @input_type == File ? @file : @input

  case @format
  when 7
    # Format7 is a SAX handler driven from #parse using @file; string input
    # is not supported for XML, so no parser is built in that case.
    @parser = Format7.new if @input_type == File
  when 8
    @parser = Format8.new(source)
  when 0
    @parser = Format0.new(source)
  end
end
71
+
72
# Run the format-specific parser over the whole input.
#
# Format 7 (XML) is streamed from @file through Ox's SAX parser with the
# Format7 handler; every other format delegates to the parser's own #parse.
#
# @return [Array] the queries collected by the format parser
# @raise [RuntimeError] if no parser was built for this input/format
def parse
  if @parser.nil?
    raise "MgNu::Parser::Blast#parse: no parser available for format #{@format.inspect}"
  end

  if @format == 7
    Ox.sax_parse(@parser, @file)
  else
    @parser.parse
  end
  @parser.queries
end
80
+
81
# Iterate over whatever the format-specific parser yields from its own #each.
#
# @yield items produced by the underlying parser
# @return [Enumerator] when called without a block
def each(&b)
  return enum_for(:each) unless b

  @parser.each(&b)
end
84
+ end # end of MgNu::Parser::Blast class
85
+
86
+ end # end of MgNu::Parser module
87
+ end # end of MgNu module
@@ -0,0 +1,290 @@
1
require 'stringio'

require 'mgnu/parser/blast/query'
require 'mgnu/parser/blast/sbjct'
require 'mgnu/parser/blast/hsp'
4
+
5
+ module MgNu
6
+ module Parser
7
+ class Blast
8
+ class Format0
9
+ include MgNu::Parser
10
+
11
+ attr_accessor :queries, :blast_type
12
+
13
+ # create a new Format0 parser
14
+ def initialize(file)
15
+ @query = nil
16
+ @sbjct = nil
17
+ @sbjct_number = 0
18
+ @blast_type = nil
19
+ @queries = []
20
+
21
+ @file = file
22
+ end
23
+
24
# Parse the input blast file, filling #queries and #blast_type.
#
# The first word of the first line becomes @blast_type. After that,
# parse_until is asked repeatedly for the lines up to the next /^Query=/
# match; every chunk that itself starts with "Query=" is processed.
# Empty input yields no queries instead of raising EOFError.
#
# @return [nil]
def parse
  first_line = @file.gets
  return if first_line.nil?

  @blast_type = first_line.split[0]
  loop do
    buffer = parse_until(@file, /^Query=/)
    break if buffer.nil? || buffer.empty?

    process_buffer(buffer) if buffer[0] =~ /^Query=/
  end
end
36
+
37
# Turn one "Query=" block into a Query object and record it in @queries.
#
# @param [Array] buffer containing a "Query=" block from
# the blast output file
# @return [Boolean] true when a query was extracted, false for an empty buffer
def process_buffer(buffer)
  return false if buffer.empty?

  extract_query(buffer)
  @queries << @query
  true
end
46
+
47
# Build @query from the head of a "Query=" block, then hand the remaining
# lines to extract_dbinfo and, one subject at a time, to extract_sbjct.
#
# Two header layouts are recognised:
# * "Query= id definition (N letters)", all in the first paragraph
# * "Query= id definition" followed later by a "Length=N" line
#
# @param buffer [Array<String>] lines of one "Query=" block; consumed in place
def extract_query(buffer)
  header = String.new
  while (line = buffer.shift)
    break if line =~ /^\s*$/ # empty line, break

    header << line.chomp
  end
  header.gsub!(/\s+/, " ")

  @query = Query.new
  if (m = header.match(/^Query= (.+?) (.*) ?\(([0-9,]+) letters\)/))
    @query.query_id = m[1]
    # the greedy capture keeps the blank before "(N letters)"; drop it so the
    # definition is as clean as the subject definition (which is rstripped)
    @query.definition = m[2].strip
    @query.length = m[3].delete(",").to_i
  else
    # Blast+ output has query id/definition and length on separate lines
    while (line = buffer.shift)
      header << line.chomp
      break if line =~ /Length=/
    end
    if (m = header.match(/^Query=\s([^\s]+)\s?(.*)\s?Length=([,\d]+)/))
      @query.query_id = m[1]
      @query.definition = m[2].strip
      @query.length = m[3].delete(",").to_i
    end
  end
  extract_dbinfo(buffer)

  # split what is left on ">" lines: each run of lines is one subject
  sbjct_buffer = []
  while (line = buffer.shift)
    if line =~ /^>/ && !sbjct_buffer.empty?
      extract_sbjct(sbjct_buffer)
      sbjct_buffer = [line]
    else
      sbjct_buffer << line
    end
  end
  extract_sbjct(sbjct_buffer) unless sbjct_buffer.empty?
end
86
+
87
# Read the paragraph at the front of +buffer+ as the "Database:" summary and
# store name, sequence count and total letters on @query. A mismatch is
# reported on $stderr and leaves those fields unset. Afterwards every line
# before the first subject (">" line) is discarded.
#
# @param buffer [Array<String>] remaining lines of a query block; consumed in place
def extract_dbinfo(buffer)
  summary = String.new
  while (line = buffer.shift)
    break if line =~ /^\s*$/ # empty line, break

    summary << line.chomp
  end
  summary.gsub!(/\s+/, " ")

  # The name capture is lazy so that the optional "\.?" can actually consume a
  # trailing period; with a greedy capture the period always stayed in the name.
  if (m = summary.match(/Database:\s+(.+?)\.?\s+([0-9,]+)\s+sequences;\s+([0-9,]+)\s+total\s+letters/))
    @query.database = m[1]
    @query.database_sequence_count = m[2].delete(",").to_i
    @query.database_total_letters = m[3].delete(",").to_i
  else
    $stderr.puts "extract_dbinfo: Database line mismatch!"
    $stderr.puts "database, database_sequence_count and database_total_letters are not set"
    $stderr.puts summary
  end

  # eat up single-line summary cruft until beginning of subjects
  buffer.shift while buffer.first && buffer.first !~ /^>/
end
114
+
115
# Build @sbjct from one subject section (a ">" header, its "Length" line and
# the HSP lines that follow) and append it to the current query's subjects.
#
# Accepted header layouts: ">id def ... Length = N", "> id def ... Length=N"
# and ">id def ... Length=N". Exits the process (status 1) when the first
# line is not a ">" header.
#
# @param buffer [Array<String>] lines of one subject section; consumed in place
def extract_sbjct(buffer)
  if buffer[0] !~ /^>/
    $stderr.puts "can't process subject buffer - missing fasta header line!"
    exit(1)
  end

  header = String.new
  # read until blank line to get header, but ensure that we already have Length= line
  while (line = buffer.shift)
    break if line =~ /^\s*$/ && header =~ /Length\s*=/

    # join with a blank: a "Length=" line that starts in column 0 would
    # otherwise be glued onto the last word of the definition (or the id)
    header << line.chomp << " "
  end
  header.gsub!(/\s+/, " ") # shrink spaces

  @sbjct = Sbjct.new
  # one pattern for all layouts: optional blank after ">", optional blanks
  # around "=", thousands separators tolerated in the length
  if (m = header.match(/>\s?(\S+)\s+(.*)Length\s*=\s*([\d,]+)/m))
    @sbjct.number = @query.sbjcts.length + 1
    @sbjct.sbjct_id = m[1]
    @sbjct.definition = m[2].rstrip
    @sbjct.length = m[3].delete(",").to_i
    @sbjct.query = @query
  end

  hsp_buffer = []
  while (line = buffer.shift)
    if line =~ /^>/ && !hsp_buffer.empty?
      # start of the next subject: flush and leave that line for the caller
      extract_all_hsps(hsp_buffer)
      hsp_buffer.clear
      buffer.unshift(line)
      break
    else
      hsp_buffer << line
    end
  end
  extract_all_hsps(hsp_buffer) unless hsp_buffer.empty?

  @query.sbjcts << @sbjct
end
153
+
154
# create Hsp objects from the complete alignment section
#
# Leading blank lines are skipped. The section is then cut at every
# " Score =" line and each piece is passed to process_hsp. Exits the process
# (status 1) if the first non-blank line is not a " Score =" line.
#
# @param [Array] buffer containing all the lines from the
# alignment section; consumed in place
def extract_all_hsps(buffer)
  # tolerate extra blank lines between the subject header and the first HSP
  buffer.shift while buffer.first && buffer.first =~ /^\s*$/

  unless buffer[0] =~ /^\s+Score =/
    $stderr.puts "can't process HSP buffer - missing Score = line!"
    exit(1)
  end

  hsp_lines = []
  while (line = buffer.shift)
    if line =~ /^\s+Score =/ && !hsp_lines.empty?
      process_hsp(hsp_lines)
      hsp_lines = [line]
    else
      hsp_lines << line
    end
  end
  process_hsp(hsp_lines) unless hsp_lines.empty?
end
176
+
177
# Build one Hsp from the lines of a single HSP and append it to @sbjct.hsps.
#
# The first paragraph (up to a blank line) is the statistics header: score,
# E-value, identities, positives, gaps, frame/strand. The rest is read as
# Query / midline / Sbjct row triples whose sequences are concatenated.
# Exits the process (status 1) if the buffer does not start with " Score =".
#
# NOTE(review): identity stores the percentage from the Identities line while
# positive stores the raw count; this is kept exactly as before - confirm intent.
#
# @param buffer [Array<String>] lines of one HSP; consumed in place
def process_hsp(buffer)
  unless buffer[0] =~ /^\s+Score =/
    $stderr.puts "can't process HSP buffer - missing Score = line!"
    exit(1)
  end

  header = String.new
  # read until blank line to get header
  while (line = buffer.shift)
    break if line =~ /^\s*$/

    header << line.chomp
  end

  hsp = Hsp.new
  if (m = header.match(/ Score =\s+(\d+(?:\.\d+)?)\s+bits\s+\((\d+)\)/))
    hsp.bit_score = m[1].to_f
    hsp.score = m[2].to_i
  end

  # E-values appear as "0.002", "3e-41", plain integers such as "11", or
  # with the mantissa omitted ("e-100"), which String#to_f would read as 0.0
  if (m = header.match(/Expect(?:\(\d+\))?\s*=\s*(\d[\d.]*(?:e[-+]?\d+)?|e[-+]?\d+)/))
    evalue = m[1]
    evalue = "1#{evalue}" if evalue.start_with?("e")
    hsp.evalue = evalue.to_f
  end

  if (m = header.match(/Identities\s+=\s+(\d+)\/(\d+)\s+\((\d+%)\)/))
    hsp.length = m[2].to_i
    hsp.identity = m[3].to_i
  end

  if (m = header.match(/Positives\s+=\s+(\d+)\/(\d+)\s+\((\d+)%\)/))
    hsp.positive = m[1].to_i
  end

  if (m = header.match(/Gaps\s+=\s+(\d+)\/(\d+)\s+\((\d+%)\)/))
    hsp.gap_count = m[1].to_i
  end

  # the two-frame form has to be tried first: the single-frame pattern also
  # matches it, which previously made the two-frame branch unreachable
  if (m = header.match(/Frame\s+=\s+([+-]\d)\s+\/\s+([+-]\d)/))
    hsp.query_frame = m[1]
    hsp.sbjct_frame = m[2]
  elsif (m = header.match(/Frame\s+=\s+([+-]\d)/))
    hsp.query_frame = m[1]
  end

  if (m = header.match(/Strand\s+=\s+(Plus|Minus)\s+\/\s+(Plus|Minus)/))
    hsp.query_frame = m[1] == "Plus" ? 1 : -1
    hsp.sbjct_frame = m[2] == "Plus" ? 1 : -1
  end

  # escape interpolated text so names such as "c++ (test)" cannot change or
  # break the patterns
  blast_type_re = Regexp.new(Regexp.escape(@blast_type.to_s))
  database_re = /\s+Database:\s+#{Regexp.escape(@query.database.to_s)}/

  # read remaining buffer lines for the alignment
  query_to = nil
  sbjct_to = nil
  while (line = buffer.shift)
    next unless line =~ /Query/

    q_line = line
    m_line = buffer.shift
    s_line = buffer.shift

    break if q_line =~ blast_type_re # end of hsps so exit
    break if q_line =~ database_re   # end of report so exit

    # validate the whole row triple before touching the hsp, so a broken
    # row never leaves query and sbjct sequences out of step
    q_row = q_line =~ /^Query/ ? q_line.match(/^Query:?\s+(\d+)\s*(.+?)\s+(\d+)$/) : nil
    unless q_row
      $stderr.puts "Query line is malformed - skipping alignment"
      $stderr.puts q_line
      break
    end

    s_row = s_line =~ /^Sbjct/ ? s_line.match(/^Sbjct:?\s+(\d+)\s+(.+?)\s+(\d+)$/) : nil
    unless s_row
      $stderr.puts "Sbjct line is malformed - skipping alignment"
      $stderr.puts s_line
      break
    end

    # process query line
    hsp.query_from = q_row[1].to_i if hsp.query_from.nil?
    hsp.query_sequence += q_row[2]
    query_to = q_row[3].to_i

    # process mid line: cut off the "Query: NNN  " leader, drop the line
    # terminator and pad to the row width in case trailing blanks were lost
    leader = q_line[/^Query:?\s+\d+\s*/].length
    mid = m_line.to_s.chomp[leader..-1].to_s
    hsp.midline += mid.ljust(q_row[2].length)

    # process sbjct line
    hsp.sbjct_from = s_row[1].to_i if hsp.sbjct_from.nil?
    hsp.sbjct_sequence += s_row[2]
    sbjct_to = s_row[3].to_i
  end

  hsp.query_to = query_to
  hsp.sbjct_to = sbjct_to
  @sbjct.hsps << hsp # add this hsp to the sbjct
end
287
+ end # end of MgNu::Parser::Blast::Format0 class
288
+ end # end of MgNu::Parser::Blast class
289
+ end # end of MgNu::Parser module
290
+ end # end of MgNu module
@@ -0,0 +1,121 @@
1
+ # require 'xml/libxml'
2
+ require 'ox'
3
+ require 'mgnu/parser/blast/query'
4
+ require 'mgnu/parser/blast/sbjct'
5
+ require 'mgnu/parser/blast/hsp'
6
+
7
+ module MgNu
8
+ module Parser
9
+ class Blast
10
+ class Format7 < ::Ox::Sax
11
+
12
+ attr_accessor :queries
13
+
14
# Create a new Format7 (BLAST XML) SAX handler with empty parse state.
#
# @query, @sbjct and @hsp hold the objects currently being filled,
# @current_element is the name of the most recently opened element, and
# @queries collects the finished queries.
def initialize
  super()
  @query = nil
  @sbjct = nil
  @hsp = nil
  @current_element = nil
  @queries = []
end
22
+
23
# SAX callback invoked when an element opens.
#
# Remembers the element name for the following #text calls and makes sure a
# Query, Sbjct or Hsp object exists when an Iteration, Hit or Hsp element
# begins, linking each new level to the objects above it.
#
# @param element [Symbol] name of the element being opened
def start_element(element)
  # set the current element - used during character parsing
  @current_element = element

  if element == :Iteration
    # start a new Query
    @query ||= Query.new
  elsif element == :Hit
    # start a new Sbjct
    @sbjct ||= Sbjct.new
    @sbjct.query = @query
  elsif element == :Hsp
    # start a new Hsp
    @hsp ||= Hsp.new
    @hsp.sbjct = @sbjct
    @hsp.query = @query
  end
end
42
+
43
# SAX callback invoked with character data; stores it on the object that
# belongs to the element recorded by #start_element.
#
# String fields are appended to, numeric fields are overwritten. Empty or
# whitespace-only data is ignored so that it can never reset a numeric field.
#
# @param characters [String] text content delivered by the SAX parser
def text(characters = '')
  # \A..\z instead of ^..$: a multi-line chunk that merely contains a blank
  # line must not be discarded, while "" must be (it would turn into 0)
  return if characters =~ /\A\s*\z/

  case @current_element
  when :"Iteration_iter-num"
    @query.number = characters.to_i
  when :"Iteration_query-ID"
    @query.query_id += characters
  when :"Iteration_query-def"
    @query.definition += characters
  when :"Iteration_query-len"
    @query.length = characters.to_i

  when :Hit_num
    @sbjct.number = characters.to_i
  when :Hit_id
    @sbjct.sbjct_id += characters
  when :Hit_def
    @sbjct.definition += characters
  when :Hit_accession
    @sbjct.accession += characters
  when :Hit_len
    @sbjct.length = characters.to_i

  when :Hsp_num
    @hsp.number = characters.to_i
  when :"Hsp_bit-score"
    # bit scores are fractional; to_i silently truncated them
    @hsp.bit_score = characters.to_f
  when :Hsp_score
    @hsp.score = characters.to_i
  when :Hsp_evalue
    @hsp.evalue = characters.to_f
  when :"Hsp_query-from"
    @hsp.query_from = characters.to_i
  when :"Hsp_query-to"
    @hsp.query_to = characters.to_i
  when :"Hsp_hit-from"
    @hsp.sbjct_from = characters.to_i
  when :"Hsp_hit-to"
    @hsp.sbjct_to = characters.to_i
  when :"Hsp_query-frame"
    @hsp.query_frame = characters.to_i
  when :"Hsp_hit-frame"
    @hsp.sbjct_frame = characters.to_i
  when :Hsp_identity
    @hsp.identity = characters.to_i
  when :Hsp_positive
    @hsp.positive = characters.to_i
  when :Hsp_gaps
    # same attribute the pairwise (format 0) parser fills from "Gaps ="
    @hsp.gap_count = characters.to_i
  when :"Hsp_align-len"
    @hsp.length = characters.to_i
  when :Hsp_qseq
    @hsp.query_sequence += characters
  when :Hsp_hseq
    @hsp.sbjct_sequence += characters
  when :Hsp_midline
    @hsp.midline += characters
  end
end
100
+
101
# SAX callback invoked when an element closes.
#
# A finished Iteration, Hit or Hsp is stored on its parent (@queries,
# @query.sbjcts, @sbjct.hsps) and replaced by a fresh object for the next
# one. The remembered element name is cleared so that character data
# arriving after the closing tag cannot be added to the field just closed.
#
# @param element [Symbol] name of the element being closed
def end_element(element)
  case element
  when :Iteration
    # end of a query
    @queries << @query
    @query = Query.new
  when :Hit
    # end of a sbjct
    @query.sbjcts << @sbjct
    @sbjct = Sbjct.new
  when :Hsp
    # end of a hsp
    @sbjct.hsps << @hsp
    @hsp = Hsp.new
  end

  @current_element = nil
end
117
+
118
+ end # end of MgNu::Parser::Blast::Format7 class
119
+ end # end of MgNu::Parser::Blast class
120
+ end # end of MgNu::Parser module
121
+ end # end of MgNu module