mgnu 2.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +0 -0
  3. data/README.md +31 -0
  4. data/Rakefile +33 -0
  5. data/lib/mgnu.rb +9 -0
  6. data/lib/mgnu/alignment.rb +143 -0
  7. data/lib/mgnu/common.rb +68 -0
  8. data/lib/mgnu/genbank.rb +117 -0
  9. data/lib/mgnu/genbank/feature.rb +84 -0
  10. data/lib/mgnu/genbank/location.rb +150 -0
  11. data/lib/mgnu/genbank/qualifier.rb +45 -0
  12. data/lib/mgnu/genbank/reference.rb +114 -0
  13. data/lib/mgnu/genbank/source.rb +39 -0
  14. data/lib/mgnu/loggable.rb +61 -0
  15. data/lib/mgnu/parser.rb +50 -0
  16. data/lib/mgnu/parser/blast.rb +87 -0
  17. data/lib/mgnu/parser/blast/format0.rb +290 -0
  18. data/lib/mgnu/parser/blast/format7.rb +121 -0
  19. data/lib/mgnu/parser/blast/format8.rb +120 -0
  20. data/lib/mgnu/parser/blast/hsp.rb +75 -0
  21. data/lib/mgnu/parser/blast/query.rb +45 -0
  22. data/lib/mgnu/parser/blast/sbjct.rb +62 -0
  23. data/lib/mgnu/parser/clustalw.rb +72 -0
  24. data/lib/mgnu/parser/fasta.rb +61 -0
  25. data/lib/mgnu/parser/fasta_header_index.rb +39 -0
  26. data/lib/mgnu/parser/fasta_index.rb +57 -0
  27. data/lib/mgnu/parser/fastq.rb +61 -0
  28. data/lib/mgnu/parser/genbank.rb +187 -0
  29. data/lib/mgnu/parser/gff.rb +56 -0
  30. data/lib/mgnu/parser/iprscan/hit.rb +76 -0
  31. data/lib/mgnu/parser/iprscan_file.rb +39 -0
  32. data/lib/mgnu/parser/kegg_ontology_index.rb +163 -0
  33. data/lib/mgnu/parser/pilercr.rb +102 -0
  34. data/lib/mgnu/parser/prodigal.rb +170 -0
  35. data/lib/mgnu/parser/sam.rb +115 -0
  36. data/lib/mgnu/parser/sam/alignment.rb +22 -0
  37. data/lib/mgnu/parser/sam/header.rb +23 -0
  38. data/lib/mgnu/parser/sam/pair.rb +18 -0
  39. data/lib/mgnu/sequence.rb +207 -0
  40. data/lib/mgnu/sequence/fasta.rb +79 -0
  41. data/lib/mgnu/sequence/fastq.rb +43 -0
  42. data/lib/mgnu/version.rb +16 -0
  43. data/mgnu.gemspec +39 -0
  44. data/spec/mgnu/parser/blast_format0_spec.rb +114 -0
  45. data/spec/mgnu/parser/blast_format7_spec.rb +24 -0
  46. data/spec/mgnu/parser/blast_format8_spec.rb +26 -0
  47. data/spec/mgnu/parser/blast_multihsp_spec.rb +100 -0
  48. data/spec/mgnu/parser/blast_oof_spec.rb +53 -0
  49. data/spec/mgnu/parser/clustalw_spec.rb +90 -0
  50. data/spec/mgnu/parser/fasta_header_index_tc_parser_spec.rb +25 -0
  51. data/spec/mgnu/parser/fasta_index_tc_parser_spec.rb +25 -0
  52. data/spec/mgnu/parser/fasta_parser_spec.rb +53 -0
  53. data/spec/mgnu/parser_spec.rb +22 -0
  54. data/spec/mgnu/sequence/fasta_spec.rb +60 -0
  55. data/spec/mgnu/sequence/fastq_spec.rb +31 -0
  56. data/spec/mgnu/sequence_spec.rb +81 -0
  57. data/spec/mgnu_spec.rb +7 -0
  58. data/spec/spec_helper.rb +53 -0
  59. metadata +376 -0
@@ -0,0 +1,120 @@
1
+ require 'mgnu/parser/blast/query'
2
+ require 'mgnu/parser/blast/sbjct'
3
+ require 'mgnu/parser/blast/hsp'
4
+
5
+
6
module MgNu
  module Parser
    class Blast
      # Parser for line-oriented BLAST hit tables ("format 8"): one HSP per
      # line, laid out as query id, subject id and then the ten numeric
      # columns consumed by #extract_hsp. Lines starting with '#' are
      # comments; blank lines are ignored.
      #
      # Consecutive lines sharing a query id are grouped into one Query and,
      # within a query, consecutive lines sharing a subject id are grouped
      # into one Sbjct.
      class Format8
        include Enumerable

        # Queries accumulated by #parse.
        attr_accessor :queries

        # create a new Format8 parser object
        #
        # @param input [#each] any object whose #each yields the lines of the
        #   report as Strings (an IO, an Array of lines, ...)
        def initialize(input)
          @query = nil
          @sbjct = nil
          @queries = []

          @input = input
        end

        # Streams the report, yielding each Query as soon as it is complete.
        # Fields are split on tabs.
        #
        # The last query of the input is only known to be complete once the
        # input is exhausted, so it is yielded after the read loop.
        #
        # @yield [Query] every query found in the input, in input order
        # @return [Enumerator] when called without a block
        def each
          return enum_for(:each) unless block_given?

          @input.each do |line|
            next if skip_line?(line)

            finished = add_hit(line.chomp.split(/\t/))
            yield finished if finished
          end

          last = finish_query
          yield last if last
          self
        end

        # Reads the whole report into #queries. Fields are split on any run
        # of whitespace.
        #
        # @return [Array<Query>] the accumulated queries
        def parse
          @input.each do |line|
            next if skip_line?(line)

            finished = add_hit(line.split)
            @queries << finished if finished
          end # end of input.each do |line|

          # grab the last ones, if present
          last = finish_query
          @queries << last if last
          @queries
        end

        # Consumes the subject id from the front of +input+ and files the
        # remaining columns as an HSP under that subject. A change of subject
        # id stores the previous subject on the current query first.
        #
        # @param input [Array<String>] columns of one line, query id removed
        def extract_sbjct(input)
          sbjct_id = input.shift
          if @sbjct.nil? || @sbjct.sbjct_id != sbjct_id
            @query.sbjcts << @sbjct unless @sbjct.nil?
            @sbjct = Sbjct.new
            @sbjct.sbjct_id = sbjct_id
          end
          extract_hsp(input)
        end

        # Builds an Hsp from the remaining columns and appends it to the
        # current subject. Missing columns become 0 / 0.0 because nil is
        # converted with to_i / to_f.
        #
        # @param input [Array<String>] columns of one line, ids removed
        def extract_hsp(input)
          hsp = Hsp.new
          hsp.identity = input.shift.to_f
          hsp.length = input.shift.to_i
          hsp.mismatches = input.shift.to_i
          hsp.gap_count = input.shift.to_i
          hsp.query_from = input.shift.to_i
          hsp.query_to = input.shift.to_i
          hsp.sbjct_from = input.shift.to_i
          hsp.sbjct_to = input.shift.to_i
          hsp.evalue = input.shift.to_f
          hsp.bit_score = input.shift.to_f
          @sbjct.hsps << hsp
        end

        private

        # Comment lines and blank lines carry no hit.
        def skip_line?(line)
          line.start_with?('#') || line.strip.empty?
        end

        # Files one line's columns under the current query, starting a new
        # query when the id changes.
        #
        # @return [Query, nil] the query that was just completed, if any
        def add_hit(fields)
          query_id = fields.shift
          finished = nil
          finished = finish_query if @query && @query.query_id != query_id

          if @query.nil?
            @query = Query.new
            @query.query_id = query_id
          end

          extract_sbjct(fields)
          finished
        end

        # Closes the query in progress: stores its pending subject and resets
        # the parser state.
        #
        # @return [Query, nil] the closed query, or nil if none was open
        def finish_query
          return nil if @query.nil?

          finished = @query
          finished.sbjcts << @sbjct unless @sbjct.nil?
          @query = nil
          @sbjct = nil
          finished
        end
      end # end of MgNu::Parser::Blast::Format8 class
    end # end of MgNu::Parser::Blast class
  end # end of MgNu::Parser module
end # end of MgNu module
@@ -0,0 +1,75 @@
1
module MgNu
  module Parser
    class Blast
      # Value holder for one HSP of a BLAST hit: scores, coordinates, frames
      # and the three alignment strings (query, midline, subject).
      class Hsp

        attr_accessor :number, :bit_score, :score, :evalue
        attr_accessor :query_from, :query_to, :sbjct_from, :sbjct_to
        attr_accessor :query_frame, :sbjct_frame, :identity, :positive
        attr_accessor :length, :query_sequence, :sbjct_sequence, :midline
        attr_accessor :gap_count, :mismatches, :sbjct, :query

        # create a new Hsp object; every field starts as nil except the
        # three alignment strings, which start empty
        def initialize
          @number = nil
          @bit_score = nil
          @score = nil
          @evalue = nil
          @query_from = nil
          @query_to = nil
          @sbjct_from = nil
          @sbjct_to = nil
          @query_frame = nil
          @sbjct_frame = nil
          @identity = nil
          @positive = nil
          @length = nil
          @query_sequence = ""
          @sbjct_sequence = ""
          @midline = ""
          @gap_count = nil
          @mismatches = nil
          @sbjct = nil
          @query = nil
        end

        # Maps each frameshift marker in the query alignment string to a
        # frame number.
        #
        # Markers are runs of one or two '/' or '\' characters: '/' => 1,
        # '//' => 2, '\' => -1, '\\' => -2 (any other two-character mix maps
        # to nil). Gaps ('-') and spaces are removed before matching.
        #
        # The key for a marker is m.begin(0) * 3 - 1, added to query_from
        # when query_from <= query_to and subtracted from it otherwise.
        #
        # NOTE(review): relies on Regexp#global_match, which is not part of
        # core Ruby; presumably it is a project extension that yields a
        # MatchData per match - confirm where it is defined.
        #
        # @return [Hash, nil] location => frame, or nil when the query
        #   sequence contains no '/' or '\'
        def query_frameshifts
          return nil unless @query_sequence =~ /(?:\/|\\)/

          shifts = {}
          marker = /[\/\\]{1,2}/
          marker.global_match(@query_sequence.gsub(/[- ]/,'')) do |m|
            offset = m.begin(0) * 3 - 1
            position = if @query_from > @query_to
                         @query_from - offset
                       else
                         offset + @query_from
                       end
            shifts[position] = case m[0]
                               when '/' then 1
                               when '//' then 2
                               when '\\' then -1
                               when '\\\\' then -2
                               end
          end # end marker.global_match
          shifts
        end # end query_frameshifts

      end # end of MgNu::Parser::Blast::Hsp class
    end # end of MgNu::Parser::Blast class

  end # end of MgNu::Parser module
end # end of MgNu module
@@ -0,0 +1,45 @@
1
module MgNu
  module Parser
    class Blast
      # One query of a BLAST report together with the subjects (hits) found
      # for it and the database statistics recorded alongside it.
      class Query

        attr_accessor :number, :query_id, :definition, :length, :sbjcts
        attr_accessor :database, :database_sequence_count, :database_total_letters

        # create a new Query object
        def initialize
          @number = nil
          @query_id = ""
          @definition = ""
          @length = nil
          @sbjcts = []
          @best_hit = nil
          @database = nil
          @database_sequence_count = 0
          @database_total_letters = 0
        end

        # Returns the subject with the lowest evalue. The result is cached in
        # @best_hit once one has been found; on ties the earliest subject
        # wins.
        #
        # Subjects that are nil or report no evalue (for example a subject
        # without HSPs) are skipped rather than compared, because comparing
        # against nil raises.
        #
        # @return [MgNu::Parser::Blast::Sbjct, nil] the best hit for this
        #   query, or nil when no subject has an evalue
        def best_hit
          return @best_hit unless @best_hit.nil?

          best = nil
          best_evalue = nil
          @sbjcts.each do |s|
            next if s.nil?

            candidate = s.evalue
            next if candidate.nil?

            if best.nil? || candidate < best_evalue
              best = s
              best_evalue = candidate
            end
          end
          @best_hit = best
        end
      end # end of MgNu::Parser::Blast::Query class
    end # end of MgNu::Parser::Blast class
  end # end of MgNu::Parser module
end # end of MgNu module
@@ -0,0 +1,62 @@
1
module MgNu
  module Parser
    class Blast
      # One subject (hit) of a BLAST query together with its HSPs.
      class Sbjct

        attr_accessor :hsps
        attr_accessor :number, :sbjct_id, :definition, :length
        attr_accessor :accession, :query

        # create a new Sbjct object
        def initialize
          @number = nil
          @sbjct_id = ""
          @definition = ""
          @length = nil
          @accession = ""
          @hsps = []
          @best_hsp = nil
          @query = nil
        end

        # @return [Object, nil] the evalue of the best hsp, or nil when
        #   there is no best hsp
        def evalue
          hsp = best_hsp
          hsp && hsp.evalue
        end # end of Sbjct#evalue

        # @return [Object, nil] the bit_score of the best hsp, or nil when
        #   there is no best hsp
        def bit_score
          hsp = best_hsp
          hsp && hsp.bit_score
        end # end of Sbjct#bit_score

        # @return [Object, nil] the identity of the best hsp, or nil when
        #   there is no best hsp
        def identity
          hsp = best_hsp
          hsp && hsp.identity
        end # end of Sbjct#identity

        # Finds the hsp with the lowest evalue and caches it in @best_hsp
        # once one has been found; on ties the earliest hsp wins.
        #
        # Hsps that are nil or have no evalue are skipped rather than
        # compared, because comparing against nil raises.
        #
        # @return [Object, nil] the best hsp, or nil when no hsp has an
        #   evalue
        def best_hsp
          return @best_hsp unless @best_hsp.nil?

          best = nil
          @hsps.each do |h|
            next if h.nil? || h.evalue.nil?

            best = h if best.nil? || h.evalue < best.evalue
          end
          @best_hsp = best
        end # end of Sbjct#best_hsp
      end # end of MgNu::Parser::Blast::Sbjct class
    end # end of MgNu::Parser::Blast class

  end # end of MgNu::Parser module
end # end of MgNu module
@@ -0,0 +1,72 @@
1
module MgNu
  module Parser
    # ClustalW is the class used for parsing clustalw multiple alignment output.
    class ClustalW
      attr_accessor :buffer, :raw
      attr_reader :file, :alignment

      # Reads the alignment text and parses it immediately; the result is
      # available from #alignment.
      #
      # @param input [String] path of an alignment file (*.aln), or the
      #   alignment text itself when +file+ is false
      # @param file [Boolean] is +input+ a file name (default), or a string
      #   holding the alignment?
      # @raise [ArgumentError] when the file is missing or unreadable, or
      #   when the text has no alignment section after the header
      def initialize(input = nil, file = true)
        if input
          @raw = file ? read_alignment_file(input) : input
          # sections are separated by blank lines; the first one is the header
          @buffer = @raw.split(/\r?\n\r?\n/)
          @alignment = nil
          if @buffer.length < 2
            source = file ? "file #{input}" : "string"
            raise ArgumentError, "ClustalW alignment #{source} did not parse!"
          end
          parse
        else
          # NOTE(review): `error` is not defined in this class; presumably it
          # comes from a logging mixin defined elsewhere - confirm.
          error("MgNu::Parser::ClustalW.new(): need an existing file")
        end
      end

      # process the input multiple alignment
      #
      # Consumes @buffer: drops the header section, strips trailing residue
      # counts (present when the alignment was run with "-SEQNOS=on"),
      # discards the last line of every section (the match line) and joins
      # the per-section fragments of each sequence in order of first
      # appearance. Calling it again returns the alignment already built.
      #
      # @return [MgNu::Alignment]
      def parse
        return @alignment unless @alignment.nil?

        @buffer.shift # header section
        @buffer[0].gsub!(/^(\r?\n)+/, '') # drop newline at start of section
        @buffer.collect! { |section| section.split(/\r?\n/) }

        @buffer.each do |section|
          # drop numbers if the alignment was run with "-SEQNOS=on"
          section.each { |line| line.sub!(/\s+\d+\s*$/, '') }
          section.pop # last line of a section is the match line
        end

        # get the 1st position of a space from the right using
        # rindex. Increment this by 1 to get the seq_start
        seq_start = (@buffer[0][0].rindex(/\s/) || -1) + 1

        # seqname => sequence, plus the names in order of first appearance
        order = []
        h = {}
        @buffer.each do |section|
          section.each do |line|
            name = line[0, seq_start].sub(/\s+\z/, '')
            # a line shorter than seq_start contributes no residues
            sequence = line[seq_start..-1].to_s
            if h.has_key?(name)
              h[name] += sequence
            else
              order << name
              h[name] = sequence
            end
          end
        end

        @alignment = MgNu::Alignment.new(h, order)
      end # end of #parse method

      private

      # Returns the content of +path+, failing with a clear message when the
      # file cannot be read.
      def read_alignment_file(path)
        unless File.exist?(path) && File.readable?(path)
          raise ArgumentError,
                "MgNu::Parser::ClustalW.new(): cannot read alignment file #{path}"
        end
        File.read(path)
      end
    end # end of MgNu::Parser::ClustalW class
  end # end of MgNu::Parser module
end # end of MgNu module
@@ -0,0 +1,61 @@
1
module MgNu
  module Parser
    # Streaming reader for FASTA files (and FASTA-style quality files).
    class Fasta
      include Enumerable

      attr_reader :file, :filename

      # create a new Fasta parser
      #
      # @param filename [String] path of the file to read
      # @param quality_file [Boolean] when true the lines of a record are
      #   joined with a single space instead of being concatenated
      # @raise [RuntimeError] when +filename+ does not exist or is unreadable
      def initialize(filename = nil, quality_file = false)
        @quality_file = quality_file
        @filename = filename
        if filename
          unless File.exist?(filename) && File.readable?(filename)
            raise "\n\n -- No file by that name (#{filename}). Exiting\n\n"
          end
          @file = File.open(filename)
        else
          # NOTE(review): `error` is not defined in this class; presumably it
          # comes from a logging mixin defined elsewhere - confirm.
          error("MgNu::Parser::Fasta.new(): need a filename or an existing file")
        end
      end

      # override enumerables
      #
      # Yields one MgNu::Sequence::Fasta per record. The file handle is
      # closed when iteration ends, including when the caller breaks out
      # early, and is reopened if #each is called again.
      #
      # @yield [MgNu::Sequence::Fasta] each record, in file order
      # @return [Enumerator] when called without a block
      def each
        return enum_for(:each) unless block_given?

        @file = File.open(@filename) if @file.closed?
        @buffer = [] # temp storage: header first, then the record's lines
        begin
          @file.each_line do |line|
            line.chomp!
            if line =~ />(.*)/ # got a header line
              header = $1
              yield build_record if @buffer.length > 0
              @buffer = [header]
            else # got a sequence line
              @buffer << line
            end
          end # end of file io
        ensure
          @file.close
        end

        # don't forget to yield the last one
        yield build_record if @buffer.length > 0
      end # end of #each

      private

      # Turns the buffered lines into a record: the first element is the
      # header, the rest are joined with " " for quality files and with ""
      # otherwise.
      def build_record
        header = @buffer.shift
        separator = @quality_file ? " " : ""
        MgNu::Sequence::Fasta.new(:header => header,
                                  :sequence => @buffer.join(separator))
      end
    end # end of MgNu::Parser::Fasta class
  end # end of MgNu::File module
end # end of MgNu module
@@ -0,0 +1,39 @@
1
+ require 'moneta'
2
module MgNu
  module Parser
    # Key/value index from FASTA header names to header descriptions, kept
    # in a Moneta store opened with the :TokyoCabinet adapter.
    class FastaHeaderIndex
      attr_reader :filename, :db_name, :db

      # Opens the index for +filename+, building it first when the index
      # file does not exist yet.
      #
      # @param filename [String] the FASTA file to index, or the name of the
      #   index itself (anything ending in ".hdr.tch")
      def initialize(filename)
        @filename = filename
        if filename =~ /^.+\.hdr\.tch$/
          @db_name = @filename
        else
          @db_name = @filename + ".hdr.tch"
        end

        # decide before opening the store; the original code opened it
        # identically whether or not the file was already there
        build_needed = !File.exist?(@db_name)
        @db = Moneta.new(:TokyoCabinet, file: @db_name)
        parse if build_needed
      end

      # setup parse method for creating tokyo cabinet
      #
      # Stores header_name => header_description for every record of the
      # FASTA file.
      def parse
        MgNu::Parser::Fasta.new(fasta_filename).each do |f|
          @db[f.header_name] = f.header_description
        end
      end # end of #parse

      # @param name [String] header name to look up
      # @return [Object, nil] the stored description, or nil when the store
      #   holds nothing (or a false value) for +name+
      def [](name)
        @db[name] || nil
      end

      # Closes the underlying store, if one was opened.
      def close
        @db.close unless @db.nil?
      end

      private

      # The FASTA file to read when building the index. When the caller
      # passed the index name itself, the ".hdr.tch" suffix is removed so
      # that the index file is not read as if it were FASTA.
      def fasta_filename
        @filename == @db_name ? @filename.sub(/\.hdr\.tch\z/, '') : @filename
      end
    end # end of MgNu::Parser::FastaHeaderIndex class
  end # end of MgNu::Parser module
end # end of MgNu module