mgnu 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +0 -0
  3. data/README.md +31 -0
  4. data/Rakefile +33 -0
  5. data/lib/mgnu.rb +9 -0
  6. data/lib/mgnu/alignment.rb +143 -0
  7. data/lib/mgnu/common.rb +68 -0
  8. data/lib/mgnu/genbank.rb +117 -0
  9. data/lib/mgnu/genbank/feature.rb +84 -0
  10. data/lib/mgnu/genbank/location.rb +150 -0
  11. data/lib/mgnu/genbank/qualifier.rb +45 -0
  12. data/lib/mgnu/genbank/reference.rb +114 -0
  13. data/lib/mgnu/genbank/source.rb +39 -0
  14. data/lib/mgnu/loggable.rb +61 -0
  15. data/lib/mgnu/parser.rb +50 -0
  16. data/lib/mgnu/parser/blast.rb +87 -0
  17. data/lib/mgnu/parser/blast/format0.rb +290 -0
  18. data/lib/mgnu/parser/blast/format7.rb +121 -0
  19. data/lib/mgnu/parser/blast/format8.rb +120 -0
  20. data/lib/mgnu/parser/blast/hsp.rb +75 -0
  21. data/lib/mgnu/parser/blast/query.rb +45 -0
  22. data/lib/mgnu/parser/blast/sbjct.rb +62 -0
  23. data/lib/mgnu/parser/clustalw.rb +72 -0
  24. data/lib/mgnu/parser/fasta.rb +61 -0
  25. data/lib/mgnu/parser/fasta_header_index.rb +39 -0
  26. data/lib/mgnu/parser/fasta_index.rb +57 -0
  27. data/lib/mgnu/parser/fastq.rb +61 -0
  28. data/lib/mgnu/parser/genbank.rb +187 -0
  29. data/lib/mgnu/parser/gff.rb +56 -0
  30. data/lib/mgnu/parser/iprscan/hit.rb +76 -0
  31. data/lib/mgnu/parser/iprscan_file.rb +39 -0
  32. data/lib/mgnu/parser/kegg_ontology_index.rb +163 -0
  33. data/lib/mgnu/parser/pilercr.rb +102 -0
  34. data/lib/mgnu/parser/prodigal.rb +170 -0
  35. data/lib/mgnu/parser/sam.rb +115 -0
  36. data/lib/mgnu/parser/sam/alignment.rb +22 -0
  37. data/lib/mgnu/parser/sam/header.rb +23 -0
  38. data/lib/mgnu/parser/sam/pair.rb +18 -0
  39. data/lib/mgnu/sequence.rb +207 -0
  40. data/lib/mgnu/sequence/fasta.rb +79 -0
  41. data/lib/mgnu/sequence/fastq.rb +43 -0
  42. data/lib/mgnu/version.rb +16 -0
  43. data/mgnu.gemspec +39 -0
  44. data/spec/mgnu/parser/blast_format0_spec.rb +114 -0
  45. data/spec/mgnu/parser/blast_format7_spec.rb +24 -0
  46. data/spec/mgnu/parser/blast_format8_spec.rb +26 -0
  47. data/spec/mgnu/parser/blast_multihsp_spec.rb +100 -0
  48. data/spec/mgnu/parser/blast_oof_spec.rb +53 -0
  49. data/spec/mgnu/parser/clustalw_spec.rb +90 -0
  50. data/spec/mgnu/parser/fasta_header_index_tc_parser_spec.rb +25 -0
  51. data/spec/mgnu/parser/fasta_index_tc_parser_spec.rb +25 -0
  52. data/spec/mgnu/parser/fasta_parser_spec.rb +53 -0
  53. data/spec/mgnu/parser_spec.rb +22 -0
  54. data/spec/mgnu/sequence/fasta_spec.rb +60 -0
  55. data/spec/mgnu/sequence/fastq_spec.rb +31 -0
  56. data/spec/mgnu/sequence_spec.rb +81 -0
  57. data/spec/mgnu_spec.rb +7 -0
  58. data/spec/spec_helper.rb +53 -0
  59. metadata +376 -0
@@ -0,0 +1,120 @@
1
+ require 'mgnu/parser/blast/query'
2
+ require 'mgnu/parser/blast/sbjct'
3
+ require 'mgnu/parser/blast/hsp'
4
+
5
+
6
module MgNu
  module Parser
    class Blast
      # Parser for BLAST tabular output (one HSP per line, '#' comment lines
      # allowed). Each data line is read as:
      #
      #   query_id, sbjct_id, identity, length, mismatches, gap_count,
      #   query_from, query_to, sbjct_from, sbjct_to, evalue, bit_score
      #
      # Consecutive lines sharing a query id are grouped into one Query, and
      # consecutive lines sharing a subject id (within a query) are grouped
      # into one Sbjct holding one Hsp per line.
      class Format8
        include Enumerable

        attr_accessor :queries

        # Create a new Format8 parser object.
        #
        # @param input [#each] any object yielding one line (String) per
        #   iteration, e.g. an open File or an Array of lines
        def initialize(input)
          @query = nil
          @sbjct = nil
          @queries = []

          @input = input
        end

        # Stream the input, yielding each Query as soon as it is complete.
        # Fields are split on tab characters. The final query in the input
        # is yielded after the last line has been read.
        #
        # @yieldparam query [Query] a fully populated query
        # @return [Enumerator] when called without a block
        def each
          return enum_for(:each) unless block_given?

          @input.each do |line|
            next if skippable?(line)

            finished = consume_fields(line.split(/\t/))
            yield finished if finished
          end

          # the last query has no following line to trigger its release
          last = finish_query
          yield last if last
        end

        # Read the whole input and collect every Query into #queries.
        # Fields are split on any whitespace.
        #
        # @return [Array<Query>, nil] the accumulated queries, or nil when
        #   the input held no data lines
        def parse
          @input.each do |line|
            next if skippable?(line)

            finished = consume_fields(line.split)
            @queries << finished if finished
          end

          # grab the last one, if present
          last = finish_query
          @queries << last if last
        end

        # Consume the subject id from the front of +input+ and attach the
        # remaining fields as an Hsp, starting a new Sbjct when the id
        # differs from the one currently being built.
        #
        # @param input [Array<String>] remaining fields of a line (mutated)
        def extract_sbjct(input)
          sbjct_id = input.shift

          if @sbjct && @sbjct.sbjct_id != sbjct_id
            # subject changed: file the finished one under the current query
            @query.sbjcts << @sbjct
            @sbjct = nil
          end

          if @sbjct.nil?
            @sbjct = Sbjct.new
            @sbjct.sbjct_id = sbjct_id
          end

          extract_hsp(input)
        end

        # Build an Hsp from the ten remaining fields of a line and append it
        # to the subject currently being built.
        #
        # @param input [Array<String>] remaining fields of a line (mutated)
        def extract_hsp(input)
          hsp = Hsp.new
          hsp.identity   = input.shift.to_f
          hsp.length     = input.shift.to_i
          hsp.mismatches = input.shift.to_i
          hsp.gap_count  = input.shift.to_i
          hsp.query_from = input.shift.to_i
          hsp.query_to   = input.shift.to_i
          hsp.sbjct_from = input.shift.to_i
          hsp.sbjct_to   = input.shift.to_i
          hsp.evalue     = input.shift.to_f
          hsp.bit_score  = input.shift.to_f
          @sbjct.hsps << hsp
        end

        private

        # True for comment lines (leading '#') and blank lines, neither of
        # which carries an alignment record.
        def skippable?(line)
          line.start_with?('#') || line.strip.empty?
        end

        # Feed one line's fields into the query/subject state machine.
        # Returns the previous Query when this line starts a new one,
        # otherwise nil.
        def consume_fields(fields)
          query_id = fields.shift
          finished = nil
          finished = finish_query if @query && @query.query_id != query_id

          if @query.nil?
            @query = Query.new
            @query.query_id = query_id
          end

          extract_sbjct(fields)
          finished
        end

        # Close the query under construction: attach its pending subject,
        # reset the parser state and return the query (nil if none).
        def finish_query
          return nil if @query.nil?

          @query.sbjcts << @sbjct unless @sbjct.nil?
          done = @query
          @query = nil
          @sbjct = nil
          done
        end
      end # end of MgNu::Parser::Blast::Format8 class
    end # end of MgNu::Parser::Blast class
  end # end of MgNu::Parser module
end # end of MgNu module
@@ -0,0 +1,75 @@
1
module MgNu
  module Parser
    class Blast
      # A single high-scoring segment pair. Plain value holder whose fields
      # are filled in by the BLAST format parsers.
      class Hsp
        attr_accessor :number, :bit_score, :score, :evalue
        attr_accessor :query_from, :query_to, :sbjct_from, :sbjct_to
        attr_accessor :query_frame, :sbjct_frame, :identity, :positive
        attr_accessor :length, :query_sequence, :sbjct_sequence, :midline
        attr_accessor :gap_count, :mismatches, :sbjct, :query

        # Create a new Hsp object with every field unset (nil) except the
        # three alignment strings, which start out empty.
        def initialize
          @number = @bit_score = @score = @evalue = nil
          @query_from = @query_to = @sbjct_from = @sbjct_to = nil
          @query_frame = @sbjct_frame = nil
          @identity = @positive = @length = nil
          @gap_count = @mismatches = nil
          @sbjct = @query = nil

          # each alignment string gets its own object
          @query_sequence = ''
          @sbjct_sequence = ''
          @midline = ''
        end

        # Locate frameshift markers ('/', '//', '\', '\\') in the query
        # sequence.
        #
        # Gap characters ('-') and spaces are removed before scanning. For a
        # marker found at string index i the reported location is
        # query_from + (i * 3 - 1), or query_from - (i * 3 - 1) when
        # query_from is greater than query_to.
        # NOTE(review): presumably this converts a residue index into a
        # nucleotide coordinate on the query -- confirm the intended offset.
        #
        # Relies on Regexp#global_match, which is not defined in this file;
        # it is used here as yielding one MatchData per match.
        #
        # @return [Hash, nil] location => frame (1, 2, -1, -2, or nil for a
        #   mixed two-character marker); nil when no marker is present
        def query_frameshifts
          return nil unless @query_sequence =~ %r{[/\\]}

          frame_for = { '/' => 1, '//' => 2, '\\' => -1, '\\\\' => -2 }
          ungapped = @query_sequence.gsub(/[- ]/, '')
          shifts = {}

          %r{[/\\]{1,2}}.global_match(ungapped) do |hit|
            offset = hit.begin(0) * 3 - 1
            position =
              if @query_from > @query_to
                @query_from - offset
              else
                offset + @query_from
              end
            shifts[position] = frame_for[hit[0]]
          end

          shifts
        end
      end # end of MgNu::Parser::Blast::Hsp class
    end # end of MgNu::Parser::Blast class
  end # end of MgNu::Parser module
end # end of MgNu module
@@ -0,0 +1,45 @@
1
module MgNu
  module Parser
    class Blast
      # One query from a BLAST report together with the subjects (hits) it
      # matched. Fields are filled in by the BLAST format parsers.
      class Query
        attr_accessor :number, :query_id, :definition, :length, :sbjcts
        attr_accessor :database, :database_sequence_count, :database_total_letters

        # create a new Query object
        def initialize
          @number = nil
          @query_id = ""
          @definition = ""
          @length = nil
          @sbjcts = []
          @best_hit = nil
          @database = nil
          @database_sequence_count = 0
          @database_total_letters = 0
        end

        # Returns the cached best hit. If not yet set, searches this query's
        # sbjcts for the one with the lowest evalue (the earliest one wins a
        # tie), caches it and returns it.
        #
        # Subjects whose evalue is nil (no HSPs / no evalue recorded) are
        # ignored while comparing so they cannot raise on `nil < ...`; when
        # no subject has an evalue at all, the first subject is returned.
        #
        # @return [MgNu::Parser::Blast::Sbjct, nil] the best hit for this
        #   query, or nil when the query has no hits
        def best_hit
          return @best_hit unless @best_hit.nil?

          scored = @sbjcts.reject { |s| s.evalue.nil? }
          @best_hit = scored.min_by(&:evalue) || @sbjcts.first
        end
      end # end of MgNu::Parser::Blast::Query class
    end # end of MgNu::Parser::Blast class
  end # end of MgNu::Parser module
end # end of MgNu module
@@ -0,0 +1,62 @@
1
module MgNu
  module Parser
    class Blast
      # One subject (database hit) for a query, holding the HSPs that make
      # up the hit. Fields are filled in by the BLAST format parsers.
      class Sbjct
        attr_accessor :hsps
        attr_accessor :number, :sbjct_id, :definition, :length
        attr_accessor :accession, :query

        # create a new Sbjct object
        def initialize
          @number = nil
          @sbjct_id = ""
          @definition = ""
          @length = nil
          @accession = ""
          @hsps = []
          @best_hsp = nil
          @query = nil
        end

        # @return [Float, nil] the evalue of the best hsp, or nil when this
        #   subject has no hsps
        def evalue
          hsp = best_hsp
          hsp && hsp.evalue
        end # end of Sbjct#evalue

        # @return [Float, nil] the bit_score of the best hsp, or nil when
        #   this subject has no hsps
        def bit_score
          hsp = best_hsp
          hsp && hsp.bit_score
        end # end of Sbjct#bit_score

        # @return [Float, nil] the identity of the best hsp, or nil when
        #   this subject has no hsps
        def identity
          hsp = best_hsp
          hsp && hsp.identity
        end # end of Sbjct#identity

        # Finds the hsp with the lowest evalue (the earliest one wins a
        # tie), caches it in @best_hsp and returns it. Once cached, the
        # result is not recomputed when more hsps are added.
        #
        # Hsps whose evalue is nil are skipped while comparing so they
        # cannot raise on `nil < ...`; when no hsp carries an evalue the
        # first hsp is used.
        #
        # @return [MgNu::Parser::Blast::Hsp, nil] nil when there are no hsps
        def best_hsp
          @best_hsp ||= @hsps.reject { |h| h.evalue.nil? }.min_by(&:evalue) || @hsps.first
        end # end of Sbjct#best_hsp
      end # end of MgNu::Parser::Blast::Sbjct class
    end # end of MgNu::Parser::Blast class
  end # end of MgNu::Parser module
end # end of MgNu module
@@ -0,0 +1,72 @@
1
module MgNu
  module Parser
    # ClustalW is the class used for parsing clustalw multiple alignment output.
    #
    # The raw text is split into blank-line separated sections: the first is
    # the header, every following section holds one line per sequence plus a
    # trailing match (conservation) line. Sequence chunks are concatenated
    # per name, in order of first appearance.
    class ClustalW
      attr_accessor :buffer, :raw
      attr_reader :file, :alignment

      # @param input [String] path of an alignment file (*.aln), or the
      #   alignment text itself when +file+ is false
      # @param file [Boolean] is +input+ a file name (default), or a string
      #   holding the alignment?
      # @raise [ArgumentError] when +file+ is true and +input+ is not an
      #   existing, readable file
      # @raise [RuntimeError] when the input contains no alignment sections
      def initialize(input = nil, file = true)
        if input
          if file
            # File.exists? is gone from current Rubies; File.exist? works everywhere
            unless File.exist?(input) && File.readable?(input)
              raise ArgumentError,
                    "MgNu::Parser::ClustalW.new(): cannot read alignment file #{input}"
            end
            @raw = File.read(input)
          else # file is false, so this must be a string with input
            @raw = input
          end

          @buffer = @raw.split(/\r?\n\r?\n/)
          @alignment = nil
          parse

          # Raise instead of puts + exit(1): that branch could never run in
          # the original, because parsing an empty buffer raised first.
          raise "ClustalW alignment file #{input} did not parse!" if @alignment.nil?
        else
          error("MgNu::Parser::ClustalW.new(): need an existing file")
        end
      end

      # Process the input multiple alignment held in #buffer. Safe to call
      # more than once: an existing alignment is returned untouched.
      #
      # @return [MgNu::Alignment, nil] the alignment, or nil when the buffer
      #   holds no alignment sections
      def parse
        return @alignment unless @alignment.nil?

        @buffer.shift # the first section is the program header
        return nil if @buffer.empty?

        @buffer[0].sub!(/\A(?:\r?\n)+/, '') # drop newline at start of section
        @buffer.map! { |section| section.split(/\r?\n/) }

        @buffer.each do |section|
          # drop numbers if the alignment was run with "-SEQNOS=on"
          section.each { |line| line.sub!(/\s+\d+\s*$/, '') }
          section.pop # last line of a section is the match line, not a sequence
        end
        @buffer.reject!(&:empty?)
        return nil if @buffer.empty?

        # get the 1st position of a space from the right using
        # rindex. Increment this by 1 to get the seq_start
        seq_start = (@buffer[0][0].rindex(/\s/) || -1) + 1

        # sequences: seqname => concatenated sequence
        # order:     seqnames in order of first appearance
        sequences = {}
        order = []
        @buffer.each do |section|
          section.each do |line|
            name = line[0, seq_start].sub(/\s+\z/, '')
            chunk = line[seq_start..-1].to_s # nil when the line is too short
            if sequences.key?(name)
              sequences[name] += chunk
            else
              order << name
              sequences[name] = chunk
            end
          end
        end

        @alignment = MgNu::Alignment.new(sequences, order)
      end # end of #parse method
    end # end of MgNu::Parser::ClustalW class
  end # end of MgNu::Parser module
end # end of MgNu module
@@ -0,0 +1,61 @@
1
module MgNu
  module Parser
    # Streaming FASTA reader. Iterating yields one MgNu::Sequence::Fasta per
    # record. For quality files the data lines of a record are joined with a
    # single space, otherwise they are concatenated directly.
    class Fasta
      include Enumerable

      attr_reader :file, :filename

      # create a new Fasta parser
      #
      # @param filename [String] path of an existing, readable FASTA file
      # @param quality_file [Boolean] true when the file holds
      #   space-separated quality values instead of sequence
      # @raise [RuntimeError] when +filename+ does not exist or is unreadable
      def initialize(filename = nil, quality_file = false)
        @quality_file = quality_file
        @filename = filename
        if filename
          # File.exists? is gone from current Rubies; File.exist? works everywhere
          unless File.exist?(filename) && File.readable?(filename)
            raise "\n\n -- No file by that name (#{filename}). Exiting\n\n"
          end
          @file = File.open(filename)
        else
          error("MgNu::Parser::Fasta.new(): need a filename or an existing file")
        end
      end

      # override enumerables
      #
      # Reads the file line by line; a line starting with '>' begins a new
      # record and releases the previous one. The file handle is closed once
      # the last line has been read.
      #
      # @yieldparam record [MgNu::Sequence::Fasta]
      # @return [Enumerator] when called without a block
      def each
        return enum_for(:each) unless block_given?

        @buffer = [] # temp storage: header first, then the data lines
        @file.each_line do |line|
          line.chomp!
          # anchored so that a '>' inside a data line is not taken as a header
          if line =~ /\A>(.*)/ # got a header line
            yield build_record unless @buffer.empty?
            @buffer = [Regexp.last_match(1)]
          else # got a sequence line
            @buffer << line
          end
        end # end of file io
        @file.close

        # don't forget to yield the last one
        yield build_record unless @buffer.empty?
      end # end of #each

      private

      # Turn the buffered header + data lines into a sequence object.
      def build_record
        header = @buffer.shift
        separator = @quality_file ? " " : ""
        MgNu::Sequence::Fasta.new(:header => header,
                                  :sequence => @buffer.join(separator))
      end
    end # end of MgNu::Parser::Fasta class
  end # end of MgNu::Parser module
end # end of MgNu module
@@ -0,0 +1,39 @@
1
+ require 'moneta'
2
module MgNu
  module Parser
    # Key/value index mapping FASTA header names to header descriptions,
    # kept in a Moneta store opened with the :TokyoCabinet adapter.
    class FastaHeaderIndex
      attr_reader :filename, :db_name, :db

      # Open the index belonging to +filename+. The store file is +filename+
      # itself when it already ends in ".hdr.tch", otherwise +filename+ with
      # ".hdr.tch" appended. When the store file does not exist yet, it is
      # created and filled by parsing +filename+ as FASTA.
      #
      # @param filename [String] FASTA file name or index file name
      def initialize(filename)
        @filename = filename
        @db_name = filename =~ /^.+\.hdr\.tch$/ ? @filename : @filename + ".hdr.tch"

        # must be decided before the store is opened
        needs_build = !File.exist?(@db_name)
        @db = Moneta.new(:TokyoCabinet, file: @db_name)
        parse if needs_build
      end

      # Fill the store: one entry per FASTA record, keyed by header_name
      # with header_description as the value.
      def parse
        records = MgNu::Parser::Fasta.new(@filename)
        records.each do |record|
          @db[record.header_name] = record.header_description
        end
      end # end of #parse

      # @param name [String] header name to look up
      # @return [Object, nil] the stored description, or nil when the store
      #   returns nothing (or false) for +name+
      def [](name)
        found = @db[name]
        found || nil
      end

      # Close the underlying store, if there is one.
      def close
        return if @db.nil?

        @db.close
      end
    end # end of MgNu::Parser::FastaHeaderIndex class
  end # end of MgNu::Parser module
end # end of MgNu module