mgnu 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +0 -0
  3. data/README.md +31 -0
  4. data/Rakefile +33 -0
  5. data/lib/mgnu.rb +9 -0
  6. data/lib/mgnu/alignment.rb +143 -0
  7. data/lib/mgnu/common.rb +68 -0
  8. data/lib/mgnu/genbank.rb +117 -0
  9. data/lib/mgnu/genbank/feature.rb +84 -0
  10. data/lib/mgnu/genbank/location.rb +150 -0
  11. data/lib/mgnu/genbank/qualifier.rb +45 -0
  12. data/lib/mgnu/genbank/reference.rb +114 -0
  13. data/lib/mgnu/genbank/source.rb +39 -0
  14. data/lib/mgnu/loggable.rb +61 -0
  15. data/lib/mgnu/parser.rb +50 -0
  16. data/lib/mgnu/parser/blast.rb +87 -0
  17. data/lib/mgnu/parser/blast/format0.rb +290 -0
  18. data/lib/mgnu/parser/blast/format7.rb +121 -0
  19. data/lib/mgnu/parser/blast/format8.rb +120 -0
  20. data/lib/mgnu/parser/blast/hsp.rb +75 -0
  21. data/lib/mgnu/parser/blast/query.rb +45 -0
  22. data/lib/mgnu/parser/blast/sbjct.rb +62 -0
  23. data/lib/mgnu/parser/clustalw.rb +72 -0
  24. data/lib/mgnu/parser/fasta.rb +61 -0
  25. data/lib/mgnu/parser/fasta_header_index.rb +39 -0
  26. data/lib/mgnu/parser/fasta_index.rb +57 -0
  27. data/lib/mgnu/parser/fastq.rb +61 -0
  28. data/lib/mgnu/parser/genbank.rb +187 -0
  29. data/lib/mgnu/parser/gff.rb +56 -0
  30. data/lib/mgnu/parser/iprscan/hit.rb +76 -0
  31. data/lib/mgnu/parser/iprscan_file.rb +39 -0
  32. data/lib/mgnu/parser/kegg_ontology_index.rb +163 -0
  33. data/lib/mgnu/parser/pilercr.rb +102 -0
  34. data/lib/mgnu/parser/prodigal.rb +170 -0
  35. data/lib/mgnu/parser/sam.rb +115 -0
  36. data/lib/mgnu/parser/sam/alignment.rb +22 -0
  37. data/lib/mgnu/parser/sam/header.rb +23 -0
  38. data/lib/mgnu/parser/sam/pair.rb +18 -0
  39. data/lib/mgnu/sequence.rb +207 -0
  40. data/lib/mgnu/sequence/fasta.rb +79 -0
  41. data/lib/mgnu/sequence/fastq.rb +43 -0
  42. data/lib/mgnu/version.rb +16 -0
  43. data/mgnu.gemspec +39 -0
  44. data/spec/mgnu/parser/blast_format0_spec.rb +114 -0
  45. data/spec/mgnu/parser/blast_format7_spec.rb +24 -0
  46. data/spec/mgnu/parser/blast_format8_spec.rb +26 -0
  47. data/spec/mgnu/parser/blast_multihsp_spec.rb +100 -0
  48. data/spec/mgnu/parser/blast_oof_spec.rb +53 -0
  49. data/spec/mgnu/parser/clustalw_spec.rb +90 -0
  50. data/spec/mgnu/parser/fasta_header_index_tc_parser_spec.rb +25 -0
  51. data/spec/mgnu/parser/fasta_index_tc_parser_spec.rb +25 -0
  52. data/spec/mgnu/parser/fasta_parser_spec.rb +53 -0
  53. data/spec/mgnu/parser_spec.rb +22 -0
  54. data/spec/mgnu/sequence/fasta_spec.rb +60 -0
  55. data/spec/mgnu/sequence/fastq_spec.rb +31 -0
  56. data/spec/mgnu/sequence_spec.rb +81 -0
  57. data/spec/mgnu_spec.rb +7 -0
  58. data/spec/spec_helper.rb +53 -0
  59. metadata +376 -0
@@ -0,0 +1,150 @@
1
+ require 'mgnu/sequence'
2
+
3
module MgNu
  class Genbank
    # Location of a GenBank feature, parsed from the text of a location line
    # such as "12..34", "complement(12..34)", "join(1..3,7..9)" or "5^6".
    #
    # After construction the following attributes are populated where the
    # location type supports them:
    #
    # * +type+       - 'standard', 'join', 'complement_with_join',
    #                  'between_range', 'between_adjoining' or 'single'
    # * +start+/+stop+ - integer coordinates; for complement locations the
    #                  two are swapped (start holds the larger coordinate)
    # * +complement+ - true when the location is on the complement strand
    # * +parts+      - the comma separated members of a join/order location
    class Location
      InvalidLocation = Class.new(StandardError)
      LocationWithRemoteAccession = Class.new(StandardError)

      # Matches a single "start..stop" range with optional complement(),
      # remote accession prefix and "<" / ">" partial markers.
      BASERANGE_REGEX = /
        (?<complement>complement)?\(?
        (?<remote_accession>[A-Z\d\.]+:)?
        (?<start_continues><)?
        (?<start>\d+)\.\.
        (?<stop_continues>>)?
        (?<stop>\d+)
        \)?/x

      attr_accessor :raw_value, :start, :stop, :start_continues, :stop_continues
      attr_accessor :complement, :type, :parts

      # Create a new Location object.
      #
      # @param raw_value [String] location text; all whitespace is removed
      #   before parsing
      # @raise [InvalidLocation] when the text matches no known location form
      def initialize(raw_value)
        @raw_value = raw_value.gsub(/\s/, '')
        parse_raw_value
      end

      # Classify +raw_value+ and fill in type, coordinates and strand.
      # The branches are tried in order, so join forms win over plain ranges.
      #
      # @raise [InvalidLocation] when no branch matches
      def parse_raw_value
        case raw_value
        when /^complement\(join\((.+)\)/
          @type = 'complement_with_join'
          @complement = true
          @parts = Regexp.last_match(1).split(/,/)
          set_properties_for_join_types
        when /^(?:join|order)\((.+)\)/
          @type = 'join'
          @parts = Regexp.last_match(1).split(/,/)
          set_properties_for_join_types
        when BASERANGE_REGEX
          @type = 'standard'
          set_basic_properties(raw_value)
        when /^(\d+)\.(\d+)$/
          @type = 'between_range'
          @start = Regexp.last_match(1).to_i
          @stop = Regexp.last_match(2).to_i
        when /^(\d+)\^(\d+)$/
          # The caret must be escaped: unescaped it is a line anchor and a
          # site-between-bases location such as "123^124" could never match.
          @type = 'between_adjoining'
          @start = Regexp.last_match(1).to_i
          @stop = Regexp.last_match(2).to_i
        when /^(complement)?\(?(\d+)\)?$/
          @type = 'single'
          @complement = !!Regexp.last_match(1)
          @start = Regexp.last_match(2).to_i
        else
          raise InvalidLocation, 'This is not a valid Genbank location'
        end
      end

      # Derive start/stop for join-like locations from the parts that do not
      # carry a remote accession prefix.
      def set_properties_for_join_types
        non_remote = parts.reject { |part| part =~ /[A-Z\d\.]+:/ }
        return set_basic_properties(non_remote.first) if non_remote.length == 1

        @complement ||= !!(non_remote.first =~ /complement/)
        # Every part is remote: there are no local coordinates to record.
        return if non_remote.empty?

        # Start and stop come from the first and last non-remote part; on the
        # complement strand their roles are swapped.
        if complement
          stop_match = /(?<stop_continues><)?(?<stop>\d+)/.match(non_remote.first)
          start_match = /\.\.(?<start_continues>>)?(?<start>\d+)/.match(non_remote.last)
        else
          start_match = /(?<start_continues><)?(?<start>\d+)/.match(non_remote.first)
          stop_match = /\.\.(?<stop_continues>>)?(?<stop>\d+)/.match(non_remote.last)
        end
        @start = start_match[:start].to_i
        @stop = stop_match[:stop].to_i
        @start_continues = start_match[:start_continues]
        @stop_continues = stop_match[:stop_continues]
      end

      # Fill in strand, coordinates and partial flags from one base range.
      #
      # @param str [String] text matching BASERANGE_REGEX
      def set_basic_properties(str)
        md = BASERANGE_REGEX.match(str)
        @complement ||= !!md[:complement]
        @remote_accession = md[:remote_accession]
        # On the complement strand the textual start is the logical stop.
        first_key, last_key = complement ? [:stop, :start] : [:start, :stop]
        @start = md[first_key].to_i
        @stop = md[last_key].to_i
        @start_continues = !!md[:"#{first_key}_continues"]
        @stop_continues = !!md[:"#{last_key}_continues"]
      end

      # Extract the sequence covered by this location.
      #
      # @param seq [String] the full (1-based in GenBank terms) sequence
      # @return [MgNu::Sequence, nil] the extracted sequence, reverse
      #   complemented where the location calls for it; nil for the
      #   'between_*' types and for joins that contain a remote part
      def get_sequence(seq)
        s = case type
            when 'complement_with_join', 'join'
              str = buidup_sequence_from_parts(seq)
              str ? build_sequence(str) : nil
            when 'standard'
              # start/stop are swapped for complement locations, so slice
              # from the smaller coordinate either way.
              if complement
                build_sequence(seq[stop - 1..start - 1])
              else
                build_sequence(seq[start - 1..stop - 1])
              end
            when 'single'
              build_sequence(seq[start - 1])
            else
              return nil
            end

        simple_complement = %w(single standard).include?(type) && complement
        s.reverse_complement! if s && (simple_complement || type == 'complement_with_join')
        s
      end

      # Concatenate the sub-sequences named by +parts+.
      # Parts flagged complement(...) are reverse complemented individually.
      #
      # @param seq [String] the full sequence
      # @return [String, nil] joined sequence, or nil if any part refers to
      #   a remote accession
      def buidup_sequence_from_parts(seq)
        to_be_joined = ''
        parts.each do |part|
          md = BASERANGE_REGEX.match(part)
          return nil if md[:remote_accession]

          piece = seq[md[:start].to_i - 1..md[:stop].to_i - 1]
          # Non-bang tr/reverse: tr! returns nil when nothing was replaced
          # (e.g. upper-case input), which used to raise NoMethodError.
          piece = piece.tr('actgACTG', 'tgacTGAC').reverse if md[:complement]
          to_be_joined += piece
        end
        to_be_joined
      end

      # Wrap a raw string in a DNA MgNu::Sequence.
      def build_sequence(str)
        MgNu::Sequence.new(:value => str, :type => 'dna')
      end

      # String representation, wrapped at commas so no line exceeds the
      # width left after the 21 column feature-table indent.
      def to_s
        max = 79 - 21 # max length of location line
        return '' + raw_value if raw_value.length <= max

        chunks = raw_value.scan(/(.{1,#{max}})(,|$)/)
        chunks.map { |text, separator| text + separator }.join("\n" + ' ' * 21)
      end
    end # end of MgNu::Genbank::Location
  end # end of MgNu::Genbank class
end # end of MgNu module
@@ -0,0 +1,45 @@
1
module MgNu
  class Genbank
    # A single "/name=value" qualifier attached to a GenBank feature.
    class Qualifier
      include MgNu::Parser
      attr_accessor :name, :value, :quoted

      # Build a qualifier from an options hash.
      #
      # @param opts [Hash] may contain :name (quotes stripped, downcased),
      #   :value and :quoted (defaults to false)
      def initialize(opts = {})
        @name = opts.key?(:name) ? strip_quotes(opts[:name]).downcase : nil
        @value = opts.fetch(:value, nil)
        @quoted = opts.fetch(:quoted, false)
      end

      # Render the qualifier as it appears in a feature table: a newline,
      # a 21 column indent, "/name" and, when a value is present, "=value"
      # (quoted if requested). Values too long for the first line are broken
      # at the last non-word character that fits; the remainder is wrapped
      # with String#print_multiline.
      def to_s
        indent = ' ' * 21
        text = "\n" + indent
        text << "/#{name}"
        return text unless value

        text << '='
        text << '"' if quoted
        # room left on the first line: total width minus indent, minus the
        # name and its surrounding characters, minus an opening quote
        room = 79 - 21 - (name.length + 2)
        room -= 1 if quoted
        if value.length > room
          break_at = room.downto(0).find { |pos| value[pos] =~ /[^\w-]/ } || room
          text << value[0..break_at - 1]
          text << "\n"
          text << (indent + value[break_at..-1].print_multiline(79, :indent => 21).strip)
        else
          text << value
        end
        text << '"' if quoted
        text
      end
    end # end Qualifier class
  end # end Genbank class
end # end MgNu module
@@ -0,0 +1,114 @@
1
module MgNu
  class Genbank
    # One REFERENCE block of a GenBank record, for example:
    #
    #   REFERENCE 1 (bases 1 to 9334)
    #   AUTHORS Morowitz,M.J., Denef,V.J., Costello,E.K., Thomas,B.C., Relman,D.A.
    #   and Banfield,J.F.
    #   TITLE Direct Submission
    #   JOURNAL Submitted (08-APR-2011) Earth and Planetary Sciences, University of
    #   California - Berkeley, 369 McCone Hall, Berkeley, CA 94720, USA
    #   REMARK Strain-resolved community genomic analysis of gut microbial
    #   colonization in a premature infant
    class Reference
      attr_accessor :title, :number, :base_range, :authors, :journal
      attr_accessor :consrtm, :pubmed, :remark

      # Create an empty Reference; +authors+ starts as an empty array and
      # every other attribute as nil.
      def initialize
        @title = nil
        @base_range = nil
        @number = nil
        @authors = []
        @consrtm = nil
        @journal = nil
        @pubmed = nil
        @remark = nil
      end

      # Parse the raw text of a REFERENCE block.
      #
      # @param raw_string [String] the block, lines separated by "\n"
      # @return [Reference] populated reference; sub-header keywords that
      #   have no matching attribute writer are ignored
      def self.parse(raw_string)
        ref = new
        # Chomp every line up front so a trailing "\r" never leaks into
        # values assembled from continuation lines.
        lines = raw_string.split("\n").map(&:chomp)
        index = 0
        while index < lines.length
          line = lines[index]
          index += 1
          if line =~ /^REFERENCE\s+(\d+)/
            ref.number = Regexp.last_match(1).to_i
            if line =~ /\(bases (\d+) to (\d+)\)/
              ref.base_range = Range.new(Regexp.last_match(1).to_i, Regexp.last_match(2).to_i)
            end
          elsif line =~ /AUTHORS\s+(.+)/
            text = Regexp.last_match(1)
            continuation, index = take_continuation(lines, index)
            ref.authors = split_authors(text + continuation)
          elsif line =~ /^\s*([A-Z]+)\s+(.+)/
            setter = :"#{Regexp.last_match(1).downcase}="
            text = Regexp.last_match(2)
            # Check for the writer itself: a keyword such as HASH answers to
            # a reader-style method but has no "hash=" to send.
            next unless ref.respond_to?(setter)

            continuation, index = take_continuation(lines, index)
            ref.send(setter, (text + continuation).strip.squeeze(' '))
          end
        end
        ref
      end

      # Collect the lines following a sub-header up to (not including) the
      # next line that starts with an all-caps keyword.
      #
      # @return [Array(String, Integer)] concatenated continuation text and
      #   the index of the first line that was not consumed
      def self.take_continuation(lines, index)
        text = ''
        while index < lines.length && lines[index] !~ /^\s*[A-Z]+\s/
          text += lines[index]
          index += 1
        end
        [text, index]
      end
      private_class_method :take_continuation

      # Split "A, B, C and D" into individual author names. The final "and"
      # must be surrounded by whitespace so that surnames containing the
      # letters "and" (Sandberg, Ferdinand, ...) stay intact.
      def self.split_authors(text)
        names = text.split(/,\s+/)
        last_entry = names.pop
        last_entry ? names + last_entry.split(/\s+and\s+/) : names
      end
      private_class_method :split_authors

      # String representation: the REFERENCE header line followed by one
      # entry per populated field, each on its own line and in GenBank order
      # (AUTHORS, CONSRTM, TITLE, JOURNAL, PUBMED, REMARK).
      def to_s
        header = "#{'REFERENCE'.ljust(12)}#{number}"
        if base_range
          # NOTE(review): both branches yield the same separator as written;
          # confirm whether single-digit numbers were meant to get wider padding.
          header += number.to_s.length == 1 ? ' ' : ' '
          header += "(bases #{base_range.first} to #{base_range.last})"
        end

        # Joining the populated sections guarantees exactly one newline
        # between neighbours regardless of which fields are missing.
        sections = []
        sections << " #{'AUTHORS'.ljust(10)}#{authors_to_s}" if authors.any?
        sections << " #{'CONSRTM'.ljust(10)}#{consrtm.print_multiline}" if consrtm
        sections << " #{'TITLE'.ljust(10)}#{title.print_multiline}" if title
        sections << " #{'JOURNAL'.ljust(10)}#{journal.print_multiline}" if journal
        sections << " #{'PUBMED'.ljust(9)}#{pubmed}" if pubmed
        sections << " #{'REMARK'.ljust(10)}#{remark.print_multiline}" if remark
        header + "\n" + sections.join("\n")
      end

      private

      # Author list as printed on the AUTHORS line; only lists of three or
      # more names are passed through String#print_multiline.
      def authors_to_s
        case authors.length
        when 1
          authors[0]
        when 2
          "#{authors[0]} and #{authors[1]}"
        else
          "#{authors[0...-1].join(', ')} and #{authors[-1]}".print_multiline
        end
      end
    end # end MgNu::Genbank::Reference class
  end # end of MgNu::Genbank class
end # end of MgNu module
@@ -0,0 +1,39 @@
1
module MgNu
  class Genbank
    # The SOURCE / ORGANISM section of a GenBank record: the common name
    # from the SOURCE line, the organism name and its taxonomic lineage.
    class Source
      attr_accessor :organism, :common_name, :lineage

      # @param common_name [String, nil] text of the SOURCE line
      # @param organism [String] text of the ORGANISM line
      # @param lineage [String] lineage joined onto a single line
      def initialize(common_name = nil, organism = '', lineage = '')
        @common_name = common_name
        @organism = organism
        @lineage = lineage
      end

      # Parse a buffer of Source data.
      #
      # @param buffer [#each] lines of the SOURCE section
      # @return [Source] parsed source; the lineage is joined with single
      #   spaces and its trailing period is removed
      def self.parse(buffer)
        s = new
        buffer.each do |line|
          if line =~ /^SOURCE\s+(.+)$/
            s.common_name = Regexp.last_match(1).strip.squeeze(' ')
          elsif line =~ /ORGANISM\s+(.+)/
            s.organism += Regexp.last_match(1).strip.squeeze(' ')
          elsif lineage_line?(line, s.lineage)
            temp = line.strip.squeeze(' ')
            s.lineage += s.lineage.empty? ? temp : " #{temp}"
          end
        end
        s.lineage = s.lineage.chop if s.lineage =~ /(.+)\.$/ # remove period at the end
        s
      end

      # A line belongs to the lineage when it contains a "taxon;" entry, or
      # when the lineage has already started and the line is not blank. The
      # second rule keeps a wrapped final line such as "Escherichia.", which
      # holds no semicolon of its own.
      def self.lineage_line?(line, lineage_so_far)
        return true if line =~ /[\w]+;(?:\s|$)/

        !lineage_so_far.empty? && !line.strip.empty?
      end
      private_class_method :lineage_line?

      # String representation: SOURCE line, ORGANISM line, then the lineage
      # (wrapped by String#print_multiline) terminated with a period.
      def to_s
        out = ''
        out << "#{'SOURCE'.ljust(12)}#{common_name}\n"
        out << " #{'ORGANISM'.ljust(10)}#{organism.print_multiline}\n"
        out << ''.ljust(12) # first lineage line
        out << lineage.print_multiline unless lineage.empty?
        out << '.'
      end
    end # end MgNu::Genbank::Source class
  end # end of MgNu::Genbank class
end # end of MgNu module
@@ -0,0 +1,61 @@
1
+ require 'logger'
2
+
3
module MgNu
  # Logger subclass that also manages one shared logger instance, created
  # on first use (writing to STDOUT at DEBUG level) or injected via .log=.
  class Logger < ::Logger
    @@log = nil

    # Severity names accepted by #level, mapped to the stdlib constants.
    LEVELS_BY_NAME = {
      'WARN' => WARN,
      'INFO' => INFO,
      'DEBUG' => DEBUG,
      'ERROR' => ERROR,
      'FATAL' => FATAL,
      'UNKNOWN' => UNKNOWN
    }.freeze

    # @return [MgNu::Logger] the shared logger, created on first call
    def self.get
      unless @@log
        @@log = new(STDOUT)
        @@log.level = DEBUG
      end
      @@log
    end # end Logger.get

    # Replace the shared logger.
    def self.log=(log)
      @@log = log
    end # end Logger set

    # With no argument, behaves as the standard ::Logger#level reader. The
    # reader must remain callable without arguments because ::Logger itself
    # invokes it when deciding whether to emit a message.
    #
    # With a severity name ("WARN", "INFO", "DEBUG", "ERROR", "FATAL" or
    # "UNKNOWN") sets the level of the shared logger; note that this is the
    # logger returned by Logger.get, which is not necessarily the receiver.
    #
    # @param new_level [String, nil] severity name, or nil to read
    # @return [Integer, nil] current level when reading; the level that was
    #   set when writing, or nil if the name is not recognised
    def level(new_level = nil)
      return super() if new_level.nil?

      severity = LEVELS_BY_NAME[new_level]
      return nil unless severity

      # .get is a class method; it also creates the shared logger if needed.
      self.class.get.level = severity
    end
  end # end MgNu::Logger class

  # mixin to add logging to any class; every method forwards its message to
  # the shared MgNu::Logger at the severity named by the method.
  # Note that #warn takes the place of Kernel#warn in including classes.
  module Loggable
    def debug(msg)
      MgNu::Logger.get.debug(msg)
    end

    def info(msg)
      MgNu::Logger.get.info(msg)
    end

    def warn(msg)
      MgNu::Logger.get.warn(msg)
    end

    def error(msg)
      MgNu::Logger.get.error(msg)
    end

    def fatal(msg)
      MgNu::Logger.get.fatal(msg)
    end

    def unknown(msg)
      MgNu::Logger.get.unknown(msg)
    end
  end # end MgNu::Loggable module
end # end MgNu module
@@ -0,0 +1,50 @@
1
+ module MgNu
2
+ module Parser
3
+ require_relative 'parser/blast'
4
+ require_relative 'parser/clustalw'
5
+ require_relative 'parser/fasta'
6
+ require_relative 'parser/fasta_index'
7
+ require_relative 'parser/fasta_header_index'
8
+ require_relative 'parser/fastq'
9
+ require_relative 'parser/gff'
10
+ require_relative 'parser/genbank'
11
+ #require_relative 'parser/iprscan_file'
12
+ #require_relative 'parser/kegg_ontology_index'
13
+ #require_relative 'parser/sam'
14
+ #require_relative 'parser/pilercr'
15
+ require_relative 'parser/prodigal'
16
+
17
# Strip one leading and one trailing quote character (single or double)
# from a string. The input is handed back untouched when it is not wrapped
# in quotes or when nothing lies between them.
#
# @param [String] input string to strip
# @return [String] input string with quotes removed
def strip_quotes(input)
  return input unless input =~ /^["'](.+)["']$/

  Regexp.last_match(1)
end
25
+
26
# Reads lines from a file until one matching the given regexp is found.
#
# A matching line only ends the read once at least one line has been
# collected, so the first line read always lands in the buffer even when
# it matches (typically the header line of the current record).
#
# @param [IO] file seekable stream positioned at the first line to read
# @param [Regexp] regexp pattern marking the start of the next record
# @param [Bool] discard when true (the default) the matching line is pushed
#   back onto the file by seeking backwards, so the next read starts with
#   it; when false the matching line is consumed and dropped. The flag
#   reads inverted relative to its name; it is kept as is for callers.
# @return [Array] chomped lines from file up to but NOT including the
#   regexp matchline
def parse_until(file, regexp, discard = true)
  buffer = []
  file.each do |line|
    if line =~ regexp && !buffer.empty?
      # found exit condition; IO#seek counts bytes, so rewind by the byte
      # size of the line rather than its character count
      file.seek(-line.bytesize, IO::SEEK_CUR) if discard
      return buffer
    end
    buffer << line.chomp
  end # end of file.each do |line|
  buffer
end # end of parse_until
48
+
49
+ end # end of module Parser
50
+ end # end of module MgNu