mgnu 2.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +0 -0
  3. data/README.md +31 -0
  4. data/Rakefile +33 -0
  5. data/lib/mgnu.rb +9 -0
  6. data/lib/mgnu/alignment.rb +143 -0
  7. data/lib/mgnu/common.rb +68 -0
  8. data/lib/mgnu/genbank.rb +117 -0
  9. data/lib/mgnu/genbank/feature.rb +84 -0
  10. data/lib/mgnu/genbank/location.rb +150 -0
  11. data/lib/mgnu/genbank/qualifier.rb +45 -0
  12. data/lib/mgnu/genbank/reference.rb +114 -0
  13. data/lib/mgnu/genbank/source.rb +39 -0
  14. data/lib/mgnu/loggable.rb +61 -0
  15. data/lib/mgnu/parser.rb +50 -0
  16. data/lib/mgnu/parser/blast.rb +87 -0
  17. data/lib/mgnu/parser/blast/format0.rb +290 -0
  18. data/lib/mgnu/parser/blast/format7.rb +121 -0
  19. data/lib/mgnu/parser/blast/format8.rb +120 -0
  20. data/lib/mgnu/parser/blast/hsp.rb +75 -0
  21. data/lib/mgnu/parser/blast/query.rb +45 -0
  22. data/lib/mgnu/parser/blast/sbjct.rb +62 -0
  23. data/lib/mgnu/parser/clustalw.rb +72 -0
  24. data/lib/mgnu/parser/fasta.rb +61 -0
  25. data/lib/mgnu/parser/fasta_header_index.rb +39 -0
  26. data/lib/mgnu/parser/fasta_index.rb +57 -0
  27. data/lib/mgnu/parser/fastq.rb +61 -0
  28. data/lib/mgnu/parser/genbank.rb +187 -0
  29. data/lib/mgnu/parser/gff.rb +56 -0
  30. data/lib/mgnu/parser/iprscan/hit.rb +76 -0
  31. data/lib/mgnu/parser/iprscan_file.rb +39 -0
  32. data/lib/mgnu/parser/kegg_ontology_index.rb +163 -0
  33. data/lib/mgnu/parser/pilercr.rb +102 -0
  34. data/lib/mgnu/parser/prodigal.rb +170 -0
  35. data/lib/mgnu/parser/sam.rb +115 -0
  36. data/lib/mgnu/parser/sam/alignment.rb +22 -0
  37. data/lib/mgnu/parser/sam/header.rb +23 -0
  38. data/lib/mgnu/parser/sam/pair.rb +18 -0
  39. data/lib/mgnu/sequence.rb +207 -0
  40. data/lib/mgnu/sequence/fasta.rb +79 -0
  41. data/lib/mgnu/sequence/fastq.rb +43 -0
  42. data/lib/mgnu/version.rb +16 -0
  43. data/mgnu.gemspec +39 -0
  44. data/spec/mgnu/parser/blast_format0_spec.rb +114 -0
  45. data/spec/mgnu/parser/blast_format7_spec.rb +24 -0
  46. data/spec/mgnu/parser/blast_format8_spec.rb +26 -0
  47. data/spec/mgnu/parser/blast_multihsp_spec.rb +100 -0
  48. data/spec/mgnu/parser/blast_oof_spec.rb +53 -0
  49. data/spec/mgnu/parser/clustalw_spec.rb +90 -0
  50. data/spec/mgnu/parser/fasta_header_index_tc_parser_spec.rb +25 -0
  51. data/spec/mgnu/parser/fasta_index_tc_parser_spec.rb +25 -0
  52. data/spec/mgnu/parser/fasta_parser_spec.rb +53 -0
  53. data/spec/mgnu/parser_spec.rb +22 -0
  54. data/spec/mgnu/sequence/fasta_spec.rb +60 -0
  55. data/spec/mgnu/sequence/fastq_spec.rb +31 -0
  56. data/spec/mgnu/sequence_spec.rb +81 -0
  57. data/spec/mgnu_spec.rb +7 -0
  58. data/spec/spec_helper.rb +53 -0
  59. metadata +376 -0
@@ -0,0 +1,150 @@
1
+ require 'mgnu/sequence'
2
+
3
module MgNu
  class Genbank
    # A single GenBank feature location, built from the raw location string
    # of a feature line (e.g. "complement(join(1..5,10..20))").
    #
    # After construction +type+ is one of 'complement_with_join', 'join',
    # 'standard', 'between_range', 'between_adjoining' or 'single'.
    # For complement locations +start+ and +stop+ are swapped, so +start+
    # is the larger coordinate.
    class Location
      InvalidLocation = Class.new(StandardError)
      LocationWithRemoteAccession = Class.new(StandardError)

      # Matches a base range such as "12..40", "<12..>40",
      # "complement(12..40)" or "J00194.1:12..40".
      BASERANGE_REGEX = /
        (?<complement>complement)?\(?
        (?<remote_accession>[A-Z\d\.]+:)?
        (?<start_continues><)?
        (?<start>\d+)\.\.
        (?<stop_continues>>)?
        (?<stop>\d+)
        \)?/x

      attr_accessor :raw_value, :start, :stop, :start_continues, :stop_continues
      attr_accessor :complement, :type, :parts

      # Create a new Location object.
      #
      # @param raw_value [String] location text; all whitespace is removed
      #   before parsing
      # @raise [InvalidLocation] if the text matches no known location form
      def initialize(raw_value)
        @raw_value = raw_value.gsub(/\s/, '')
        parse_raw_value
      end

      # Classify +raw_value+ and populate type, start, stop, complement
      # and parts. The order of the branches matters: the join forms must be
      # tested before the plain base range, which is unanchored.
      #
      # @raise [InvalidLocation] if no form matches
      def parse_raw_value
        case raw_value
        when /^complement\(join\((.+)\)/
          @type = 'complement_with_join'
          @complement = true
          @parts = Regexp.last_match(1).split(/,/)
          set_properties_for_join_types
        when /^(?:join|order)\((.+)\)/
          @type = 'join'
          @parts = Regexp.last_match(1).split(/,/)
          set_properties_for_join_types
        when BASERANGE_REGEX
          @type = 'standard'
          set_basic_properties(raw_value)
        when /^(\d+)\.(\d+)$/
          @type = 'between_range'
          @start = Regexp.last_match(1).to_i
          @stop = Regexp.last_match(2).to_i
        when /^(\d+)\^(\d+)$/
          # The caret must be escaped: unescaped it is a line anchor and a
          # location such as "102^103" could never match.
          @type = 'between_adjoining'
          @start = Regexp.last_match(1).to_i
          @stop = Regexp.last_match(2).to_i
        when /^(complement)?\(?(\d+)\)?$/
          @type = 'single'
          @complement = !!Regexp.last_match(1)
          @start = Regexp.last_match(2).to_i
        else
          fail InvalidLocation, 'This is not a valid Genbank location'
        end
      end

      # Derive start/stop/continuation flags for join-style locations from
      # the parts that do not reference a remote accession.
      #
      # @raise [InvalidLocation] if the first or last local part carries no
      #   usable coordinates
      def set_properties_for_join_types
        non_remote = parts.select { |part| part !~ /[A-Z\d\.]+:/ }
        # Every part lives on another record: there are no local coordinates.
        return if non_remote.empty?

        if non_remote.length == 1
          set_basic_properties(non_remote.first)
          return
        end

        @complement ||= !!(non_remote.first =~ /complement/)
        # start and stop come from the first and last local part; on the
        # complement strand their roles are swapped.
        if complement
          stop_match = /(?<stop_continues><)?(?<stop>\d+)/.match(non_remote.first)
          start_match = /\.\.(?<start_continues>>)?(?<start>\d+)/.match(non_remote.last)
        else
          start_match = /(?<start_continues><)?(?<start>\d+)/.match(non_remote.first)
          stop_match = /\.\.(?<stop_continues>>)?(?<stop>\d+)/.match(non_remote.last)
        end
        unless start_match && stop_match
          fail InvalidLocation, 'This is not a valid Genbank location'
        end

        @start = start_match[:start].to_i
        @stop = stop_match[:stop].to_i
        @start_continues = start_match[:start_continues]
        @stop_continues = stop_match[:stop_continues]
      end

      # Populate the properties from a single base range string.
      #
      # @param str [String] text matching BASERANGE_REGEX
      # @raise [InvalidLocation] if +str+ is not a base range
      def set_basic_properties(str)
        md = BASERANGE_REGEX.match(str)
        fail InvalidLocation, 'This is not a valid Genbank location' unless md

        @complement ||= !!md[:complement]
        @remote_accession = md[:remote_accession]
        # start/stop and their continuation flags are swapped on the
        # complement strand
        @start = complement ? md[:stop].to_i : md[:start].to_i
        @stop = complement ? md[:start].to_i : md[:stop].to_i
        @start_continues = complement ? !!md[:stop_continues] : !!md[:start_continues]
        @stop_continues = complement ? !!md[:start_continues] : !!md[:stop_continues]
      end

      # Extract the sub-sequence this location describes.
      #
      # @param seq [String] the full sequence the coordinates refer to
      # @return [MgNu::Sequence, nil] nil for 'between_*' types and for
      #   joins that contain a remote accession
      def get_sequence(seq)
        s = case type
            when 'complement_with_join', 'join'
              str = buidup_sequence_from_parts(seq)
              str ? build_sequence(str) : nil
            when 'standard'
              if complement
                # start/stop were swapped at parse time, so stop is the lower bound
                build_sequence(seq[stop - 1..start - 1])
              else
                build_sequence(seq[start - 1..stop - 1])
              end
            when 'single'
              build_sequence(seq[start - 1])
            end
        return nil unless s

        whole_complement = %w(single standard).include?(type) && complement
        s.reverse_complement! if whole_complement || type == 'complement_with_join'
        s
      end

      # Concatenate the sub-sequences of all join parts, reverse
      # complementing (lower-case a/c/t/g only) any part wrapped in
      # complement().
      #
      # @param seq [String] the full sequence
      # @return [String, nil] nil as soon as a part has a remote accession
      # @raise [InvalidLocation] if a part is not a base range
      def buidup_sequence_from_parts(seq)
        to_be_joined = ''
        parts.each do |part|
          md = BASERANGE_REGEX.match(part)
          fail InvalidLocation, "Cannot extract sequence for part '#{part}'" unless md
          return nil if md[:remote_accession]

          temp = seq[md[:start].to_i - 1..md[:stop].to_i - 1]
          # Non-destructive tr/reverse: tr! returns nil when nothing was
          # translated (e.g. a run of 'n'), which used to crash the chain.
          temp = temp.tr('actg', 'tgac').reverse if md[:complement]
          to_be_joined += temp
        end
        to_be_joined
      end
      # Correctly spelled alias; the original (misspelled) name is kept for
      # existing callers.
      alias_method :buildup_sequence_from_parts, :buidup_sequence_from_parts

      # Wrap a raw string in a DNA MgNu::Sequence.
      def build_sequence(str)
        MgNu::Sequence.new(:value => str, :type => 'dna')
      end

      # String representation: the raw location, wrapped after a comma onto
      # continuation lines indented by 21 spaces when longer than 58 chars.
      def to_s
        max = 79 - 21 # max length of location line
        return raw_value.dup if raw_value.length <= max

        continuation = "\n" + ' ' * 21
        raw_value.scan(/(.{1,#{max}})(,|$)/)
                 .map { |text, separator| text + separator }
                 .join(continuation)
      end
    end # end of MgNu::Genbank::Location class
  end # end of MgNu::Genbank class
end # end of MgNu module
@@ -0,0 +1,45 @@
1
module MgNu
  class Genbank
    # A feature qualifier (the "/name=value" lines below a feature key).
    class Qualifier
      include MgNu::Parser
      attr_accessor :name, :value, :quoted

      # Create a new Qualifier object.
      #
      # @param opts [Hash] recognised keys:
      #   :name   - qualifier name; surrounding quotes are stripped and the
      #             name is lower-cased
      #   :value  - qualifier value (nil for value-less qualifiers)
      #   :quoted - whether the value is printed inside double quotes
      #             (defaults to false)
      def initialize(opts = {})
        @name = opts.key?(:name) ? strip_quotes(opts[:name]).downcase : nil
        @value = opts.key?(:value) ? opts[:value] : nil
        @quoted = opts.key?(:quoted) ? opts[:quoted] : false
      end

      # String representation: a newline, 21 spaces of indent, "/name" and,
      # when a value is present, "=value" (optionally quoted). Values too
      # long for the first line are broken at the last non-word character
      # that fits; the remainder is wrapped with String#print_multiline.
      # NOTE(review): print_multiline is not defined in this file; presumably
      # a String extension loaded elsewhere in the gem - confirm.
      def to_s
        out = "\n" + ' ' * 21
        out << "/#{name}"
        return out unless value

        out << '='
        out << '"' if quoted
        # room left on the first line: 79 columns minus the indent, the
        # name, the slash and the equal sign (and the opening quote)
        first_line_width = 79 - 21 - (name.length + 2)
        first_line_width -= 1 if quoted
        # a very long name would make the budget non-positive and turn the
        # slices below into negative-index garbage
        first_line_width = 1 if first_line_width < 1

        if value.length > first_line_width
          # Search down to index 1 only: breaking at index 0 would make the
          # first slice empty (the inclusive range 0..-1 used previously
          # even emitted the whole value twice).
          break_at = first_line_width.downto(1).find { |i| value[i] =~ /[^\w-]/ }
          break_at ||= first_line_width
          out << value[0...break_at]
          out << "\n"
          out << (' ' * 21 + value[break_at..-1].print_multiline(79, :indent => 21).strip)
        else
          out << value
        end
        out << '"' if quoted
        out
      end
    end # end Qualifier class
  end # end Genbank class
end # end MgNu module
@@ -0,0 +1,114 @@
1
module MgNu
  class Genbank
    # One REFERENCE block of a GenBank record.
    class Reference
      # A keyword or sub-keyword line starts within the first 11 columns;
      # continuation lines are indented to column 13. Limiting the indent
      # keeps continuation text that happens to begin with an upper-case
      # word (e.g. "USA ...") from being mistaken for a new sub-keyword.
      KEYWORD_LINE_REGEX = /^\s{0,11}[A-Z]+\s/

      attr_accessor :title, :number, :base_range, :authors, :journal
      attr_accessor :consrtm, :pubmed, :remark

      # create a new, empty Reference object
      def initialize
        @title = nil
        @base_range = nil
        @number = nil
        @authors = []
        @consrtm = nil
        @journal = nil
        @pubmed = nil
        @remark = nil
      end

      # Example of the text handled by Reference.parse:
      #
      # REFERENCE   1  (bases 1 to 9334)
      #   AUTHORS   Morowitz,M.J., Denef,V.J., Costello,E.K., Thomas,B.C., Relman,D.A.
      #             and Banfield,J.F.
      #   TITLE     Direct Submission
      #   JOURNAL   Submitted (08-APR-2011) Earth and Planetary Sciences, University of
      #             California - Berkeley, 369 McCone Hall, Berkeley, CA 94720, USA
      #   REMARK    Strain-resolved community genomic analysis of gut microbial
      #             colonization in a premature infant

      # Class method to parse the raw text of one reference.
      #
      # @param raw_string [String] newline separated reference text
      # @return [Reference] reference populated from the recognised keywords;
      #   keywords without a matching setter are ignored
      def self.parse(raw_string)
        ref = Reference.new
        # group every keyword line with the continuation lines that follow it
        records = raw_string.split("\n").slice_before { |l| l =~ KEYWORD_LINE_REGEX }
        records.each do |record|
          line = record.first.chomp
          continuation = record.drop(1)
          if line =~ /^REFERENCE\s+(\d+)/
            ref.number = Regexp.last_match(1).to_i
            if line =~ /\(bases (\d+) to (\d+)\)/
              ref.base_range = Range.new(Regexp.last_match(1).to_i, Regexp.last_match(2).to_i)
            end
          elsif line =~ /AUTHORS\s+(.+)/
            ref.authors = split_authors(([Regexp.last_match(1)] + continuation).join)
          elsif line =~ /^\s*([A-Z]+)\s+(.+)/
            keyword = Regexp.last_match(1)
            content = ([Regexp.last_match(2)] + continuation).join
            # Check for the writer, not the reader: a keyword such as
            # DISPLAY or METHODS matches an Object method but has no setter.
            setter = :"#{keyword.downcase}="
            next unless ref.respond_to?(setter)
            ref.send(setter, content.strip.squeeze(' '))
          end
        end
        ref
      end

      # Split "A,B., C,D. and E,F." into individual author names.
      # "and" is only a separator when surrounded by whitespace, so names
      # that merely contain the letters (Brand, Sandoval) stay intact.
      def self.split_authors(author_line)
        names = author_line.strip.split(/,\s+/)
        last_author = names.pop
        return names unless last_author
        names + last_author.split(/\s+and\s+/)
      end
      private_class_method :split_authors

      # GenBank flat-file representation: keyword in column 1, sub-keywords
      # indented two columns (PUBMED three), content starting in column 13.
      # No trailing newline after the last sub-keyword.
      # NOTE(review): long fields are wrapped with String#print_multiline,
      # which is not defined in this file - confirm where it is loaded.
      def to_s
        header = "#{'REFERENCE'.ljust(12)}#{number}"
        if base_range
          # single digit numbers get an extra space so "(bases" lines up
          header += number.to_s.length == 1 ? '  ' : ' '
          header += "(bases #{base_range.first} to #{base_range.last})"
        end

        sections = []
        sections << "  #{'AUTHORS'.ljust(10)}#{formatted_authors}" if authors.any?
        sections << "  #{'CONSRTM'.ljust(10)}#{consrtm.print_multiline}" if consrtm
        sections << "  #{'TITLE'.ljust(10)}#{title.print_multiline}" if title
        sections << "  #{'JOURNAL'.ljust(10)}#{journal.print_multiline}" if journal
        sections << "   #{'PUBMED'.ljust(9)}#{pubmed}" if pubmed
        sections << "  #{'REMARK'.ljust(10)}#{remark.print_multiline}" if remark
        # joining guarantees exactly one newline between sections whichever
        # of them are present
        "#{header}\n#{sections.join("\n")}"
      end

      private

      # Author list as printed on the AUTHORS line.
      def formatted_authors
        case authors.length
        when 1
          authors[0].to_s
        when 2
          "#{authors[0]} and #{authors[1]}"
        else
          "#{authors[0...-1].join(', ')} and #{authors[-1]}".print_multiline
        end
      end
    end # end MgNu::Genbank::Reference class
  end # end of MgNu::Genbank class
end # end of MgNu module
@@ -0,0 +1,39 @@
1
module MgNu
  class Genbank
    # The SOURCE / ORGANISM block of a GenBank record.
    class Source
      attr_accessor :organism, :common_name, :lineage

      # @param common_name [String, nil] text of the SOURCE line
      # @param organism [String] text of the ORGANISM line
      # @param lineage [String] taxonomic lineage without the final period
      def initialize(common_name = nil, organism = '', lineage = '')
        @common_name = common_name
        @organism = organism
        @lineage = lineage
      end

      # Class method for parsing a buffer of Source data.
      #
      # The lineage starts at the first line containing a "taxon;" entry;
      # every non-blank line after that belongs to the lineage too, so the
      # final line (which ends in a period and may hold no semicolon) is
      # kept. Lines between ORGANISM and the first lineage line that carry
      # no semicolon are ignored.
      #
      # @param buffer [#each] lines of the SOURCE block
      # @return [Source]
      def self.parse(buffer)
        s = Source.new
        buffer.each do |line|
          if line =~ /^SOURCE\s+(.+)$/
            s.common_name = Regexp.last_match(1).strip.squeeze(' ')
          elsif line =~ /ORGANISM\s+(.+)/
            s.organism += Regexp.last_match(1).strip.squeeze(' ')
          elsif !s.lineage.empty? || line =~ /\w;(?:\s|$)/ # lineage line reached
            temp = line.strip.squeeze(' ')
            next if temp.empty?
            s.lineage += s.lineage.empty? ? temp : " #{temp}"
          end
        end
        s.lineage.chop! if s.lineage =~ /(.+)\.$/ # remove period at the end
        s
      end

      # GenBank flat-file representation: SOURCE in column 1, ORGANISM
      # indented two columns, lineage starting in column 13 and terminated
      # by a period.
      # NOTE(review): wrapping relies on String#print_multiline, which is
      # not defined in this file - confirm where it is loaded.
      def to_s
        out = ''
        out << "#{'SOURCE'.ljust(12)}#{common_name}\n"
        out << "  #{'ORGANISM'.ljust(10)}#{organism.print_multiline}\n"
        out << ''.ljust(12) # first lineage line
        out << lineage.print_multiline unless lineage.empty?
        out << '.'
      end
    end # end MgNu::Genbank::Source class
  end # end of MgNu::Genbank class
end # end of MgNu module
@@ -0,0 +1,61 @@
1
+ require 'logger'
2
+
3
module MgNu
  # Process-wide logger shared by everything that mixes in MgNu::Loggable.
  class Logger < ::Logger
    # Severity names accepted by #level, mapped to the ::Logger constants.
    LEVEL_BY_NAME = {
      'DEBUG'   => DEBUG,
      'INFO'    => INFO,
      'WARN'    => WARN,
      'ERROR'   => ERROR,
      'FATAL'   => FATAL,
      'UNKNOWN' => UNKNOWN
    }.freeze

    @@log = nil

    # The shared logger; created on first use, writing to STDOUT at DEBUG
    # level.
    #
    # @return [MgNu::Logger]
    def self.get
      unless @@log
        @@log = new(STDOUT)
        @@log.level = MgNu::Logger::DEBUG
      end
      @@log
    end # end Logger.get

    # Replace the shared logger.
    def self.log=(log)
      @@log = log
    end # end Logger set

    # With no argument: the receiver's current severity threshold, exactly
    # like ::Logger#level. The argument has to be optional because
    # ::Logger itself calls +level+ without arguments when deciding whether
    # to emit a message; a mandatory parameter made every log call raise
    # ArgumentError.
    #
    # With a severity name ("WARN", :info, ...): sets the threshold of the
    # shared logger (not necessarily the receiver) and returns the new
    # level. Unrecognised names are ignored and nil is returned.
    #
    # @param new_level [String, Symbol, nil]
    # @return [Integer, nil]
    def level(new_level = nil)
      return super() if new_level.nil?

      severity = LEVEL_BY_NAME[new_level.to_s.upcase]
      # get is a class method; go through the class so the shared logger is
      # created on demand
      MgNu::Logger.get.level = severity if severity
    end
  end # end MgNu::Logger class

  # mixin to add logging to any class; every method forwards the message
  # to the shared MgNu::Logger at the severity of the same name
  module Loggable
    def debug(msg)
      MgNu::Logger.get.debug(msg)
    end

    def info(msg)
      MgNu::Logger.get.info(msg)
    end

    def warn(msg)
      MgNu::Logger.get.warn(msg)
    end

    def error(msg)
      MgNu::Logger.get.error(msg)
    end

    def fatal(msg)
      MgNu::Logger.get.fatal(msg)
    end

    def unknown(msg)
      MgNu::Logger.get.unknown(msg)
    end
  end # end MgNu::Loggable module
end # end MgNu module
@@ -0,0 +1,50 @@
1
+ module MgNu
2
+ module Parser
3
+ require_relative 'parser/blast'
4
+ require_relative 'parser/clustalw'
5
+ require_relative 'parser/fasta'
6
+ require_relative 'parser/fasta_index'
7
+ require_relative 'parser/fasta_header_index'
8
+ require_relative 'parser/fastq'
9
+ require_relative 'parser/gff'
10
+ require_relative 'parser/genbank'
11
+ #require_relative 'parser/iprscan_file'
12
+ #require_relative 'parser/kegg_ontology_index'
13
+ #require_relative 'parser/sam'
14
+ #require_relative 'parser/pilercr'
15
+ require_relative 'parser/prodigal'
16
+
17
# Remove one pair of surrounding quotes from a string
#
# The whole string has to be wrapped in quotes (a single trailing newline
# is tolerated). Matching is anchored to the string rather than to a line,
# so a multi-line value is never truncated to its first quoted line, and
# an empty quoted string ('""') yields ''.
#
# @param [String] input string to strip
# @return [String] input string with quotes removed; anything that is not
#   wrapped in quotes (including nil) is returned unchanged
def strip_quotes(input)
  return input unless input.respond_to?(:=~)
  input =~ /\A["'](.*)["']\Z/ ? Regexp.last_match(1) : input
end
25
+
26
# Reads a file until the given regexp is found
#
# A matching line only ends the read once at least one line has been
# buffered, so a record's own header line (the first line read) is kept.
#
# NOTE(review): the flag behaves opposite to what its name suggests and
# is kept that way for existing callers: when +discard+ is true (the
# default) the matching line is pushed back onto the file so that the next
# read returns it; when false the matching line is consumed and dropped.
#
# @param [IO] file seekable object to read lines from
# @param [Regexp] regexp pattern marking the start of the next record
# @param [Boolean] discard whether to push the matching line back
# @return [Array<String>] chomped lines from file up to but NOT including
#   the regexp matchline
def parse_until(file, regexp, discard = true)
  buffer = []
  file.each do |line|
    if line =~ regexp && !buffer.empty?
      # found exit condition; seek works in bytes, so rewind by the byte
      # size of the line, not by its character count
      file.seek(-line.bytesize, IO::SEEK_CUR) if discard
      return buffer
    end
    buffer << line.chomp
  end # end of file.each do |line|
  buffer
end # end of parse_until
48
+
49
+ end # end of module Parser
50
+ end # end of module MgNu