mgnu 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +0 -0
  3. data/README.md +31 -0
  4. data/Rakefile +33 -0
  5. data/lib/mgnu.rb +9 -0
  6. data/lib/mgnu/alignment.rb +143 -0
  7. data/lib/mgnu/common.rb +68 -0
  8. data/lib/mgnu/genbank.rb +117 -0
  9. data/lib/mgnu/genbank/feature.rb +84 -0
  10. data/lib/mgnu/genbank/location.rb +150 -0
  11. data/lib/mgnu/genbank/qualifier.rb +45 -0
  12. data/lib/mgnu/genbank/reference.rb +114 -0
  13. data/lib/mgnu/genbank/source.rb +39 -0
  14. data/lib/mgnu/loggable.rb +61 -0
  15. data/lib/mgnu/parser.rb +50 -0
  16. data/lib/mgnu/parser/blast.rb +87 -0
  17. data/lib/mgnu/parser/blast/format0.rb +290 -0
  18. data/lib/mgnu/parser/blast/format7.rb +121 -0
  19. data/lib/mgnu/parser/blast/format8.rb +120 -0
  20. data/lib/mgnu/parser/blast/hsp.rb +75 -0
  21. data/lib/mgnu/parser/blast/query.rb +45 -0
  22. data/lib/mgnu/parser/blast/sbjct.rb +62 -0
  23. data/lib/mgnu/parser/clustalw.rb +72 -0
  24. data/lib/mgnu/parser/fasta.rb +61 -0
  25. data/lib/mgnu/parser/fasta_header_index.rb +39 -0
  26. data/lib/mgnu/parser/fasta_index.rb +57 -0
  27. data/lib/mgnu/parser/fastq.rb +61 -0
  28. data/lib/mgnu/parser/genbank.rb +187 -0
  29. data/lib/mgnu/parser/gff.rb +56 -0
  30. data/lib/mgnu/parser/iprscan/hit.rb +76 -0
  31. data/lib/mgnu/parser/iprscan_file.rb +39 -0
  32. data/lib/mgnu/parser/kegg_ontology_index.rb +163 -0
  33. data/lib/mgnu/parser/pilercr.rb +102 -0
  34. data/lib/mgnu/parser/prodigal.rb +170 -0
  35. data/lib/mgnu/parser/sam.rb +115 -0
  36. data/lib/mgnu/parser/sam/alignment.rb +22 -0
  37. data/lib/mgnu/parser/sam/header.rb +23 -0
  38. data/lib/mgnu/parser/sam/pair.rb +18 -0
  39. data/lib/mgnu/sequence.rb +207 -0
  40. data/lib/mgnu/sequence/fasta.rb +79 -0
  41. data/lib/mgnu/sequence/fastq.rb +43 -0
  42. data/lib/mgnu/version.rb +16 -0
  43. data/mgnu.gemspec +39 -0
  44. data/spec/mgnu/parser/blast_format0_spec.rb +114 -0
  45. data/spec/mgnu/parser/blast_format7_spec.rb +24 -0
  46. data/spec/mgnu/parser/blast_format8_spec.rb +26 -0
  47. data/spec/mgnu/parser/blast_multihsp_spec.rb +100 -0
  48. data/spec/mgnu/parser/blast_oof_spec.rb +53 -0
  49. data/spec/mgnu/parser/clustalw_spec.rb +90 -0
  50. data/spec/mgnu/parser/fasta_header_index_tc_parser_spec.rb +25 -0
  51. data/spec/mgnu/parser/fasta_index_tc_parser_spec.rb +25 -0
  52. data/spec/mgnu/parser/fasta_parser_spec.rb +53 -0
  53. data/spec/mgnu/parser_spec.rb +22 -0
  54. data/spec/mgnu/sequence/fasta_spec.rb +60 -0
  55. data/spec/mgnu/sequence/fastq_spec.rb +31 -0
  56. data/spec/mgnu/sequence_spec.rb +81 -0
  57. data/spec/mgnu_spec.rb +7 -0
  58. data/spec/spec_helper.rb +53 -0
  59. metadata +376 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: f41f975c84c1e898266e72770b0ba76c8ab4d42d3c50d53944bc4ddd74e1e0aa
4
+ data.tar.gz: 646a09a7ff525576d25ceb3e3b4250259263b418be11a2b24a5429cebec752c4
5
+ SHA512:
6
+ metadata.gz: b1945190d6d72e495e0eabe909122c8db9afe6fde8aff015af4d8b4d53298741cdeda8ba5351c2010524c623ce5ee869c3f7b2a2417dabbb8ebf6e0226893e58
7
+ data.tar.gz: 5600bc2b2cf0f3bec4f619261a9c230265d29c0e809a044391bcaf098dcd132c2bb735599e6a224c039efd4201fa2e6ab444f5f4f4fa93fa6c395a261ebbb95d
File without changes
@@ -0,0 +1,31 @@
1
+
2
+ # mgnu
3
+ Metagenomi Nu, a fast and small bioinformatics support library
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'mgnu'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install mgnu
20
+
21
+ ## Usage
22
+
23
+ TODO: Write usage instructions here
24
+
25
+ ## Contributing
26
+
27
+ 1. Fork it ( https://github.com/[my-github-username]/mgnu/fork )
28
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
29
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
30
+ 4. Push to the branch (`git push origin my-new-feature`)
31
+ 5. Create a new Pull Request
@@ -0,0 +1,33 @@
1
# Rake tasks for the mgnu gem: Bundler gem tasks, the RSpec suite, optional
# RuboCop linting, YARD documentation and Yardstick documentation checks.
require 'bundler'
require 'rspec/core/rake_task'

# Threshold handed to the Yardstick verify task below.
THRESHOLD_MIN = 70.0

# gem helper tasks provided by Bundler
Bundler::GemHelper.install_tasks

# `rake spec` runs the specs; `rake test` simply depends on it.
RSpec::Core::RakeTask.new(:spec)
task :test => [:spec]

# RuboCop is optional: when it cannot be loaded, a placeholder task that only
# reports the fact on stderr is defined instead.
begin
  require 'rubocop/rake_task'
  RuboCop::RakeTask.new
rescue LoadError
  task(:rubocop) { $stderr.puts 'Rubocop is disabled' }
end

# YARD API documentation task
require 'yard'
YARD::Rake::YardocTask.new

# Yardstick measurement report, written to measurement/report.txt
require 'yardstick/rake/measurement'
Yardstick::Rake::Measurement.new { |measurement| measurement.output = 'measurement/report.txt' }

# Yardstick verification against THRESHOLD_MIN
require 'yardstick/rake/verify'
Yardstick::Rake::Verify.new { |verify| verify.threshold = THRESHOLD_MIN }

task :default => [:spec] #, :rubocop, :verify_measurements]
@@ -0,0 +1,9 @@
1
# Entry point of the gem: loads each top-level component of the library.
# The requires are issued from inside a module body, so loading this file also
# defines the constant `Nu`; nothing else is added to that module here.
module Nu
  # components are required in this exact order
  %w(version loggable common sequence alignment parser genbank).each do |component|
    require "mgnu/#{component}"
  end
end
@@ -0,0 +1,143 @@
1
module MgNu
  # A multiple sequence alignment: a collection of equally long, string-like
  # sequences keyed by name, with an optional explicit display order.
  #
  # Enumerable is mixed in; iteration yields the sequences (not the names).
  class Alignment
    include Enumerable

    # Residue groups whose members count as strongly conserved (':' in #match).
    StrongConservationGroups = %w(STA NEQK NHQK NDEQ QHRK MILV MILF HY FYW).collect { |x| x.split('').sort }.freeze
    # Residue groups whose members count as weakly conserved ('.' in #match).
    WeakConservationGroups = %w(CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY).collect { |x| x.split('').sort }.freeze

    attr_reader :length
    attr_accessor :sequences, :order

    # Create a new Alignment object.
    #
    # @param sequences [Hash] sequence name => aligned sequence (string-like)
    # @param order [Array, nil] names in display order; nil means "hash order"
    def initialize(sequences, order = nil)
      @sequences = sequences
      @order = order
      # alignment length is taken from the first sequence; 0 when there is none
      first = sequences[sequences.keys[0]]
      @length = first ? first.length : 0
    end

    # Yield every sequence, following +order+ when one was given.
    #
    # @return [Enumerator] when called without a block
    def each
      return enum_for(:each) unless block_given?

      if @order.nil?
        @sequences.each { |_name, seq| yield seq }
      else
        @order.each { |name| yield @sequences[name] }
      end
    end

    # Build the alignment columns ("positions"): one array per column holding
    # the residue of every sequence at that column.
    #
    # +range+ may be nil (all columns), a Range or an Integer.
    #
    # NOTE(review): indexing depends on whether +order+ is set. With an order,
    # +range+ is shifted down by one before slicing (1-based input); without
    # one it is passed to the sequences unchanged (0-based). That historical
    # behaviour is preserved here -- confirm which one callers expect.
    #
    # @return [Array<Array>] the columns when no block is given
    # @return [nil] when a block is given (each column is yielded instead)
    def each_position(range = nil)
      rows = position_rows(range)
      positions =
        case range
        when Range
          # rows were already cut down to the range, so re-base at its start
          range.map { |pos| rows.map { |row| row[pos - range.begin] } }
        when Integer
          # a single column: every row already is one residue
          [rows.dup]
        else
          (0...@length).map { |pos| rows.map { |row| row[pos] } }
        end

      return positions unless block_given?

      positions.each { |position| yield position }
      nil
    end

    # Shorthand for #each_position.
    def [](range = nil)
      each_position(range)
    end

    # Conservation line for the whole alignment or the requested range:
    # '*' identical column, ':' strong group, '.' weak group, ' ' otherwise
    # (including every column that contains a gap '-').
    #
    # @return [String]
    def match(range = nil)
      each_position(range).map { |column| match_symbol(column) }.join
    end

    # One "name: sequence" line per sequence followed by the #match line.
    # Falls back to the hash order of +sequences+ when no order was given.
    #
    # @return [String]
    def to_s
      names = order || sequences.keys
      names.map { |name| "#{name}: #{sequences[name]}\n" }.join + match + "\n"
    end

    private

    # Per-sequence data for #each_position: an array of residues for a Range
    # or for nil, a single residue for an Integer.
    def position_rows(range)
      ordered = !@order.nil?
      seqs = ordered ? @order.map { |name| @sequences[name] } : @sequences.values
      seqs.map do |seq|
        case range
        when Range
          slice = ordered ? seq[(range.begin - 1..range.end - 1)] : seq[range]
          slice.split(//)
        when Integer
          ordered ? seq[range - 1].chr : seq[range]
        else
          seq.split(//)
        end
      end
    end

    # Conservation symbol for a single column.
    def match_symbol(column)
      return ' ' if column.include?('-')

      residues = column.collect { |c| c.upcase }.sort.uniq
      if residues.length == 1
        '*'
      elsif StrongConservationGroups.any? { |group| (residues - group).empty? }
        ':'
      elsif WeakConservationGroups.any? { |group| (residues - group).empty? }
        '.'
      else
        ' '
      end
    end
  end # end MgNu::Alignment class
end # end MgNu module
@@ -0,0 +1,68 @@
1
module MgNu
  # Codon table 11 from http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
  # (standard bacterial / archaeal / plastid code).
  #
  # Maps a lowercase DNA codon to its one-letter amino acid; '*' marks a stop
  # codon. The table is frozen so the shared constant cannot be modified by
  # accident.
  BACTERIA_CODONS = {'ttt' => 'F', 'tct' => 'S', 'tat' => 'Y', 'tgt' => 'C',
                     'ttc' => 'F', 'tcc' => 'S', 'tac' => 'Y', 'tgc' => 'C',
                     'tta' => 'L', 'tca' => 'S', 'taa' => '*', 'tga' => '*',
                     'ttg' => 'L', 'tcg' => 'S', 'tag' => '*', 'tgg' => 'W',

                     'ctt' => 'L', 'cct' => 'P', 'cat' => 'H', 'cgt' => 'R',
                     'ctc' => 'L', 'ccc' => 'P', 'cac' => 'H', 'cgc' => 'R',
                     'cta' => 'L', 'cca' => 'P', 'caa' => 'Q', 'cga' => 'R',
                     'ctg' => 'L', 'ccg' => 'P', 'cag' => 'Q', 'cgg' => 'R',

                     'att' => 'I', 'act' => 'T', 'aat' => 'N', 'agt' => 'S',
                     'atc' => 'I', 'acc' => 'T', 'aac' => 'N', 'agc' => 'S',
                     'ata' => 'I', 'aca' => 'T', 'aaa' => 'K', 'aga' => 'R',
                     'atg' => 'M', 'acg' => 'T', 'aag' => 'K', 'agg' => 'R',

                     'gtt' => 'V', 'gct' => 'A', 'gat' => 'D', 'ggt' => 'G',
                     'gtc' => 'V', 'gcc' => 'A', 'gac' => 'D', 'ggc' => 'G',
                     'gta' => 'V', 'gca' => 'A', 'gaa' => 'E', 'gga' => 'G',
                     'gtg' => 'V', 'gcg' => 'A', 'gag' => 'E', 'ggg' => 'G'}.freeze
end
24
+
25
# example usage of Regexp#global_match
# re = /(\w+)/
# words = []
# re.global_match("cat dog house") do |m|
#   words.push(m[0])
# end
# p words # ["cat", "dog", "house"]
class Regexp
  # Call the block with the MatchData of every match of this pattern in +str+.
  #
  # Matches are found by repeatedly removing the first match from a working
  # copy of the string and searching again, so later matches are searched in
  # the text that remains after earlier matches were cut out. +str+ itself is
  # not modified.
  #
  # @param str [String] text to search
  # @yieldparam match [MatchData] the current match
  # @return [true, nil] true when at least one removal changed the string,
  #   nil otherwise
  # @return [Enumerator] of MatchData when called without a block
  def global_match(str, &proc)
    # without a block the original ended in a NoMethodError on nil
    return enum_for(:global_match, str) unless proc

    matched = nil
    remaining = str
    loop do
      stripped = remaining.sub(self) do
        proc.call(Regexp.last_match) # pass MatchData obj
        ''
      end
      # nothing was removed: no (further) match, or a zero-width match
      return matched if stripped == remaining
      remaining = stripped
      matched = true
    end
  end # end of global_match
end # end of Regexp class
46
+
47
# add print_multiline method to String class
class String
  # Wrap the string into lines of at most +width+ columns. The first line is
  # returned without indentation (the caller is expected to have written a
  # label in front of it); every following line is prefixed with the indent.
  # Despite the name nothing is printed -- the wrapped text is returned.
  #
  # @param width [Integer] total line width including the indent
  # @param options [Hash] :indent => number of leading spaces (default 12)
  # @return [String] the wrapped text ('' when there is nothing to wrap,
  #   e.g. a string made only of newlines)
  # @return [nil] when the string is empty
  def print_multiline(width=80, options={})
    return if empty?

    indent = ' ' * (options[:indent] || 12)
    usable = width - indent.length
    # each chunk is either text broken at spaces / line end (first group) or
    # a solid run of exactly +usable+ characters (second group)
    chunks = scan(/(.{1,#{usable}})(?: +|$)\n?|(.{#{usable}})/).map { |wrapped, solid| wrapped || solid }
    # no chunk at all used to raise NoMethodError on nil
    return '' if chunks.empty?

    first, *rest = chunks
    [first, *rest.map { |chunk| "#{indent}#{chunk}" }].join("\n")
  end # end of print_multiline
end
@@ -0,0 +1,117 @@
1
+ require 'forwardable'
2
+ require 'mgnu/genbank/feature'
3
+ require 'mgnu/genbank/location'
4
+ require 'mgnu/genbank/qualifier'
5
+ require 'mgnu/genbank/reference'
6
+ require 'mgnu/genbank/source'
7
+
8
module MgNu
  # In-memory representation of a single GenBank flat-file record. The
  # attributes hold the individual sections of the record; #to_s writes them
  # back out in the order given by STRUCTURE.
  class Genbank
    attr_accessor :locus, :definition, :accession, :secondary_accession, :version, :dblink
    attr_accessor :geninfo_identifier, :keywords, :segment, :source, :references, :comment
    attr_accessor :features, :sequence
    include MgNu::Loggable
    extend Forwardable

    # Sections of a record in output order.
    STRUCTURE = [:locus, :definition, :accession, :version, :dblink,
                 :keywords, :segment, :source, :references, :comment,
                 :features, :sequence]

    # Fields of the LOCUS line.
    Locus = Struct.new :name, :length, :no_of_strands, :molecule_type, :molecule_structure, :genbank_division, :modification_date do
      # LOCUS line with every field padded to a fixed width. All fields are
      # converted with to_s first, so nil or numeric values do not raise.
      def to_s
        [
          'LOCUS'.ljust(12),
          name.to_s.ljust(17),
          length.to_s.rjust(11),
          ' bp ',
          no_of_strands.to_s.ljust(3),      # ss- ds- ms-
          molecule_type.to_s.ljust(8),
          molecule_structure.to_s.ljust(8), # linear or circular
          " #{genbank_division} ",
          modification_date.to_s
        ].join
      end
    end

    # create a new Genbank object
    def initialize
      @locus = nil
      @definition = ''
      @accession = ''
      @secondary_accession = []
      @dblink = ''
      @version = ''
      @geninfo_identifier = ''
      @keywords = nil
      @segment = ''
      @source = nil
      @references = []
      @comment = ''
      @features = []
      @sequence = ''
    end

    # All LOCUS fields are readable directly on the record. The first argument
    # of def_delegators is the receiver, so it has to be @locus on both lines;
    # previously the second line delegated to a non-existent
    # `molecule_structure` method instead.
    def_delegators :@locus, :name, :length, :no_of_strands, :molecule_type
    def_delegators :@locus, :molecule_structure, :genbank_division, :modification_date

    # String representation: every section that is present, each followed by
    # a newline (except the last entry of STRUCTURE), terminated by '//'.
    #
    # @return [String]
    def to_s
      str = ''.dup
      STRUCTURE.each do |part|
        section = render_section(part, send(part))
        next if section.nil?

        str << section
        # print newline character if there are more parts
        str << "\n" unless part == STRUCTURE.last
      end
      str << '//'
    end

    private

    # Text of one section, or nil when the section is absent and must be
    # skipped (KEYWORDS is always written, even when empty).
    def render_section(part, value)
      case part
      when :locus, :source
        value.to_s if value
      when :definition, :dblink, :segment, :comment
        return if blank_field?(value)

        text = part.to_s.upcase.ljust(12) + value.print_multiline.to_s
        text << '.' if part == :definition
        text
      when :accession
        return if blank_field?(value)

        text = 'ACCESSION'.ljust(12) + value
        text << " #{secondary_accession.join(' ')}" if secondary_accession.any?
        text
      when :version
        return if blank_field?(value)

        text = 'VERSION'.ljust(12) + value
        # the identifier defaults to '', which must not produce a bare " GI:"
        text << " GI:#{geninfo_identifier}" unless geninfo_identifier.to_s.empty?
        text
      when :features, :references
        return if value.empty?

        header = part == :features ? "FEATURES             Location/Qualifiers\n" : ''
        header + value.collect { |x| x.to_s }.join("\n")
      when :sequence
        return if blank_sequence?(value)

        "#{'ORIGIN'.ljust(12)}\n" + value.to_genbank
      when :keywords
        text = 'KEYWORDS'.ljust(12)
        # an empty keyword list used to append nil and raise TypeError
        text << value.join('; ').print_multiline.to_s if value && !value.empty?
        text << '.'
      end
    end

    # true for nil/false and for empty strings or collections
    def blank_field?(value)
      !value || value.empty?
    end

    # The sequence is '' until a sequence object is assigned; a plain String
    # has no #value, so both shapes are checked here.
    def blank_sequence?(seq)
      return true if seq.nil?

      (seq.respond_to?(:value) ? seq.value : seq).to_s.empty?
    end
  end # end of MgNu::Genbank class
end # end of MgNu module
117
+ __END__
@@ -0,0 +1,84 @@
1
module MgNu
  class Genbank
    # One entry of the FEATURES table of a GenBank record: a feature type, a
    # location and a list of qualifiers.
    class Feature
      attr_accessor :feature_type, :qualifiers, :location, :sequence
      attr_accessor :start_continues, :stop_continues, :raw_qualifiers

      # a qualifier line that carries a value (21 spaces, "/name=")
      QUALIFIER_START = /^\s{21}\/(?:.+?)=/
      # a qualifier line without a value, e.g. "/pseudo"
      VALUELESS_QUALIFIER = /^\s{21}\/\w+\s*$/

      # create a new Feature object
      def initialize
        @qualifiers = []
        @raw_qualifiers = []
      end

      # Qualifier access by name, for handling tags in gb format:
      # feature.gene returns the value of the "gene" qualifier, an array of
      # values when the qualifier occurs several times, and nil for any name
      # without a matching qualifier.
      def method_missing(method_name, *args)
        values = @qualifiers.select { |q| q.name == method_name.to_s }.map { |q| q.value }
        case values.length
        when 0 then nil
        when 1 then values.first
        else values
        end
      end

      # Keeps respond_to? in line with #method_missing for qualifier names.
      def respond_to_missing?(method_name, include_private = false)
        (@qualifiers || []).any? { |q| q.name == method_name.to_s } || super
      end

      # Class method for parsing a gb entry in a buffer.
      #
      # Continuation lines are appended to the entry they belong to and
      # removed from the line array while it is being walked, so the index
      # reported in the error message refers to the already condensed array.
      #
      # @param buffer [String] the lines of one feature, separated by "\n"
      # @return [Feature]
      # @raise [RuntimeError] for a line that is neither a feature line nor a
      #   qualifier line
      def self.parse(buffer)
        buffer = buffer.split("\n")
        feature = Feature.new # create a new feature
        buffer.each_with_index do |line, i|
          if line =~ /^\s{5}([\w\-\*']+)\s+(.+)$/ # feature type and (beginning of) location line
            feature.feature_type = Regexp.last_match[1]
            loc = Regexp.last_match[2]

            # the location continues until the first qualifier line
            while (following = buffer[i + 1])
              break if following =~ /\/.+=.+/ || following =~ VALUELESS_QUALIFIER
              # lstrip (not lstrip!): the bang form returns nil when there is
              # nothing to strip
              loc += following.lstrip
              buffer.delete_at(i + 1)
            end
            feature.location = Location.new(loc)
          elsif line =~ /^\s{21}\/(.+?)=(.+)$/ # non-greedy key: the value may contain '='
            key, value = Regexp.last_match[1], Regexp.last_match[2]

            # to handle multi-line qualifier values
            while (following = buffer[i + 1])
              break if following =~ QUALIFIER_START # next qualifier
              # a value-less qualifier also ends the value, unless a quote is
              # still open and the line is therefore part of the quoted text
              break if following =~ VALUELESS_QUALIFIER && value.count('"').even?
              value += " #{following.lstrip}"
              buffer.delete_at(i + 1)
            end
            # parse out quotes
            quoted = false
            if value =~ /^"(.+)"$/
              value = Regexp.last_match[1]
              quoted = true # some qualifier values are part of a controlled vocabulary and, as such, unquoted
            end
            # make sure sequence contains no spaces
            value = value.gsub(/\s/, '') if key == 'translation'
            # add new qualifier to feature
            feature.qualifiers << Qualifier.new(:name => key, :value => value.squeeze(' '), :quoted => quoted)
          elsif line =~ /^\s{21}\/(.+)$/ # qualifier name w/out value
            feature.qualifiers << Qualifier.new(:name => Regexp.last_match[1])
          else
            raise "UNKNOWN FEATURE LINE TYPE: #{line} -- #{i}"
          end
        end # end loop through buffer
        feature
      end

      # String representation of Feature: five spaces, the feature type padded
      # to 16 columns, the location, then every qualifier's own to_s.
      def to_s
        "#{' ' * 5}#{feature_type.ljust(16)}#{location}" + qualifiers.map { |q| q.to_s }.join
      end
    end # end MgNu::Genbank::Feature class
  end # end MgNu::Genbank class
end # end MgNu module