mgnu 2.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +0 -0
  3. data/README.md +31 -0
  4. data/Rakefile +33 -0
  5. data/lib/mgnu.rb +9 -0
  6. data/lib/mgnu/alignment.rb +143 -0
  7. data/lib/mgnu/common.rb +68 -0
  8. data/lib/mgnu/genbank.rb +117 -0
  9. data/lib/mgnu/genbank/feature.rb +84 -0
  10. data/lib/mgnu/genbank/location.rb +150 -0
  11. data/lib/mgnu/genbank/qualifier.rb +45 -0
  12. data/lib/mgnu/genbank/reference.rb +114 -0
  13. data/lib/mgnu/genbank/source.rb +39 -0
  14. data/lib/mgnu/loggable.rb +61 -0
  15. data/lib/mgnu/parser.rb +50 -0
  16. data/lib/mgnu/parser/blast.rb +87 -0
  17. data/lib/mgnu/parser/blast/format0.rb +290 -0
  18. data/lib/mgnu/parser/blast/format7.rb +121 -0
  19. data/lib/mgnu/parser/blast/format8.rb +120 -0
  20. data/lib/mgnu/parser/blast/hsp.rb +75 -0
  21. data/lib/mgnu/parser/blast/query.rb +45 -0
  22. data/lib/mgnu/parser/blast/sbjct.rb +62 -0
  23. data/lib/mgnu/parser/clustalw.rb +72 -0
  24. data/lib/mgnu/parser/fasta.rb +61 -0
  25. data/lib/mgnu/parser/fasta_header_index.rb +39 -0
  26. data/lib/mgnu/parser/fasta_index.rb +57 -0
  27. data/lib/mgnu/parser/fastq.rb +61 -0
  28. data/lib/mgnu/parser/genbank.rb +187 -0
  29. data/lib/mgnu/parser/gff.rb +56 -0
  30. data/lib/mgnu/parser/iprscan/hit.rb +76 -0
  31. data/lib/mgnu/parser/iprscan_file.rb +39 -0
  32. data/lib/mgnu/parser/kegg_ontology_index.rb +163 -0
  33. data/lib/mgnu/parser/pilercr.rb +102 -0
  34. data/lib/mgnu/parser/prodigal.rb +170 -0
  35. data/lib/mgnu/parser/sam.rb +115 -0
  36. data/lib/mgnu/parser/sam/alignment.rb +22 -0
  37. data/lib/mgnu/parser/sam/header.rb +23 -0
  38. data/lib/mgnu/parser/sam/pair.rb +18 -0
  39. data/lib/mgnu/sequence.rb +207 -0
  40. data/lib/mgnu/sequence/fasta.rb +79 -0
  41. data/lib/mgnu/sequence/fastq.rb +43 -0
  42. data/lib/mgnu/version.rb +16 -0
  43. data/mgnu.gemspec +39 -0
  44. data/spec/mgnu/parser/blast_format0_spec.rb +114 -0
  45. data/spec/mgnu/parser/blast_format7_spec.rb +24 -0
  46. data/spec/mgnu/parser/blast_format8_spec.rb +26 -0
  47. data/spec/mgnu/parser/blast_multihsp_spec.rb +100 -0
  48. data/spec/mgnu/parser/blast_oof_spec.rb +53 -0
  49. data/spec/mgnu/parser/clustalw_spec.rb +90 -0
  50. data/spec/mgnu/parser/fasta_header_index_tc_parser_spec.rb +25 -0
  51. data/spec/mgnu/parser/fasta_index_tc_parser_spec.rb +25 -0
  52. data/spec/mgnu/parser/fasta_parser_spec.rb +53 -0
  53. data/spec/mgnu/parser_spec.rb +22 -0
  54. data/spec/mgnu/sequence/fasta_spec.rb +60 -0
  55. data/spec/mgnu/sequence/fastq_spec.rb +31 -0
  56. data/spec/mgnu/sequence_spec.rb +81 -0
  57. data/spec/mgnu_spec.rb +7 -0
  58. data/spec/spec_helper.rb +53 -0
  59. metadata +376 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: f41f975c84c1e898266e72770b0ba76c8ab4d42d3c50d53944bc4ddd74e1e0aa
4
+ data.tar.gz: 646a09a7ff525576d25ceb3e3b4250259263b418be11a2b24a5429cebec752c4
5
+ SHA512:
6
+ metadata.gz: b1945190d6d72e495e0eabe909122c8db9afe6fde8aff015af4d8b4d53298741cdeda8ba5351c2010524c623ce5ee869c3f7b2a2417dabbb8ebf6e0226893e58
7
+ data.tar.gz: 5600bc2b2cf0f3bec4f619261a9c230265d29c0e809a044391bcaf098dcd132c2bb735599e6a224c039efd4201fa2e6ab444f5f4f4fa93fa6c395a261ebbb95d
File without changes
@@ -0,0 +1,31 @@
1
+ <<<<<<< HEAD
2
+ # mgnu
3
+ Metagenomi Nu, a fast and small bioinformatics support library
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'mgnu'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install mgnu
20
+
21
+ ## Usage
22
+
23
+ TODO: Write usage instructions here
24
+
25
+ ## Contributing
26
+
27
+ 1. Fork it ( https://github.com/[my-github-username]/mgnu/fork )
28
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
29
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
30
+ 4. Push to the branch (`git push origin my-new-feature`)
31
+ 5. Create a new Pull Request
@@ -0,0 +1,33 @@
1
# Rake tasks for the gem: packaging, specs, linting and documentation metrics.
require 'bundler'
require 'rspec/core/rake_task'

# build / install / release tasks provided by Bundler
Bundler::GemHelper.install_tasks

# threshold handed to the yardstick verification task below
THRESHOLD_MIN = 70.0

# `rake spec` runs the RSpec suite; `rake test` is an alias for it
RSpec::Core::RakeTask.new(:spec)
task test: :spec

# RuboCop is optional: when it cannot be loaded, register a stand-in task
# that only reports that linting is unavailable
begin
  require 'rubocop/rake_task'
  RuboCop::RakeTask.new
rescue LoadError
  task(:rubocop) { $stderr.puts 'Rubocop is disabled' }
end

# API documentation
require 'yard'
YARD::Rake::YardocTask.new

# documentation measurement report
require 'yardstick/rake/measurement'
Yardstick::Rake::Measurement.new { |measurement| measurement.output = 'measurement/report.txt' }

# documentation verification against THRESHOLD_MIN
require 'yardstick/rake/verify'
Yardstick::Rake::Verify.new { |verify| verify.threshold = THRESHOLD_MIN }

# only the specs run by default (:rubocop and :verify_measurements are not wired in)
task default: [:spec]
@@ -0,0 +1,9 @@
1
# Entry point of the gem: loads every top-level component in dependency order.
#
# NOTE(review): this file declares an (empty) `Nu` namespace although the code
# it loads lives under `MgNu` -- looks like a leftover typo; confirm before
# renaming, since removing the constant is visible to callers.
module Nu
end

%w[version loggable common sequence alignment parser genbank].each do |component|
  require "mgnu/#{component}"
end
@@ -0,0 +1,143 @@
1
module MgNu
  # A set of aligned sequences keyed by name, with helpers to walk the
  # alignment column by column and to build a conservation line.
  #
  # +sequences+ is a Hash of name => aligned sequence String. +order+ is an
  # optional Array of names fixing the row order; without it the Hash's own
  # order is used.
  class Alignment
    include Enumerable

    # Residue groups used by #match to grade a column as strongly (':') or
    # weakly ('.') conserved. Each group is stored as a sorted array of
    # single-letter residues.
    StrongConservationGroups = %w(STA NEQK NHQK NDEQ QHRK MILV MILF HY FYW).map { |group| group.chars.sort }.freeze
    WeakConservationGroups = %w(CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY).map { |group| group.chars.sort }.freeze

    attr_reader :length
    attr_accessor :sequences, :order

    # Create a new Alignment object.
    #
    # @param sequences [Hash{Object => String}] aligned sequences by name
    # @param order [Array, nil] names in display order (optional)
    #
    # The alignment length is taken from the first sequence; an empty
    # alignment has length 0 instead of raising.
    def initialize(sequences, order = nil)
      @sequences = sequences
      @order = order
      first = sequences[sequences.keys[0]]
      @length = first ? first.length : 0
    end

    # Yield every sequence, following +order+ when one was given.
    # Returns an Enumerator when called without a block.
    def each
      return enum_for(:each) unless block_given?

      if @order.nil?
        @sequences.each_value { |seq| yield seq }
      else
        @order.each { |name| yield @sequences[name] }
      end
    end

    # Returns an array of columns; each column is an array holding the
    # character of every sequence at that position.
    #
    # @param range [Range, Integer, nil] a range of positions, a single
    #   position, or nil for the whole alignment
    # @return [Array<Array>, nil] the columns, or nil when a block is given
    #   (each column is then yielded instead)
    #
    # NOTE(review): positions are 1-based when the alignment has an explicit
    # +order+, but are handed straight to String#[] (0-based) when it has
    # none. That asymmetry is preserved here because callers may rely on it.
    def each_position(range = nil)
      columns = columns_for(range, rows_for(range))
      return columns unless block_given?

      columns.each { |column| yield column }
      nil
    end

    # Shorthand for #each_position without a block.
    def [](range = nil)
      each_position(range)
    end

    # Build the conservation line for the whole alignment, or for the part
    # selected by +range+: one symbol per column ('*' identical, ':' strong
    # group, '.' weak group, ' ' otherwise or when the column has a gap).
    def match(range = nil)
      each_position(range).map { |column| conservation_symbol(column) }.join
    end

    # "name: sequence" lines followed by the conservation line.
    # Falls back to the Hash order when no explicit order was given.
    def to_s
      names = order || sequences.keys
      rows = names.map { |name| "#{name}: #{sequences[name]}\n" }
      rows.join + match + "\n"
    end

    private

    # One entry per sequence: its characters (Range / nil) or the single
    # character at the requested position (Integer).
    def rows_for(range)
      one_based = !@order.nil?
      (@order || @sequences.keys).map do |name|
        seq = @sequences[name]
        case range
        when Range
          # correct for 0 indexed strings when positions are 1-based
          slice = one_based ? seq[(range.begin - 1..range.end - 1)] : seq[range]
          slice.split(//)
        when Integer
          one_based ? seq[range - 1].chr : seq[range]
        else
          seq.split(//)
        end
      end
    end

    # Transpose the rows into columns for the requested positions.
    def columns_for(range, rows)
      case range
      when Range
        range.map { |pos| rows.map { |row| row[pos - range.begin] } }
      when Integer
        # rows already holds exactly one character per sequence
        [rows.dup]
      else
        (0...@length).map { |pos| rows.map { |row| row[pos] } }
      end
    end

    # Conservation symbol for a single column.
    def conservation_symbol(column)
      # a gap anywhere in the column means no conservation is reported
      return ' ' if column.include?('-')

      residues = column.map(&:upcase).sort.uniq
      if residues.length == 1
        '*'
      elsif StrongConservationGroups.any? { |group| (residues - group).empty? }
        ':'
      elsif WeakConservationGroups.any? { |group| (residues - group).empty? }
        '.'
      else
        ' '
      end
    end
  end # end MgNu::Alignment class
end # end MgNu module
@@ -0,0 +1,68 @@
1
module MgNu
  # codon table 11 from http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
  # standard bacteria/archaea/plastid codes
  #
  # Maps a lower-case DNA codon to its one-letter amino acid; '*' marks a
  # stop codon. Frozen so the shared table cannot be altered by accident.
  BACTERIA_CODONS = {
    'ttt' => 'F', 'tct' => 'S', 'tat' => 'Y', 'tgt' => 'C',
    'ttc' => 'F', 'tcc' => 'S', 'tac' => 'Y', 'tgc' => 'C',
    'tta' => 'L', 'tca' => 'S', 'taa' => '*', 'tga' => '*',
    'ttg' => 'L', 'tcg' => 'S', 'tag' => '*', 'tgg' => 'W',

    'ctt' => 'L', 'cct' => 'P', 'cat' => 'H', 'cgt' => 'R',
    'ctc' => 'L', 'ccc' => 'P', 'cac' => 'H', 'cgc' => 'R',
    'cta' => 'L', 'cca' => 'P', 'caa' => 'Q', 'cga' => 'R',
    'ctg' => 'L', 'ccg' => 'P', 'cag' => 'Q', 'cgg' => 'R',

    'att' => 'I', 'act' => 'T', 'aat' => 'N', 'agt' => 'S',
    'atc' => 'I', 'acc' => 'T', 'aac' => 'N', 'agc' => 'S',
    'ata' => 'I', 'aca' => 'T', 'aaa' => 'K', 'aga' => 'R',
    'atg' => 'M', 'acg' => 'T', 'aag' => 'K', 'agg' => 'R',

    'gtt' => 'V', 'gct' => 'A', 'gat' => 'D', 'ggt' => 'G',
    'gtc' => 'V', 'gcc' => 'A', 'gac' => 'D', 'ggc' => 'G',
    'gta' => 'V', 'gca' => 'A', 'gaa' => 'E', 'gga' => 'G',
    'gtg' => 'V', 'gcg' => 'A', 'gag' => 'E', 'ggg' => 'G'
  }.freeze
end
24
+
25
+ # example usage of Regexp#global_match
26
+ # re = /(\w+)/
27
+ # words = []
28
+ # re.global_match("cat dog house") do |m|
29
+ # words.push(m[0])
30
+ # end
31
+ # p words # ["cat", "dog", "house"]
32
class Regexp
  # Repeatedly match this pattern against +str+, handing the MatchData of
  # every match to the given block.
  #
  # Each match is cut out of a working copy of the string before the next
  # attempt, so every MatchData refers to the string as it looks after the
  # earlier matches were removed. Returns true when something was removed,
  # nil otherwise.
  def global_match(str, &handler)
    found = nil
    remaining = str
    loop do
      shortened = remaining.sub(self) do
        handler.call(Regexp.last_match) # pass MatchData obj
        ''
      end
      # nothing was removed: no (further) match, we are done
      return found if shortened == remaining

      remaining = shortened
      found = true
    end
  end # end of global_match
end # end of Regexp class
46
+
47
+ # add print_multiline method to String class
48
class String
  # Wrap the string for multi-line output.
  #
  # The first line is emitted as is; every following line is prefixed with
  # an indent of options[:indent] spaces (12 when not given). Each line holds
  # at most +width+ minus the indent characters. Text is broken at spaces
  # where possible, otherwise a solid run is cut at that length.
  #
  # Returns nil for an empty string.
  def print_multiline(width=80, options={})
    return unless length > 0

    pad = ' ' * (options[:indent] || 12)
    room = width - pad.length
    # string broken up with spaces or solid string
    pieces = scan(/(.{1,#{room}})(?: +|$)\n?|(.{#{room}})/)

    # first line goes out without indent ...
    head = pieces.first[0] || pieces.first[1]
    # ... all other lines with indent
    tail = pieces.drop(1).map { |wrapped, solid| "#{pad}#{wrapped || solid}" }
    [head, *tail].join("\n")
  end # end of print_multiline
end
@@ -0,0 +1,117 @@
1
+ require 'forwardable'
2
+ require 'mgnu/genbank/feature'
3
+ require 'mgnu/genbank/location'
4
+ require 'mgnu/genbank/qualifier'
5
+ require 'mgnu/genbank/reference'
6
+ require 'mgnu/genbank/source'
7
+
8
module MgNu
  # In-memory representation of one GenBank record. The attributes hold the
  # individual sections of the record; #to_s writes them back out in the
  # order given by STRUCTURE, terminated by '//'.
  class Genbank
    attr_accessor :locus, :definition, :accession, :secondary_accession, :version, :dblink
    attr_accessor :geninfo_identifier, :keywords, :segment, :source, :references, :comment
    attr_accessor :features, :sequence
    include MgNu::Loggable
    extend Forwardable

    # Sections in the order #to_s emits them.
    STRUCTURE = [:locus, :definition, :accession, :version, :dblink,
                 :keywords, :segment, :source, :references, :comment,
                 :features, :sequence].freeze

    # The fields of the LOCUS line.
    Locus = Struct.new :name, :length, :no_of_strands, :molecule_type, :molecule_structure, :genbank_division, :modification_date do
      # Fixed-width LOCUS line; the trailing comments give the columns each
      # field occupies. Fields are stringified first, so numeric lengths or
      # missing values do not raise.
      def to_s
        [
          'LOCUS'.ljust(12),                    # 1-12
          name.to_s.ljust(17),                  # 13-29
          length.to_s.rjust(11),                # 30-41
          ' bp ',                               # 41-44
          "#{no_of_strands}".ljust(3),          # ss- ds- ms-, 45-47
          "#{molecule_type}".ljust(8),          # 48-55
          "#{molecule_structure}".ljust(8),     # linear or circular, 56-63
          " #{genbank_division} ",              # 65-68
          modification_date.to_s                # 69
        ].join
      end
    end

    # create a new Genbank object
    def initialize
      @locus = nil
      @definition = ''
      @accession = ''
      @secondary_accession = []
      @dblink = ''
      @version = ''
      @geninfo_identifier = ''
      @keywords = nil
      @segment = ''
      @source = nil
      @references = []
      @comment = ''
      @features = []
      @sequence = ''
    end

    # Every LOCUS field is readable directly on the record. All of them are
    # forwarded to @locus (the first argument of def_delegators is the
    # delegation target, so it has to be repeated for each call).
    def_delegators :@locus, :name, :length, :no_of_strands, :molecule_type
    def_delegators :@locus, :molecule_structure, :genbank_division, :modification_date

    # string representation
    #
    # Sections without content are skipped. KEYWORDS is always written (as a
    # lone '.' when there are none). Relies on String#print_multiline, which
    # this library adds to String.
    def to_s
      str = ''
      STRUCTURE.each do |part|
        rendered = render_part(part)
        next unless rendered

        str << rendered
        # print newline character if there are more parts
        str << "\n" unless part == STRUCTURE.last
      end
      str << '//'
    end

    private

    # Text for one section, or nil when the section has nothing to print.
    def render_part(part)
      value = send(part)
      case part
      when :locus, :source
        value.to_s if value
      when :definition, :dblink, :segment, :comment
        return if blank?(value)

        text = part.to_s.upcase.ljust(12) + value.print_multiline
        part == :definition ? text + '.' : text
      when :accession
        return if blank?(value)

        text = 'ACCESSION'.ljust(12) + accession
        extras = Array(secondary_accession)
        text += " #{extras.join(' ')}" if extras.any?
        text
      when :version
        return if blank?(value)

        text = 'VERSION'.ljust(12) + version
        # the identifier defaults to '', which must not produce a bare " GI:"
        text += " GI:#{geninfo_identifier}" unless geninfo_identifier.to_s.empty?
        text
      when :features, :references
        return if blank?(value)

        # header padded to column 22, in line with the 21-column qualifier
        # indent used by the feature parser
        header = part == :features ? "#{'FEATURES'.ljust(21)}Location/Qualifiers\n" : ''
        header + value.map(&:to_s).join("\n")
      when :sequence
        # a freshly created record holds a plain '' here, which has neither
        # #value nor #to_genbank; only sequence objects are printed
        return unless value.respond_to?(:value) && !value.value.empty?

        "#{'ORIGIN'.ljust(12)}\n" + value.to_genbank
      when :keywords
        text = 'KEYWORDS'.ljust(12)
        # print_multiline returns nil for '', so skip it for an empty list
        text += value.join('; ').print_multiline unless blank?(value)
        text + '.'
      end
    end

    # true for nil and for empty strings / arrays
    def blank?(value)
      value.nil? || value.empty?
    end
  end # end of MgNu::Genbank class
end # end of MgNu module
117
+ __END__
@@ -0,0 +1,84 @@
1
module MgNu
  class Genbank
    # One entry of a GenBank feature table: a feature type, its location and
    # the list of qualifiers attached to it.
    #
    # Qualifier values can be read as methods named after the qualifier
    # (feature.gene, feature.note, ...).
    class Feature
      attr_accessor :feature_type, :qualifiers, :location, :sequence
      attr_accessor :start_continues, :stop_continues, :raw_qualifiers

      # feature type and (beginning of) location line
      FEATURE_LINE = /^\s{5}([\w\-\*']+)\s+(.+)$/
      # "/key=value"; the key stops at the FIRST '=' so values may contain '='
      QUALIFIER_WITH_VALUE = /^\s{21}\/(.+?)=(.+)$/
      # qualifier name w/out value
      QUALIFIER_WITHOUT_VALUE = /^\s{21}\/(.+)$/
      # any line that opens a qualifier
      QUALIFIER_START = /^\s{21}\//
      # a line that opens a qualifier carrying a value
      VALUED_QUALIFIER_START = /^\s{21}\/(?:.+?)=/
      # "/key=value" anywhere in the line
      INLINE_QUALIFIER = /\/.+=.+/

      # create a new Feature object
      def initialize
        @qualifiers = []
        @raw_qualifiers = []
      end

      # for handling tags in gb format
      #
      # Returns the value of the qualifier named like the called method, an
      # array of values when the qualifier occurs more than once, and nil
      # when there is no such qualifier.
      def method_missing(method_name, *args)
        wanted = method_name.to_s
        values = @qualifiers.select { |q| q.name == wanted }.map { |q| q.value }
        case values.length
        when 0 then nil
        when 1 then values.first
        else values
        end
      end

      # respond_to? is true for the names of the qualifiers present
      def respond_to_missing?(method_name, include_private = false)
        wanted = method_name.to_s
        @qualifiers.any? { |q| q.name == wanted } || super
      end

      # class method for parsing a gb entry in a buffer
      #
      # @param buffer [String] the lines of one feature, separated by "\n"
      # @return [Feature]
      # @raise [RuntimeError] for a line that is neither a feature line nor
      #   a qualifier
      def self.parse(buffer)
        pending = buffer.split("\n")
        feature = Feature.new # create a new feature
        index = 0 # position of the current logical line, used in the error message

        until pending.empty?
          line = pending.shift
          if (m = FEATURE_LINE.match(line))
            feature.feature_type = m[1]
            loc = m[2]
            # pull in continuation lines of the location
            loc += pending.shift.lstrip until pending.empty? || location_done?(pending.first)
            feature.location = Location.new(loc)
          elsif (m = QUALIFIER_WITH_VALUE.match(line))
            key = m[1]
            value = m[2]
            # to handle multi-line qualifier values
            value += " #{pending.shift.lstrip}" until pending.empty? || value_done?(value, pending.first)

            # parse out quotes
            quoted = false
            if value =~ /^"(.+)"$/
              value = Regexp.last_match[1]
              quoted = true # some qualifier values are part of a controlled vocabulary and, as such, unquoted
            end
            # make sure sequence contains no spaces
            value.gsub!(/\s/, '') if key == 'translation'
            # add new qualifier to feature
            feature.qualifiers << Qualifier.new(:name => key, :value => value.squeeze(' '), :quoted => quoted)
          elsif (m = QUALIFIER_WITHOUT_VALUE.match(line))
            feature.qualifiers << Qualifier.new(:name => m[1])
          else
            raise "UNKNOWN FEATURE LINE TYPE: #{line} -- #{index}"
          end
          index += 1
        end # end loop through buffer
        feature
      end

      # The location ends at the first line that carries or opens a qualifier
      # (including value-less ones such as "/pseudo").
      def self.location_done?(next_line)
        !!(next_line =~ INLINE_QUALIFIER || next_line =~ QUALIFIER_START)
      end
      private_class_method :location_done?

      # A value ends at the next "/key=" line. It also ends at a value-less
      # qualifier line, but only when no quoted value is still open.
      def self.value_done?(value, next_line)
        return true if next_line =~ VALUED_QUALIFIER_START

        open_quote = value.start_with?('"') && value.count('"').odd?
        !open_quote && !!(next_line =~ QUALIFIER_START)
      end
      private_class_method :value_done?

      # string representation of Feature
      def to_s
        (' ' * 5) + feature_type.ljust(16) + location.to_s + qualifiers.map { |q| q.to_s }.join
      end
    end # end MgNu::Genbank::Feature class
  end # end MgNu::Genbank class
end # end MgNu module