mgnu 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +0 -0
  3. data/README.md +31 -0
  4. data/Rakefile +33 -0
  5. data/lib/mgnu.rb +9 -0
  6. data/lib/mgnu/alignment.rb +143 -0
  7. data/lib/mgnu/common.rb +68 -0
  8. data/lib/mgnu/genbank.rb +117 -0
  9. data/lib/mgnu/genbank/feature.rb +84 -0
  10. data/lib/mgnu/genbank/location.rb +150 -0
  11. data/lib/mgnu/genbank/qualifier.rb +45 -0
  12. data/lib/mgnu/genbank/reference.rb +114 -0
  13. data/lib/mgnu/genbank/source.rb +39 -0
  14. data/lib/mgnu/loggable.rb +61 -0
  15. data/lib/mgnu/parser.rb +50 -0
  16. data/lib/mgnu/parser/blast.rb +87 -0
  17. data/lib/mgnu/parser/blast/format0.rb +290 -0
  18. data/lib/mgnu/parser/blast/format7.rb +121 -0
  19. data/lib/mgnu/parser/blast/format8.rb +120 -0
  20. data/lib/mgnu/parser/blast/hsp.rb +75 -0
  21. data/lib/mgnu/parser/blast/query.rb +45 -0
  22. data/lib/mgnu/parser/blast/sbjct.rb +62 -0
  23. data/lib/mgnu/parser/clustalw.rb +72 -0
  24. data/lib/mgnu/parser/fasta.rb +61 -0
  25. data/lib/mgnu/parser/fasta_header_index.rb +39 -0
  26. data/lib/mgnu/parser/fasta_index.rb +57 -0
  27. data/lib/mgnu/parser/fastq.rb +61 -0
  28. data/lib/mgnu/parser/genbank.rb +187 -0
  29. data/lib/mgnu/parser/gff.rb +56 -0
  30. data/lib/mgnu/parser/iprscan/hit.rb +76 -0
  31. data/lib/mgnu/parser/iprscan_file.rb +39 -0
  32. data/lib/mgnu/parser/kegg_ontology_index.rb +163 -0
  33. data/lib/mgnu/parser/pilercr.rb +102 -0
  34. data/lib/mgnu/parser/prodigal.rb +170 -0
  35. data/lib/mgnu/parser/sam.rb +115 -0
  36. data/lib/mgnu/parser/sam/alignment.rb +22 -0
  37. data/lib/mgnu/parser/sam/header.rb +23 -0
  38. data/lib/mgnu/parser/sam/pair.rb +18 -0
  39. data/lib/mgnu/sequence.rb +207 -0
  40. data/lib/mgnu/sequence/fasta.rb +79 -0
  41. data/lib/mgnu/sequence/fastq.rb +43 -0
  42. data/lib/mgnu/version.rb +16 -0
  43. data/mgnu.gemspec +39 -0
  44. data/spec/mgnu/parser/blast_format0_spec.rb +114 -0
  45. data/spec/mgnu/parser/blast_format7_spec.rb +24 -0
  46. data/spec/mgnu/parser/blast_format8_spec.rb +26 -0
  47. data/spec/mgnu/parser/blast_multihsp_spec.rb +100 -0
  48. data/spec/mgnu/parser/blast_oof_spec.rb +53 -0
  49. data/spec/mgnu/parser/clustalw_spec.rb +90 -0
  50. data/spec/mgnu/parser/fasta_header_index_tc_parser_spec.rb +25 -0
  51. data/spec/mgnu/parser/fasta_index_tc_parser_spec.rb +25 -0
  52. data/spec/mgnu/parser/fasta_parser_spec.rb +53 -0
  53. data/spec/mgnu/parser_spec.rb +22 -0
  54. data/spec/mgnu/sequence/fasta_spec.rb +60 -0
  55. data/spec/mgnu/sequence/fastq_spec.rb +31 -0
  56. data/spec/mgnu/sequence_spec.rb +81 -0
  57. data/spec/mgnu_spec.rb +7 -0
  58. data/spec/spec_helper.rb +53 -0
  59. metadata +376 -0
@@ -0,0 +1,22 @@
1
module MgNu
  module Parser
    class Sam
      # Plain value holder for one alignment record. Every field is a
      # read/write attribute and is nil until assigned.
      # NOTE(review): field names look like the columns of a SAM alignment
      # line - confirm against the parser that builds these objects.
      class Alignment
        attr_accessor :name, :flag, :hit, :position, :quality, :cigar,
                      :mate_ref, :mate_pos, :distance, :sequence,
                      :query_qual, :other

        # create a new Alignment object
        #
        # @param attributes [Hash] attribute name => value pairs, handed to
        #   {#attributes=}
        def initialize(attributes = {})
          self.attributes = attributes
        end

        # Bulk-assign attributes from a hash. Keys that have no matching
        # public writer on this object are silently ignored.
        #
        # @param attributes [Hash] attribute name (Symbol or String) => value
        def attributes=(attributes = {})
          attributes.each do |key, val|
            writer = "#{key}="
            next unless respond_to?(writer)

            send(writer, val)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
module MgNu
  module Parser
    class Sam
      # Holder for the header records of a SAM file, one attribute per
      # record tag (vn, so, sq, rg, pg, co).
      # NOTE(review): the meaning/shape of each value is decided by the
      # caller that builds the header - confirm against the Sam parser.
      class Header
        include MgNu::Loggable

        attr_accessor :vn, :so, :sq, :rg, :pg, :co

        # create a new Header object
        #
        # The previous implementation only built a local hash and threw it
        # away, so every accessor stayed nil regardless of the input. The
        # values are now stored on the instance.
        #
        # @param options [Hash] values keyed by :vn, :so, :sq, :rg, :pg, :co;
        #   missing keys leave the corresponding attribute nil
        def initialize(options = {})
          options ||= {}
          @vn = options[:vn]
          @so = options[:so]
          @sq = options[:sq]
          @rg = options[:rg]
          @pg = options[:pg]
          @co = options[:co]
        end
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
module MgNu
  module Parser
    # Sam is declared as a class (not a module) here so that this file can be
    # loaded together with the other Sam::* definitions, which all reopen
    # `class Sam`; mixing `module Sam` and `class Sam` raises TypeError.
    class Sam
      # Groups two records that share a name.
      # NOTE(review): presumably the two mates of a paired read - confirm
      # against the code that builds Pair objects.
      class Pair
        attr_accessor :name, :first, :second

        # create a new Pair object
        #
        # @param name [Object] identifier shared by both members
        # @param first [Object] first member of the pair
        # @param second [Object] second member of the pair
        def initialize(name, first, second)
          @name = name
          @first = first
          @second = second
        end
      end
    end
  end
end
@@ -0,0 +1,207 @@
1
+ require 'strscan'
2
+ require_relative 'sequence/fasta'
3
+ require_relative 'sequence/fastq'
4
+
5
module MgNu
  # A nucleotide or amino-acid sequence held as a String (+value+) together
  # with a free-form +type+ tag ('dna', 'rna', 'aa', 'aminoacid', 'protein').
  class Sequence
    attr_accessor :type, :value

    # @param options [Hash]
    # @option options [String] :value the sequence text
    # @option options [String] :sequence alternative key for the sequence
    #   text; wins over :value when both are given
    # @option options [String] :type sequence type tag
    def initialize(options)
      options = {:value => nil, :type => nil}.merge! options
      @value = options[:value]
      @value = options[:sequence] if options.key?(:sequence)
      @type = options[:type]
    end

    alias_method :sequence, :value
    alias_method :sequence=, :value=

    # @return [Boolean] true when type is 'rna'
    def rna?
      @type == 'rna'
    end

    # @return [Boolean] true when type is 'dna'
    def dna?
      @type == 'dna'
    end

    # @return [Boolean] true when type is 'aa', 'aminoacid' or 'protein'
    def aa?
      @type == 'aa' || @type == 'aminoacid' || @type == 'protein'
    end

    alias_method :protein?, :aa?
    alias_method :aminoacid?, :aa?

    # @return [Integer, nil] sequence length, or nil when there is no value
    def length
      @value.length unless @value.nil?
    end

    # Complement of the sequence (case preserved, ambiguity codes included).
    # Characters outside the table are left untouched.
    #
    # For type 'rna' the result uses 'u'; a stray 't' in RNA input is treated
    # like 'u'. (The previous RNA table had no entry for 'a', so 'a' was
    # never complemented.) Any other type uses the DNA table.
    #
    # @return [String] a new string; @value is not modified
    def complement
      if @type == 'rna'
        @value.tr('acgutrymkdhvbACGUTRYMKDHVB', 'ugcaayrkmhdbvUGCAAYRKMHDBV')
      else
        @value.tr('acgtrymkdhvbACGTRYMKDHVB', 'tgcayrkmhdbvTGCAYRKMHDBV')
      end
    end

    # Replaces @value with its complement.
    #
    # @return [String] the new value
    def complement!
      @value = complement
    end

    # @return [String] the complement, reversed
    def reverse_complement
      complement.reverse
    end
    alias_method :revcomp, :reverse_complement

    # Replaces @value with its reverse complement.
    #
    # @return [String] the new value
    def reverse_complement!
      @value = complement.reverse
    end
    alias_method :revcomp!, :reverse_complement!

    # Translate codon by codon using +cdn_table+.
    #
    # Frames 1..3 read the sequence as given; frames 4..6 and -1..-3 read
    # the reverse complement (4 and -1 start at its first base). Any other
    # frame prints a message on stderr and falls back to frame 1. A trailing
    # partial codon is dropped; codons missing from the table become 'X'.
    #
    # @param frame [Integer] reading frame
    # @param cdn_table [Hash{String=>String}] lower-case codon => residue
    # @return [String] translated sequence
    def translate(frame = 1, cdn_table = MgNu::BACTERIA_CODONS)
      template = @value

      case frame
      when 1, 2, 3
        from = frame - 1
      when 4, 5, 6
        from = frame - 4
        template = reverse_complement
      when -1, -2, -3
        from = -1 - frame
        template = reverse_complement
      else
        $stderr.puts 'unknown frame - defaulting to zero (0)'
        from = 0
      end

      nalen = template.length - from
      nalen -= nalen % 3 # ignore the incomplete codon at the end
      template[from, nalen].downcase.gsub(/.{3}/) { |codon| cdn_table[codon] || 'X' }
    end

    # Replaces @value with its translation (see #translate).
    #
    # @return [String] the new value
    def translate!(frame = 1, cdn_table = MgNu::BACTERIA_CODONS)
      @value = translate(frame, cdn_table)
    end

    # Sequence text wrapped at +cols+ characters.
    #
    # A sequence shorter than +cols+ is returned as-is (no newline);
    # otherwise every line, including the last, ends in "\n". A length that
    # is an exact multiple of +cols+ no longer produces an extra empty line.
    # A nil value yields '' and +cols+ below 1 disables wrapping.
    #
    # @param cols [Integer] line width
    # @return [String]
    def to_s(cols = 60)
      return '' if @value.nil?
      return @value if cols < 1 || @value.length < cols

      # exclusive range: never start a (necessarily empty) line at index == length
      (0...@value.length).step(cols).map { |start| @value[start, cols] + "\n" }.join
    end

    # Genbank formatted sequence 6 cols w/10 letters each, right justified line numbers
    #         1 tcctgatctc ctttatagca ctttccgtga aaattgccaa gcgacctgca tgagttccgg
    #        61 gagcgagaac ttctgcattt aactcacgag gagtaacaat atccactcca ggcagattcc
    #       121 tgaaaccctt cagaacatta tccttgttgg atacaactat caaaacgctc ttctttttct
    #
    # @return [String] one newline-terminated row per 60 characters
    def to_genbank
      position = 1
      @value.gsub(/.{1,60}/) do |row|
        grouped = row.gsub(/.{1,10}/, ' \0')
        # double quotes so that \n is a real newline, not backslash + 'n'
        line = format("%9d%s\n", position, grouped)
        position += 60
        line
      end
    end

    # Returns an array of 1-based position ranges after splitting on runs of
    # at least +length+ N/n characters.
    # NOTE(review): a sequence that starts or ends with such a run yields an
    # empty range (e.g. 1..0) for the missing piece.
    #
    # @param length [Integer] minimum run length that splits the sequence
    # @return [Array<Range>]
    def nblocks(length = 10)
      pieces = []
      prev = 1
      seq = StringScanner.new(value)
      while seq.scan_until(/[Nn]{#{length},}/)
        # seq.pos is the 0-based index just past the N run
        pieces << (prev..seq.pos - seq.matched.length)
        prev = seq.pos + 1
      end
      pieces << (prev..value.length) # add last piece
      pieces
    end

    # Case-insensitive Levenshtein (edit) distance to +other+.
    #
    # @param other [MgNu::Sequence, String] sequence to compare with;
    #   subclasses of MgNu::Sequence are accepted
    # @return [Integer] edit distance, or -1 when either side is nil/empty
    #   or +other+ is of an unsupported type
    def levenshtein_distance(other)
      return -1 if @value.nil? || @value == ''
      theirs = comparison_string(other)
      return -1 if theirs.nil? || theirs == ''

      a = @value.upcase
      b = theirs.upcase
      return 0 if a == b

      # m[x][y] = distance between the first x chars of a and first y of b
      m = []
      0.upto(a.length) { |x| m[x] = [x] }
      1.upto(b.length) { |y| m[0] << y }

      1.upto(a.length) do |x|
        1.upto(b.length) do |y|
          cost = a[x - 1] == b[y - 1] ? 0 : 1
          m[x][y] = [m[x - 1][y] + 1, m[x][y - 1] + 1, m[x - 1][y - 1] + cost].min
        end
      end
      m[-1][-1]
    end # end of levenshtein_distance

    alias_method :distance, :levenshtein_distance

    # Fraction of positions that hold the same character (case-insensitive),
    # compared position by position without alignment and divided by the
    # length of the longer sequence. Differing lengths trigger a warning;
    # the unmatched tail counts as mismatches.
    #
    # @param other [MgNu::Sequence, String] sequence to compare with;
    #   subclasses of MgNu::Sequence are accepted
    # @return [Float, Integer] value in 0.0..1.0, or -1 when either side is
    #   nil/empty or +other+ is of an unsupported type
    def percent_identity(other)
      return -1 if @value.nil? || @value == ''
      a = @value
      b = comparison_string(other)
      return -1 if b.nil? || b == ''
      return 1.0 if a == b

      if a.length != b.length
        warn('lengths differ - percent identity is probably inaccurate')
      end

      overlap = [a.length, b.length].min
      match = 0
      overlap.times { |i| match += 1 if a[i].upcase == b[i].upcase }

      match / [a.length, b.length].max.to_f
    end # end of percent_identity

    alias_method :identity, :percent_identity

    # GC fraction of the sequence, formatted with four decimals.
    #
    # '*' and N/n are skipped; characters outside the table are reported on
    # stderr and skipped. Counted as GC: G, C, R, K, S, B, D, V.
    # NOTE(review): R, K, B, D and V are ambiguity codes that are only
    # partly G/C - confirm that counting them in full is intended.
    #
    # @return [String, Integer] e.g. '0.5000'; -1 when the value is
    #   nil/empty or contains no countable character
    def gc_content
      return -1 if @value.nil? || @value == ''
      base2count = {'A' => 0, 'C' => 0, 'G' => 0, 'T' => 0, 'U' => 0,
                    'R' => 0, 'Y' => 0, 'M' => 0, 'K' => 0, 'W' => 0,
                    'S' => 0, 'B' => 0, 'D' => 0, 'H' => 0, 'V' => 0}
      @value.each_char do |base|
        symbol = base.upcase
        next if base == '*' || symbol == 'N'
        if base2count.key?(symbol)
          base2count[symbol] += 1
        else
          $stderr.puts "Unknown character #{symbol}"
        end
      end
      gc = %w(G C R K S B D V).inject(0) { |sum, symbol| sum + base2count[symbol] }
      # the old `a + e.nil? ? 0 : e` parsed as `(a + e.nil?) ? 0 : e` and raised
      total = base2count.values.inject(0) { |sum, count| sum + count }
      return -1 if total.zero?

      format('%.4f', gc.to_f / total)
    end

    private

    # Extracts the raw string to compare against from a Sequence (or any
    # subclass) or a String; returns nil for anything else.
    def comparison_string(other)
      case other
      when MgNu::Sequence then other.value
      when String then other
      end
    end
  end # end of Sequence class
end # end of MgNu module
@@ -0,0 +1,79 @@
1
+ #require 'mgnu/sequence'
2
+
3
module MgNu
  class Sequence
    # A sequence with a FASTA header line. The header is split on whitespace
    # into +header_name+ (first token) and +header_description+ (the rest).
    class Fasta < Sequence
      attr_accessor :header, :header_name, :header_description

      # create a new MgNu::Sequence::Fasta object
      #
      # @param options [Hash] passed on to the superclass initializer
      # @option options [String] :header header text without the leading '>';
      #   when absent, header, header_name and header_description are nil
      def initialize(options)
        super(options)
        options = {:header => nil}.merge! options
        @header = options[:header]
        # to_s: the default header is nil, which used to crash on #split
        name, *rest = @header.to_s.split
        @header_name = name
        @header_description = rest.empty? ? nil : rest.join(' ')
      end

      # split sequence into columns
      #
      # A sequence shorter than +cols+ is returned unwrapped (no newline);
      # otherwise every line ends in "\n". A length that is an exact
      # multiple of +cols+ no longer produces an extra empty line. A nil
      # sequence yields '' and +cols+ below 1 disables wrapping.
      #
      # @param cols [Integer] line width
      # @return [String] a new string
      def sequence_by_columns(cols = 60)
        residues = sequence.to_s
        return residues.dup if cols < 1 || residues.length < cols

        # exclusive range: never start a (necessarily empty) line at index == length
        (0...residues.length).step(cols).map { |start| residues[start, cols] + "\n" }.join
      end

      # override to_s string representation
      #
      # Output is ">header\n" followed by the sequence. A sequence that
      # contains whitespace-separated numbers is treated as quality scores
      # and written 17 scores per line; anything else is wrapped at +cols+
      # (see #sequence_by_columns).
      #
      # @param cols [Integer] line width; -1 means do not wrap
      # @return [String]
      def to_s(cols = 60)
        residues = sequence.to_s
        body =
          if residues =~ /\d+\s+\d+/
            # this is a fasta quality sequence
            residues.split(/\s+/).each_slice(17).map { |row| row.join(' ') + "\n" }.join
          elsif cols == -1 # don't break the sequence up
            residues
          else
            sequence_by_columns(cols)
          end
        ">#{@header}\n#{body}"
      end

      # find runs of N characters in the sequence and split
      #
      # Each piece becomes its own record named "<header_name>_<index>"
      # (index starts at 1) with the original description, written on a
      # single unwrapped line. When nothing is split, #to_s is returned.
      #
      # @param min_n [Integer] minimum length of an N/n run that splits
      # @return [String] FASTA text
      def split_on_n(min_n = 10)
        chunks = sequence.to_s.split(/[nN]{#{min_n},}/)
        return to_s if chunks.length <= 1

        chunks.each_with_index.map do |chunk, i|
          ">#{@header_name}_#{i + 1} #{@header_description}\n#{chunk}\n"
        end.join
      end
    end # end of Fasta class
  end # end of Sequence class
end # end of MgNu module
@@ -0,0 +1,43 @@
1
+ #require 'mgnu/sequence'
2
+
3
module MgNu
  class Sequence
    # A sequence with a FASTQ header and a quality string. The header is
    # split on whitespace into +header_name+ (first token) and
    # +header_description+ (the rest).
    class Fastq < Sequence
      attr_accessor :header, :header_name, :header_description
      attr_accessor :quality, :qualhdr, :qualary, :offset

      # create a new MgNu::Sequence::Fastq object
      #
      # @param options [Hash] passed on to the superclass initializer
      # @option options [String] :header header text without the leading '@'
      # @option options [String] :quality encoded quality string
      # @option options [Integer] :offset value subtracted from each quality
      #   byte by #unpack_quality (default 64)
      # @option options [String] :qualhdr stored as-is when given
      def initialize(options)
        super(options)
        options = {:offset => 64, :header => nil, :quality => nil}.merge! options
        @quality = options[:quality]
        @offset = options[:offset]
        @header = options[:header]
        # to_s: the default header is nil, which used to crash on #split
        name, *rest = @header.to_s.split
        @header_name = name
        @header_description = rest.empty? ? nil : rest.join(' ')
        @qualhdr = options[:qualhdr] if options[:qualhdr]
      end

      # @return [MgNu::Sequence::Fasta] same header and sequence, quality dropped
      def to_fasta
        MgNu::Sequence::Fasta.new(:header => @header, :sequence => sequence)
      end

      # override to_s representation
      #
      # @return [String] four-line FASTQ record with a bare '+' separator
      def to_s
        "@#{@header}\n#{sequence}\n+\n#{@quality}\n"
      end

      # Unpack the quality string and return an array of
      # offset-corrected integers. The result is computed on every call
      # (it is not cached in +qualary+).
      #
      # @param offset [Integer] value to subtract from each quality byte;
      #   defaults to the offset given at construction time
      # @return [Array<Integer>] one score per quality character; empty when
      #   there is no quality string
      def unpack_quality(offset = @offset)
        @quality.to_s.unpack('C*').map { |byte| byte - offset }
      end
    end # end of Fastq class
  end # end of Sequence class
end # end of MgNu module
@@ -0,0 +1,16 @@
1
module MgNu
  # Version number of the library, split into its components.
  class Version
    MAJOR = 2
    MINOR = 1
    PATCH = 1
    PRE = nil

    # string representation of the version
    #
    # Components are joined with '.'; PRE is left out while it is nil.
    #
    # @return [String]
    def self.to_s
      parts = [MAJOR, MINOR, PATCH, PRE]
      parts.reject(&:nil?).join('.')
    end
  end
end
@@ -0,0 +1,39 @@
1
# coding: utf-8
# Gem specification for mgnu. Puts ./lib on the load path so the version
# constant can be read without the gem being installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'mgnu/version'

Gem::Specification.new do |spec|
  spec.name = "mgnu"
  # hand RubyGems the version String rather than the MgNu::Version class
  spec.version = MgNu::Version.to_s
  spec.authors = ["Brian C. Thomas"]
  spec.email = ["bct.x42@gmail.com"]
  spec.summary = %q{MgNu Ruby Bioinformatics Library}
  spec.description = %q{Lightweight ruby bioinformatics library}
  spec.homepage = "http://www.metagenomi.co"
  spec.license = "MIT"

  # Files are listed explicitly / globbed instead of asking git.
  # spec.files = `git ls-files -z`.split("\x0")
  spec.files = %w(.yardopts README.md Rakefile mgnu.gemspec)
  spec.files += Dir.glob('lib/**/*.rb')
  spec.files += Dir.glob('spec/**/*')

  # spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies.
  # NOTE(review): yard and rake are declared as runtime (not development)
  # dependencies - confirm that is intended.
  spec.add_dependency('naught', '~> 0.0', '>= 0.0.1')
  spec.add_dependency('moneta', '~> 0.0', '>= 0.0.1')
  spec.add_dependency('tokyocabinet', '~> 1.29', '>= 1.29.1')
  spec.add_dependency('ox', '~> 2.8', '>= 2.8.2')
  spec.add_dependency('memoizable', '~> 0.0', '>= 0.0.1')
  spec.add_dependency('yard', '~> 0.9', '>= 0.9.11')
  spec.add_dependency('rake', '~> 12.0', '>= 12.0.0')

  # Development dependencies.
  # NOTE(review): '~> 0.0' only admits versions below 1.0, which looks
  # unintended for gems such as rspec and mime-types - confirm.
  spec.add_development_dependency('mime-types', '~> 0.0', '> 0.0.1')
  spec.add_development_dependency('rspec', '~> 0.0', '> 0.0.1')
  spec.add_development_dependency('rubocop', '~> 0.49', '>= 0.49.0')
  spec.add_development_dependency('timecop', '~> 0.0', '> 0.0.1')
  spec.add_development_dependency('yardstick', '~> 0.0', '> 0.0.1')
  spec.add_development_dependency('simplecov', '~> 0.0', '> 0.0.1')
end