mgnu 2.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +0 -0
  3. data/README.md +31 -0
  4. data/Rakefile +33 -0
  5. data/lib/mgnu.rb +9 -0
  6. data/lib/mgnu/alignment.rb +143 -0
  7. data/lib/mgnu/common.rb +68 -0
  8. data/lib/mgnu/genbank.rb +117 -0
  9. data/lib/mgnu/genbank/feature.rb +84 -0
  10. data/lib/mgnu/genbank/location.rb +150 -0
  11. data/lib/mgnu/genbank/qualifier.rb +45 -0
  12. data/lib/mgnu/genbank/reference.rb +114 -0
  13. data/lib/mgnu/genbank/source.rb +39 -0
  14. data/lib/mgnu/loggable.rb +61 -0
  15. data/lib/mgnu/parser.rb +50 -0
  16. data/lib/mgnu/parser/blast.rb +87 -0
  17. data/lib/mgnu/parser/blast/format0.rb +290 -0
  18. data/lib/mgnu/parser/blast/format7.rb +121 -0
  19. data/lib/mgnu/parser/blast/format8.rb +120 -0
  20. data/lib/mgnu/parser/blast/hsp.rb +75 -0
  21. data/lib/mgnu/parser/blast/query.rb +45 -0
  22. data/lib/mgnu/parser/blast/sbjct.rb +62 -0
  23. data/lib/mgnu/parser/clustalw.rb +72 -0
  24. data/lib/mgnu/parser/fasta.rb +61 -0
  25. data/lib/mgnu/parser/fasta_header_index.rb +39 -0
  26. data/lib/mgnu/parser/fasta_index.rb +57 -0
  27. data/lib/mgnu/parser/fastq.rb +61 -0
  28. data/lib/mgnu/parser/genbank.rb +187 -0
  29. data/lib/mgnu/parser/gff.rb +56 -0
  30. data/lib/mgnu/parser/iprscan/hit.rb +76 -0
  31. data/lib/mgnu/parser/iprscan_file.rb +39 -0
  32. data/lib/mgnu/parser/kegg_ontology_index.rb +163 -0
  33. data/lib/mgnu/parser/pilercr.rb +102 -0
  34. data/lib/mgnu/parser/prodigal.rb +170 -0
  35. data/lib/mgnu/parser/sam.rb +115 -0
  36. data/lib/mgnu/parser/sam/alignment.rb +22 -0
  37. data/lib/mgnu/parser/sam/header.rb +23 -0
  38. data/lib/mgnu/parser/sam/pair.rb +18 -0
  39. data/lib/mgnu/sequence.rb +207 -0
  40. data/lib/mgnu/sequence/fasta.rb +79 -0
  41. data/lib/mgnu/sequence/fastq.rb +43 -0
  42. data/lib/mgnu/version.rb +16 -0
  43. data/mgnu.gemspec +39 -0
  44. data/spec/mgnu/parser/blast_format0_spec.rb +114 -0
  45. data/spec/mgnu/parser/blast_format7_spec.rb +24 -0
  46. data/spec/mgnu/parser/blast_format8_spec.rb +26 -0
  47. data/spec/mgnu/parser/blast_multihsp_spec.rb +100 -0
  48. data/spec/mgnu/parser/blast_oof_spec.rb +53 -0
  49. data/spec/mgnu/parser/clustalw_spec.rb +90 -0
  50. data/spec/mgnu/parser/fasta_header_index_tc_parser_spec.rb +25 -0
  51. data/spec/mgnu/parser/fasta_index_tc_parser_spec.rb +25 -0
  52. data/spec/mgnu/parser/fasta_parser_spec.rb +53 -0
  53. data/spec/mgnu/parser_spec.rb +22 -0
  54. data/spec/mgnu/sequence/fasta_spec.rb +60 -0
  55. data/spec/mgnu/sequence/fastq_spec.rb +31 -0
  56. data/spec/mgnu/sequence_spec.rb +81 -0
  57. data/spec/mgnu_spec.rb +7 -0
  58. data/spec/spec_helper.rb +53 -0
  59. metadata +376 -0
@@ -0,0 +1,22 @@
1
module MgNu
  module Parser
    class Sam
      # Plain value object holding the fields of one alignment record.
      # Every field is a read/write attribute and can be populated in bulk
      # from a hash.
      class Alignment
        attr_accessor :name, :flag, :hit, :position, :quality, :cigar, :mate_ref
        attr_accessor :mate_pos, :distance, :sequence, :query_qual, :other

        # Build a new Alignment, optionally pre-populated.
        #
        # @param attributes [Hash] attribute name => value pairs
        def initialize(attributes = {})
          self.attributes = attributes
        end

        # Bulk-assign attributes. Keys that do not correspond to a public
        # writer on this object are silently skipped.
        #
        # @param attributes [Hash] attribute name => value pairs
        def attributes=(attributes = {})
          attributes.each do |attr, value|
            writer = "#{attr}="
            next unless respond_to?(writer)

            send(writer, value)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
module MgNu
  module Parser
    class Sam
      # Container for the header fields of a SAM file, exposed through the
      # +vn+, +so+, +sq+, +rg+, +pg+ and +co+ accessors.
      class Header
        include MgNu::Loggable

        # Names of the header fields that may be supplied to #initialize.
        FIELDS = [:vn, :so, :sq, :rg, :pg, :co].freeze

        attr_accessor :vn, :so, :sq, :rg, :pg, :co

        # Create a new Header object.
        #
        # The original implementation only built a local hash and threw it
        # away, so every accessor stayed nil regardless of the input; the
        # supplied values are now actually stored.
        #
        # @param options [Hash] any of :vn, :so, :sq, :rg, :pg, :co;
        #   missing keys leave the corresponding attribute nil
        def initialize(options)
          FIELDS.each do |field|
            value = options.key?(field) ? options[field] : nil
            instance_variable_set("@#{field}", value)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,18 @@
1
module MgNu
  module Parser
    # NOTE: Sam is opened as a class here (it was `module Sam`). The sibling
    # files sam/alignment.rb and sam/header.rb both declare `class Sam`, and
    # Ruby raises TypeError when the same constant is opened as both a class
    # and a module, so the three files could not be loaded together.
    class Sam
      # Groups two records under a shared name.
      class Pair
        attr_accessor :name, :first, :second

        # Create a new Pair object.
        #
        # @param name   the name shared by both members of the pair
        # @param first  the first member
        # @param second the second member
        def initialize(name, first, second)
          @name = name
          @first = first
          @second = second
        end
      end
    end
  end
end
@@ -0,0 +1,207 @@
1
+ require 'strscan'
2
+ require_relative 'sequence/fasta'
3
+ require_relative 'sequence/fastq'
4
+
5
module MgNu
  # A biological sequence held as a plain string in +value+ together with an
  # optional +type+ tag ('dna', 'rna', 'aa', 'aminoacid' or 'protein').
  class Sequence
    attr_accessor :type, :value

    # @param options [Hash] :value or :sequence (the latter wins when both
    #   are given) and :type
    def initialize(options)
      options = {:value => nil, :type => nil}.merge! options
      @value = options[:value]
      @value = options[:sequence] if options.key?(:sequence)
      @type = options[:type]
    end

    alias_method :sequence, :value
    alias_method :sequence=, :value=

    # @return [Boolean] true when type is 'rna'
    def rna?
      @type == 'rna'
    end

    # @return [Boolean] true when type is 'dna'
    def dna?
      @type == 'dna'
    end

    # @return [Boolean] true when type is 'aa', 'aminoacid' or 'protein'
    def aa?
      @type == 'aa' || @type == 'aminoacid' || @type == 'protein'
    end

    alias_method :protein?, :aa?
    alias_method :aminoacid?, :aa?

    # @return [Integer, nil] sequence length, or nil when there is no value
    def length
      @value.nil? ? nil : @value.length
    end

    # Complement of the sequence, case preserved, ambiguity codes included.
    # For type 'rna' the result uses the RNA alphabet (a<->u, c<->g; a stray
    # t is treated like u). The previous RNA table never mapped 'a', so
    # adenines were left uncomplemented.
    #
    # @return [String] a new string; +value+ is not modified
    def complement
      if @type == 'rna'
        @value.tr('acgutrymkdhvbACGUTRYMKDHVB', 'ugcaayrkmhdbvUGCAAYRKMHDBV')
      else
        @value.tr('acgtrymkdhvbACGTRYMKDHVB', 'tgcayrkmhdbvTGCAYRKMHDBV')
      end
    end

    # Replaces +value+ with its complement.
    def complement!
      @value = complement
    end

    # @return [String] the reversed complement; +value+ is not modified
    def reverse_complement
      complement.reverse
    end
    alias_method :revcomp, :reverse_complement

    # Replaces +value+ with its reverse complement.
    def reverse_complement!
      @value = complement.reverse
    end
    alias_method :revcomp!, :reverse_complement!

    # Translate the sequence codon by codon.
    #
    # @param frame [Integer] 1..3 read the forward strand; 4..6 and -1..-3
    #   read the reverse complement. Anything else warns on stderr and is
    #   treated as frame 1.
    # @param cdn_table [Hash] lowercase codon => residue; codons missing
    #   from the table become 'X'
    # @return [String] the translation ('' when fewer than three bases
    #   remain after the frame offset)
    def translate(frame = 1, cdn_table = MgNu::BACTERIA_CODONS)
      from, sequence = nil, @value

      case frame
      when 1, 2, 3
        from = frame - 1
      when 4, 5, 6
        from = frame - 4
        sequence = reverse_complement
      when -1, -2, -3
        from = -1 - frame
        sequence = reverse_complement
      else
        $stderr.puts 'unknown frame - defaulting to zero (0)'
        from = 0
      end

      nalen = sequence.length - from
      # a sequence shorter than the offset used to slice to nil and crash
      return '' if nalen < 3

      nalen -= nalen % 3 # drop the trailing partial codon
      sequence[from, nalen].downcase.gsub(/.{3}/) { |codon| cdn_table[codon] || 'X' }
    end

    # Replaces +value+ with its translation (see #translate).
    def translate!(frame = 1, cdn_table = MgNu::BACTERIA_CODONS)
      @value = translate(frame, cdn_table)
    end

    # Sequence wrapped at +cols+ characters, each line newline-terminated.
    # A sequence shorter than +cols+ (or a +cols+ below 1) is returned
    # unwrapped. Stepping stops at the last character so that a length which
    # is an exact multiple of +cols+ no longer produces a trailing empty line.
    #
    # @param cols [Integer] line width
    # @return [String]
    def to_s(cols = 60)
      return @value if cols < 1 || @value.length < cols

      seq = ''
      0.step(@value.length - 1, cols) { |offset| seq << @value[offset, cols] << "\n" }
      seq
    end

    # Genbank formatted sequence 6 cols w/10 letters each, right justified line numbers
    #         1 tcctgatctc ctttatagca ctttccgtga aaattgccaa gcgacctgca tgagttccgg
    #        61 gagcgagaac ttctgcattt aactcacgag gagtaacaat atccactcca ggcagattcc
    #       121 tgaaaccctt cagaacatta tccttgttgg atacaactat caaaacgctc ttctttttct
    #
    # @return [String]
    def to_genbank
      i = 1
      @value.gsub(/.{1,60}/) do |s|
        # double quotes are required here: '\n' in single quotes is a
        # literal backslash followed by n, not a line break
        line = format("%9d%s\n", i, s.gsub(/.{1,10}/, ' \0'))
        i += 60
        line
      end
    end

    # Splits the sequence at runs of N/n that are at least +length+ long.
    #
    # @param length [Integer] minimum run length that triggers a split
    # @return [Array<Range>] 1-based position ranges of the pieces between
    #   runs; a run at the very start or end yields an empty range
    def nblocks(length = 10)
      pieces = []
      prev = 1
      seq = StringScanner.new(value) # the sequence
      while seq.scan_until(/[Nn]{#{length},}/)
        # seq.pos is the 0-based offset just past the run
        pieces << (prev .. seq.pos - seq.matched.length)
        prev = seq.pos + 1
      end
      pieces << (prev .. value.length) # add last piece
      pieces
    end

    # Case-insensitive Levenshtein (edit) distance to +other+.
    #
    # @param other [MgNu::Sequence, String] subclasses are accepted
    # @return [Integer] the distance, 0 for identical sequences, or -1 when
    #   either side is nil, empty or of an unsupported type
    def levenshtein_distance(other)
      return -1 if @value.nil? || @value == ''

      theirs = comparable_value(other)
      return -1 if theirs.nil? || theirs == ''

      a = @value.upcase
      b = theirs.upcase
      return 0 if a == b

      # classic dynamic programme, keeping only the previous row
      prev_row = (0..b.length).to_a
      a.each_char.with_index(1) do |char_a, x|
        row = [x]
        b.each_char.with_index(1) do |char_b, y|
          cost = char_a == char_b ? 0 : 1
          row << [prev_row[y] + 1, row[y - 1] + 1, prev_row[y - 1] + cost].min
        end
        prev_row = row
      end
      prev_row.last
    end # end of levenshtein_distance

    alias_method :distance, :levenshtein_distance

    # Fraction of positions that match +other+ (case-insensitive, compared
    # position by position with no alignment). When the lengths differ a
    # warning is emitted, the overlap is compared and the result is divided
    # by the longer length.
    #
    # @param other [MgNu::Sequence, String] subclasses are accepted
    # @return [Float, Integer] value in 0.0..1.0, or -1 when either side is
    #   nil, empty or of an unsupported type
    def percent_identity(other)
      return -1 if @value.nil? || @value == ''

      a = @value
      b = comparable_value(other)
      return -1 if b.nil? || b == ''
      return 1.0 if a == b

      if a.length != b.length
        warn('lengths differ - percent identity may is probably inaccurate')
      end

      # only walk the overlap; indexing past the shorter string used to
      # raise NoMethodError on nil
      overlap = [a.length, b.length].min
      match = (0...overlap).count { |i| a[i].upcase == b[i].upcase }

      match / [a.length, b.length].max.to_f
    end # end of percent_identity

    alias_method :identity, :percent_identity

    # GC fraction of the sequence. '*' and N are skipped; any other unknown
    # character is reported on stderr and ignored. The GC tally includes the
    # ambiguity codes R, K, S, B, D and V, as it always has.
    #
    # @return [String, Integer] fraction formatted with four decimals, or -1
    #   when the sequence is nil, empty or has no countable characters
    def gc_content
      return -1 if @value == '' || @value.nil?

      base2count = {'A' => 0, 'C' => 0, 'G' => 0, 'T' => 0, 'U' => 0,
                    'R' => 0, 'Y' => 0, 'M' => 0, 'K' => 0, 'W' => 0,
                    'S' => 0, 'B' => 0, 'D' => 0, 'H' => 0, 'V' => 0}
      @value.each_char do |base|
        symbol = base.upcase
        next if symbol == '*' || symbol == 'N'

        if base2count.key?(symbol)
          base2count[symbol] += 1
        else
          $stderr.puts "Unknown character #{symbol}"
        end
      end

      gc = %w(G C R K S B D V).inject(0) { |sum, symbol| sum + base2count[symbol] }
      # the old `a + e.nil? ? 0 : e` parsed as `(a + e.nil?) ? 0 : e` and
      # raised TypeError on every call
      total = base2count.values.inject(0) { |sum, count| sum + count }
      return -1 if total.zero?

      format('%.4f', gc.to_f / total)
    end

    private

    # Raw string behind +other+: the value of a Sequence (or subclass), the
    # String itself, or nil for anything else.
    def comparable_value(other)
      case other
      when MgNu::Sequence then other.value
      when String then other
      end
    end
  end # end of Sequence class
end # end of MgNu module
@@ -0,0 +1,79 @@
1
+ #require 'mgnu/sequence'
2
+
3
module MgNu
  class Sequence
    # A sequence with a FASTA header line. The header is split on
    # whitespace: the first token becomes +header_name+ and the remainder
    # (if any) +header_description+.
    class Fasta < Sequence
      attr_accessor :header, :header_name, :header_description

      # create a new MgNu::Sequence::Fasta object
      #
      # @param options [Hash] options accepted by the parent class plus
      #   :header (optional; a missing header leaves name and description nil
      #   instead of raising on nil as before)
      def initialize(options)
        super(options)
        options = {:header => nil}.merge! options
        @header = options[:header]
        tokens = @header.to_s.split
        @header_name = tokens.shift
        @header_description = tokens.empty? ? nil : tokens.join(' ')
      end

      # split sequence into columns
      #
      # Each output line is newline-terminated. A sequence shorter than
      # +cols+ (or a +cols+ below 1) is returned unbroken. Stepping stops at
      # the last character, so a length that is an exact multiple of +cols+
      # no longer yields a trailing empty line.
      #
      # @param cols [Integer] line width
      # @return [String] a new string
      def sequence_by_columns(cols = 60)
        seq = ''
        return seq << sequence if cols < 1 || length < cols

        0.step(length - 1, cols) { |offset| seq << sequence[offset, cols] << "\n" }
        seq
      end

      # override to_s string representation
      #
      # @param cols [Integer] line width; -1 leaves the sequence unbroken
      # @return [String] ">header" line followed by the formatted sequence
      def to_s(cols = 60)
        seq = ''
        if sequence =~ /\d+\s+\d+/
          # this is a fasta quality sequence: whitespace-separated scores,
          # written 17 to a line
          sequence.split(/\s+/).each_slice(17) { |row| seq << "#{row.join(' ')}\n" }
        elsif cols == -1 # don't break the sequence up
          seq = sequence
        else
          seq = length < cols ? sequence : sequence_by_columns(cols)
        end
        ">#{@header}\n#{seq}"
      end

      # find runs of N characters in the sequence and split
      #
      # @param min_n [Integer] minimum run length that triggers a split
      # @return [String] one FASTA record per piece, named
      #   "<header_name>_<index>"; when nothing is split, the same as #to_s
      def split_on_n(min_n = 10)
        sequence_chunks = sequence.split(/[nN]{#{min_n},}/)
        return to_s unless sequence_chunks.length > 1

        outstr = ''
        sequence_chunks.each_with_index do |chunk, i|
          outstr << ">#{@header_name}_#{i + 1} #{@header_description}\n"
          outstr << "#{chunk}\n"
        end
        outstr
      end
    end # end of Fasta class
  end # end of Sequence class
end # end of MgNu module
@@ -0,0 +1,43 @@
1
+ #require 'mgnu/sequence'
2
+
3
module MgNu
  class Sequence
    # A sequence with a FASTQ header and a per-base quality string. The
    # header is split on whitespace into +header_name+ (first token) and
    # +header_description+ (the rest, if any).
    class Fastq < Sequence
      attr_accessor :header, :header_name, :header_description
      attr_accessor :quality, :qualhdr, :qualary, :offset

      # create a new MgNu::Sequence::Fastq object
      #
      # @param options [Hash] options accepted by the parent class plus
      #   :header, :quality, :qualhdr and :offset (defaults to 64). A missing
      #   header leaves name and description nil instead of raising on nil
      #   as before.
      def initialize(options)
        super(options)
        options = {:offset => 64, :header => nil, :quality => nil}.merge! options
        @quality = options[:quality]
        @offset = options[:offset]
        @header = options[:header]
        tokens = @header.to_s.split
        @header_name = tokens.shift
        @header_description = tokens.empty? ? nil : tokens.join(' ')
        @qualhdr = options[:qualhdr] if options[:qualhdr]
      end

      # @return [MgNu::Sequence::Fasta] the same header and sequence without
      #   the quality information
      def to_fasta
        MgNu::Sequence::Fasta.new(:header => @header, :sequence => sequence)
      end

      # override to_s representation
      #
      # @return [String] the four-line FASTQ record
      def to_s
        "@#{@header}\n#{sequence}\n+\n#{@quality}\n"
      end

      # Unpack the quality string and return an array of offset-corrected
      # integers: the byte value of each quality character minus +offset+.
      # The array is rebuilt on every call (nothing is cached).
      #
      # @return [Array<Integer>] one score per quality character; empty when
      #   no quality string is set
      def unpack_quality
        return [] if @quality.nil?

        @quality.unpack('C*').map! { |byte| byte - @offset }
      end
    end # end of Fastq class
  end # end of Sequence class
end # end of MgNu module
@@ -0,0 +1,16 @@
1
module MgNu
  # Version number of the library, split into its components.
  class Version
    MAJOR = 2
    MINOR = 1
    PATCH = 1
    PRE = nil

    # Dotted version string built from the components; PRE is left out
    # while it is nil.
    #
    # @return [String]
    def self.to_s
      parts = [MAJOR, MINOR, PATCH, PRE].reject { |part| part.nil? }
      parts.join('.')
    end
  end
end
@@ -0,0 +1,39 @@
1
# coding: utf-8
# Gem specification for mgnu. The version is read from lib/mgnu/version.rb,
# so lib/ is put on the load path before that file is required.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'mgnu/version'

# Runtime dependencies: gem name followed by its version constraints.
runtime_dependencies = [
  ['naught', '~> 0.0', '>= 0.0.1'],
  ['moneta', '~> 0.0', '>= 0.0.1'],
  ['tokyocabinet', '~> 1.29', '>= 1.29.1'],
  ['ox', '~> 2.8', '>= 2.8.2'],
  ['memoizable', '~> 0.0', '>= 0.0.1'],
  ['yard', '~> 0.9', '>= 0.9.11'],
  ['rake', '~> 12.0', '>= 12.0.0']
]

# Development-only dependencies, same layout.
development_dependencies = [
  ['mime-types', '~> 0.0', '> 0.0.1'],
  ['rspec', '~> 0.0', '> 0.0.1'],
  ['rubocop', '~> 0.49', '>= 0.49.0'],
  ['timecop', '~> 0.0', '> 0.0.1'],
  ['yardstick', '~> 0.0', '> 0.0.1'],
  ['simplecov', '~> 0.0', '> 0.0.1']
]

Gem::Specification.new do |spec|
  spec.name          = "mgnu"
  spec.version       = MgNu::Version
  spec.authors       = ["Brian C. Thomas"]
  spec.email         = ["bct.x42@gmail.com"]
  spec.summary       = %q{MgNu Ruby Bioinformatics Library}
  spec.description   = %q{Lightweight ruby bioinformatics library}
  spec.homepage      = "http://www.metagenomi.co"
  spec.license       = "MIT"

  # spec.files = `git ls-files -z`.split("\x0")
  # Packaged files: the fixed top-level files, every Ruby file under lib/
  # and everything under spec/.
  packaged = %w(.yardopts README.md Rakefile mgnu.gemspec)
  packaged += Dir.glob('lib/**/*.rb')
  packaged += Dir.glob('spec/**/*')
  spec.files = packaged

  # spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  runtime_dependencies.each do |gem_name, *constraints|
    spec.add_dependency(gem_name, *constraints)
  end

  development_dependencies.each do |gem_name, *constraints|
    spec.add_development_dependency(gem_name, *constraints)
  end
end