mgnu 2.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +0 -0
- data/README.md +31 -0
- data/Rakefile +33 -0
- data/lib/mgnu.rb +9 -0
- data/lib/mgnu/alignment.rb +143 -0
- data/lib/mgnu/common.rb +68 -0
- data/lib/mgnu/genbank.rb +117 -0
- data/lib/mgnu/genbank/feature.rb +84 -0
- data/lib/mgnu/genbank/location.rb +150 -0
- data/lib/mgnu/genbank/qualifier.rb +45 -0
- data/lib/mgnu/genbank/reference.rb +114 -0
- data/lib/mgnu/genbank/source.rb +39 -0
- data/lib/mgnu/loggable.rb +61 -0
- data/lib/mgnu/parser.rb +50 -0
- data/lib/mgnu/parser/blast.rb +87 -0
- data/lib/mgnu/parser/blast/format0.rb +290 -0
- data/lib/mgnu/parser/blast/format7.rb +121 -0
- data/lib/mgnu/parser/blast/format8.rb +120 -0
- data/lib/mgnu/parser/blast/hsp.rb +75 -0
- data/lib/mgnu/parser/blast/query.rb +45 -0
- data/lib/mgnu/parser/blast/sbjct.rb +62 -0
- data/lib/mgnu/parser/clustalw.rb +72 -0
- data/lib/mgnu/parser/fasta.rb +61 -0
- data/lib/mgnu/parser/fasta_header_index.rb +39 -0
- data/lib/mgnu/parser/fasta_index.rb +57 -0
- data/lib/mgnu/parser/fastq.rb +61 -0
- data/lib/mgnu/parser/genbank.rb +187 -0
- data/lib/mgnu/parser/gff.rb +56 -0
- data/lib/mgnu/parser/iprscan/hit.rb +76 -0
- data/lib/mgnu/parser/iprscan_file.rb +39 -0
- data/lib/mgnu/parser/kegg_ontology_index.rb +163 -0
- data/lib/mgnu/parser/pilercr.rb +102 -0
- data/lib/mgnu/parser/prodigal.rb +170 -0
- data/lib/mgnu/parser/sam.rb +115 -0
- data/lib/mgnu/parser/sam/alignment.rb +22 -0
- data/lib/mgnu/parser/sam/header.rb +23 -0
- data/lib/mgnu/parser/sam/pair.rb +18 -0
- data/lib/mgnu/sequence.rb +207 -0
- data/lib/mgnu/sequence/fasta.rb +79 -0
- data/lib/mgnu/sequence/fastq.rb +43 -0
- data/lib/mgnu/version.rb +16 -0
- data/mgnu.gemspec +39 -0
- data/spec/mgnu/parser/blast_format0_spec.rb +114 -0
- data/spec/mgnu/parser/blast_format7_spec.rb +24 -0
- data/spec/mgnu/parser/blast_format8_spec.rb +26 -0
- data/spec/mgnu/parser/blast_multihsp_spec.rb +100 -0
- data/spec/mgnu/parser/blast_oof_spec.rb +53 -0
- data/spec/mgnu/parser/clustalw_spec.rb +90 -0
- data/spec/mgnu/parser/fasta_header_index_tc_parser_spec.rb +25 -0
- data/spec/mgnu/parser/fasta_index_tc_parser_spec.rb +25 -0
- data/spec/mgnu/parser/fasta_parser_spec.rb +53 -0
- data/spec/mgnu/parser_spec.rb +22 -0
- data/spec/mgnu/sequence/fasta_spec.rb +60 -0
- data/spec/mgnu/sequence/fastq_spec.rb +31 -0
- data/spec/mgnu/sequence_spec.rb +81 -0
- data/spec/mgnu_spec.rb +7 -0
- data/spec/spec_helper.rb +53 -0
- metadata +376 -0
@@ -0,0 +1,22 @@
|
|
1
|
+
module MgNu
  module Parser
    class Sam
      # Plain attribute container for a single alignment record.
      #
      # Every field is exposed through a read/write accessor and can be
      # populated in bulk from a hash, either at construction time or later
      # through #attributes=.
      class Alignment
        attr_accessor :name, :flag, :hit, :position, :quality, :cigar, :mate_ref
        attr_accessor :mate_pos, :distance, :sequence, :query_qual, :other

        # Build a new Alignment, optionally pre-populated.
        #
        # @param attributes [Hash] field name => value pairs; see #attributes=
        def initialize(attributes = {})
          self.attributes = attributes
        end

        # Bulk-assign fields from a hash.
        #
        # Keys without a matching public writer on this object are silently
        # skipped, so unknown fields never raise.
        #
        # @param attributes [Hash] field name (Symbol or String) => value
        def attributes=(attributes = {})
          attributes.each do |field, value|
            writer = "#{field}="
            next unless respond_to?(writer)

            send(writer, value)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module MgNu
  module Parser
    class Sam
      # Container for the header fields of a Sam file.
      #
      # Each field is available through a read/write accessor named after
      # its two-letter key (:vn, :so, :sq, :rg, :pg, :co).
      class Header
        include MgNu::Loggable

        # The header keys this class stores, in declaration order.
        FIELDS = [:vn, :so, :sq, :rg, :pg, :co].freeze

        attr_accessor :vn, :so, :sq, :rg, :pg, :co

        # Create a new Header object.
        #
        # The previous implementation normalised the options into a local
        # hash and then discarded it, leaving every accessor nil. The values
        # are now actually stored on the instance.
        #
        # @param options [Hash] any of :vn, :so, :sq, :rg, :pg, :co; keys
        #   that are absent leave the corresponding field nil
        def initialize(options = {})
          options ||= {}
          FIELDS.each do |field|
            instance_variable_set("@#{field}", options[field])
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module MgNu
  module Parser
    # NOTE: Sam is opened as a class here, matching the sibling Alignment and
    # Header definitions. Opening it as a module would raise TypeError as soon
    # as this file is loaded together with them.
    class Sam
      # Groups two records that share a name.
      class Pair
        attr_accessor :name, :first, :second

        # Create a new Pair object.
        #
        # @param name [Object] identifier shared by both members
        # @param first [Object] first member of the pair
        # @param second [Object] second member of the pair
        def initialize(name, first, second)
          @name = name
          @first = first
          @second = second
        end
      end
    end
  end
end
|
@@ -0,0 +1,207 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
require_relative 'sequence/fasta'
|
3
|
+
require_relative 'sequence/fastq'
|
4
|
+
|
5
|
+
module MgNu
  # A biological sequence held as a plain String (+value+) together with an
  # optional +type+ tag ('dna', 'rna', 'aa', 'aminoacid' or 'protein').
  class Sequence
    attr_accessor :type, :value

    # @param options [Hash]
    # @option options [String] :value the sequence string
    # @option options [String] :sequence alternative to :value; wins when
    #   both keys are given
    # @option options [String] :type sequence type tag
    def initialize(options)
      options = { :value => nil, :type => nil }.merge(options)
      @value = options.key?(:sequence) ? options[:sequence] : options[:value]
      @type = options[:type]
    end

    alias_method :sequence, :value
    alias_method :sequence=, :value=

    # @return [Boolean] true when the type tag is 'rna'
    def rna?
      @type == 'rna'
    end

    # @return [Boolean] true when the type tag is 'dna'
    def dna?
      @type == 'dna'
    end

    # @return [Boolean] true when the type tag is 'aa', 'aminoacid' or 'protein'
    def aa?
      %w[aa aminoacid protein].include?(@type)
    end

    alias_method :protein?, :aa?
    alias_method :aminoacid?, :aa?

    # @return [Integer, nil] length of the sequence, nil when no value is set
    def length
      @value.nil? ? nil : @value.length
    end

    # Complement of the sequence as a new String; case is preserved and
    # characters without a mapping are passed through untouched.
    #
    # For 'rna' sequences a<->u is used. The original translation set left
    # out 'a', so adenine was never complemented; it is included now. The
    # pre-existing t->u mapping is kept as-is for backward compatibility.
    #
    # @return [String]
    def complement
      if @type == 'rna'
        @value.tr('aucgtrymkdhvbAUCGTRYMKDHVB', 'uagcuyrkmhdbvUAGCUYRKMHDBV')
      else
        @value.tr('acgtrymkdhvbACGTRYMKDHVB', 'tgcayrkmhdbvTGCAYRKMHDBV')
      end
    end

    # Replace the stored sequence with its complement.
    #
    # @return [String] the new value
    def complement!
      @value = complement
    end

    # @return [String] the reversed complement as a new String
    def reverse_complement
      complement.reverse
    end
    alias_method :revcomp, :reverse_complement

    # Replace the stored sequence with its reverse complement.
    #
    # @return [String] the new value
    def reverse_complement!
      @value = complement.reverse
    end
    alias_method :revcomp!, :reverse_complement!

    # Translate the sequence codon by codon.
    #
    # Frames 1..3 read the stored sequence starting at offset frame - 1.
    # Frames 4..6 and -1..-3 read the reverse complement starting at offsets
    # 0..2. Any other frame prints a notice on stderr and reads the stored
    # sequence from offset 0. Trailing bases that do not fill a codon are
    # dropped.
    #
    # @param frame [Integer] reading frame
    # @param cdn_table [Hash] lowercase three-letter codon => residue;
    #   codons missing from the table are rendered as 'X'
    # @return [String]
    def translate(frame = 1, cdn_table = MgNu::BACTERIA_CODONS)
      template = @value

      case frame
      when 1, 2, 3
        from = frame - 1
      when 4, 5, 6
        from = frame - 4
        template = reverse_complement
      when -1, -2, -3
        from = -1 - frame
        template = reverse_complement
      else
        $stderr.puts 'unknown frame - defaulting to zero (0)'
        from = 0
      end

      nalen = template.length - from
      nalen -= nalen % 3 # keep whole codons only
      template[from, nalen].downcase.gsub(/.{3}/) { |codon| cdn_table[codon] || 'X' }
    end

    # Replace the stored sequence with its translation.
    #
    # @see #translate
    # @return [String] the new value
    def translate!(frame = 1, cdn_table = MgNu::BACTERIA_CODONS)
      @value = translate(frame, cdn_table)
    end

    # Sequence wrapped at +cols+ characters per line.
    #
    # A sequence shorter than +cols+ is returned as-is (no newline).
    # NOTE(review): when the length is an exact multiple of +cols+ the
    # stepping emits one extra empty line; kept unchanged because callers
    # may depend on the exact output.
    #
    # @param cols [Integer] maximum line width
    # @return [String]
    def to_s(cols = 60)
      return @value if @value.length < cols

      seq = ''
      0.step(@value.length, cols) { |segment| seq << @value[segment, cols] << "\n" }
      seq
    end

    # Genbank formatted sequence: 6 columns of 10 letters each, preceded by
    # a right-justified 1-based position, one newline-terminated row per 60
    # letters, e.g.
    #         1 tcctgatctc ctttatagca ctttccgtga aaattgccaa gcgacctgca tgagttccgg
    #        61 gagcgagaac ttctgcattt aactcacgag gagtaacaat atccactcca ggcagattcc
    #
    # @return [String]
    def to_genbank
      position = 1
      @value.gsub(/.{1,60}/) do |row|
        # Double quotes are required so that \n is a real line break; the
        # former single-quoted format emitted a literal backslash-n.
        line = format("%9d%s\n", position, row.gsub(/.{1,10}/, ' \0'))
        position += 60
        line
      end
    end

    # Split the sequence on runs of N and report what lies between them.
    #
    # @param length [Integer] minimum number of consecutive N/n characters
    #   that counts as a split point
    # @return [Array<Range>] 1-based position ranges of the pieces; the
    #   range after the last run is always appended
    def nblocks(length = 10)
      pieces = []
      prev = 1
      seq = StringScanner.new(value)
      while seq.scan_until(/[Nn]{#{length},}/)
        # seq.pos is the 1-based position of the last N of the run
        pieces << (prev..seq.pos - seq.matched.length)
        prev = seq.pos + 1
      end
      pieces << (prev..value.length) # add last piece
      pieces
    end

    # Case-insensitive Levenshtein (edit) distance to +other+.
    #
    # @param other [MgNu::Sequence, String] sequence to compare with;
    #   subclasses of either are accepted
    # @return [Integer] the distance, 0 when both are equal ignoring case,
    #   or -1 when either side is nil or empty
    def levenshtein_distance(other)
      return -1 if @value.nil? || @value == ''
      a = @value.upcase
      b = ''

      # is_a? rather than a class equality test, so that Sequence subclasses
      # are not silently compared as an empty string.
      if other.is_a?(MgNu::Sequence)
        return -1 if other.value.nil? || other.value == ''
        b = other.value.upcase
      elsif other.is_a?(String)
        return -1 if other == ''
        b = other.upcase
      end
      return 0 if a == b

      # m[x][y] = distance between the first x chars of a and first y of b
      m = Array.new(a.length + 1) { |x| [x] }
      m[0] = (0..b.length).to_a

      1.upto(a.length) do |x|
        1.upto(b.length) do |y|
          cost = a[x - 1] == b[y - 1] ? 0 : 1
          m[x][y] = [m[x - 1][y] + 1, m[x][y - 1] + 1, m[x - 1][y - 1] + cost].min
        end
      end
      m[-1][-1]
    end

    alias_method :distance, :levenshtein_distance

    # Fraction of positions at which this sequence and +other+ carry the
    # same character (case-insensitive), relative to the longer of the two.
    #
    # Sequences are compared position by position without alignment; a
    # warning is printed when the lengths differ. Positions beyond the end
    # of the shorter sequence count as mismatches (previously a shorter
    # +other+ raised NoMethodError).
    #
    # @param other [MgNu::Sequence, String] sequence to compare with;
    #   subclasses of either are accepted
    # @return [Float] value between 0.0 and 1.0, or -1 when either side is
    #   nil or empty
    def percent_identity(other)
      return -1 if @value.nil? || @value == ''
      a = @value
      b = ''

      if other.is_a?(MgNu::Sequence)
        return -1 if other.value.nil? || other.value == ''
        b = other.value
      elsif other.is_a?(String)
        return -1 if other == ''
        b = other
      end
      return 1.0 if a == b

      if a.length != b.length
        warn('lengths differ - percent identity may is probably inaccurate')
      end

      shared = [a.length, b.length].min
      match = (0...shared).count { |i| a[i].upcase == b[i].upcase }
      match / [a.length, b.length].max.to_f
    end

    alias_method :identity, :percent_identity

    # GC fraction of the sequence.
    #
    # 'N' and '*' are skipped; any other character outside the known
    # nucleotide codes is reported on stderr and skipped as well. The codes
    # G, C, R, K, S, B, D and V are counted towards GC.
    #
    # @return [String, Integer] the fraction formatted with four decimals
    #   (e.g. '0.5000'), or -1 when the sequence is nil, empty or contains
    #   no countable base
    def gc_content
      return -1 if @value.nil? || @value == ''

      base2count = {}
      %w[A C G T U R Y M K W S B D H V].each { |code| base2count[code] = 0 }

      @value.each_char do |char|
        base = char.upcase
        next if base == '*' || base == 'N'
        if base2count.key?(base)
          base2count[base] += 1
        else
          $stderr.puts "Unknown character #{base}"
        end
      end

      gc = %w[G C R K S B D V].inject(0) { |sum, code| sum + base2count[code] }
      # The former block `a + e.nil? ? 0 : e` parsed as
      # `(a + e.nil?) ? 0 : e` and raised TypeError on every call.
      total = base2count.values.inject(0) { |sum, count| sum + count }
      return -1 if total.zero?

      format('%.4f', gc.to_f / total)
    end
  end # end of Sequence class
end # end of MgNu module
|
@@ -0,0 +1,79 @@
|
|
1
|
+
#require 'mgnu/sequence'
|
2
|
+
|
3
|
+
module MgNu
  class Sequence
    # A sequence with a FASTA header line.
    #
    # The header is split on whitespace: the first word becomes
    # +header_name+, the remaining words (joined by single spaces) become
    # +header_description+.
    class Fasta < Sequence
      attr_accessor :header, :header_name, :header_description

      # Create a new MgNu::Sequence::Fasta object.
      #
      # @param options [Hash] passed on to the superclass initializer
      # @option options [String, nil] :header header line without the
      #   leading '>'; may be omitted, in which case header, header_name
      #   and header_description are all nil (this used to raise
      #   NoMethodError)
      def initialize(options)
        super(options)
        @header = options[:header]
        words = @header.to_s.split
        @header_name = words.shift
        @header_description = words.empty? ? nil : words.join(' ')
      end

      # Split the sequence into newline-terminated lines.
      #
      # A sequence shorter than +cols+ is returned as a single unterminated
      # copy.
      # NOTE(review): a length that is an exact multiple of +cols+ produces
      # one extra empty line; kept unchanged because callers may depend on
      # the exact output.
      #
      # @param cols [Integer] maximum line width
      # @return [String]
      def sequence_by_columns(cols = 60)
        seq = ''
        if length < cols
          seq << sequence
        else
          0.step(length, cols) { |segment| seq << sequence[segment, cols] << "\n" }
        end
        seq
      end

      # FASTA text for this record.
      #
      # When the sequence looks like whitespace-separated numbers it is
      # treated as a quality record and written 17 scores per line.
      # Otherwise it is wrapped at +cols+ characters.
      #
      # @param cols [Integer] line width; -1 leaves the sequence unbroken
      # @return [String]
      def to_s(cols = 60)
        body =
          if sequence =~ /\d+\s+\d+/
            # this is a fasta quality sequence
            sequence.split(/\s+/).each_slice(17).map { |row| "#{row.join(' ')}\n" }.join
          elsif cols == -1
            sequence
          else
            length < cols ? sequence : sequence_by_columns(cols)
          end
        ">#{@header}\n#{body}"
      end

      # Find runs of N characters in the sequence and split on them.
      #
      # @param min_n [Integer] minimum run of N/n characters that splits
      # @return [String] one FASTA record per chunk, each named
      #   "<header_name>_<index>" (1-based), or the plain #to_s output when
      #   the split yields a single chunk
      def split_on_n(min_n = 10)
        chunks = sequence.split(/[nN]{#{min_n},}/)
        return to_s unless chunks.length > 1

        chunks.each_with_index.map do |chunk, i|
          ">#{@header_name}_#{i + 1} #{@header_description}\n#{chunk}\n"
        end.join
      end
    end # end of Fasta class
  end # end of Sequence class
end # end of MgNu module
|
@@ -0,0 +1,43 @@
|
|
1
|
+
#require 'mgnu/sequence'
|
2
|
+
|
3
|
+
module MgNu
  class Sequence
    # A sequence with a FASTQ header and a quality string.
    #
    # The header is split on whitespace: the first word becomes
    # +header_name+, the remaining words (joined by single spaces) become
    # +header_description+.
    class Fastq < Sequence
      attr_accessor :header, :header_name, :header_description
      attr_accessor :quality, :qualhdr, :qualary, :offset

      # Create a new MgNu::Sequence::Fastq object.
      #
      # @param options [Hash] passed on to the superclass initializer
      # @option options [String, nil] :header header line without the
      #   leading '@'; may be omitted (this used to raise NoMethodError)
      # @option options [String, nil] :quality encoded quality string
      # @option options [Integer] :offset value subtracted from each
      #   quality byte by #unpack_quality, defaults to 64
      # @option options [String] :qualhdr stored only when given
      def initialize(options)
        super(options)
        options = { :offset => 64, :header => nil, :quality => nil }.merge(options)
        @quality = options[:quality]
        @offset = options[:offset]
        @header = options[:header]
        words = @header.to_s.split
        @header_name = words.shift
        @header_description = words.empty? ? nil : words.join(' ')
        @qualhdr = options[:qualhdr] if options[:qualhdr]
      end

      # @return [MgNu::Sequence::Fasta] a Fasta record carrying the same
      #   header and sequence (the quality string is dropped)
      def to_fasta
        MgNu::Sequence::Fasta.new(:header => @header, :sequence => sequence)
      end

      # FASTQ text for this record; the separator line is a bare '+'.
      #
      # @return [String]
      def to_s
        "@#{@header}\n#{sequence}\n+\n#{@quality}\n"
      end

      # Unpack the quality string and return an array of offset-corrected
      # integers: each byte of +quality+ minus the instance's +offset+.
      #
      # The result is recomputed on every call.
      #
      # @return [Array<Integer>] the scores; empty when no quality string
      #   is set (this used to raise NoMethodError)
      def unpack_quality
        @quality.to_s.unpack('C*').map { |byte| byte - @offset }
      end
    end # end of Fastq class
  end # end of Sequence class
end # end of MgNu module
|
data/lib/mgnu/version.rb
ADDED
data/mgnu.gemspec
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'mgnu/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
  # --- identity and descriptive metadata ---
  spec.name          = 'mgnu'
  spec.version       = MgNu::Version
  spec.authors       = ['Brian C. Thomas']
  spec.email         = ['bct.x42@gmail.com']
  spec.summary       = 'MgNu Ruby Bioinformatics Library'
  spec.description   = 'Lightweight ruby bioinformatics library'
  spec.homepage      = 'http://www.metagenomi.co'
  spec.license       = 'MIT'

  # --- packaged files ---
  # Explicit top-level files plus everything under lib/ (Ruby sources only)
  # and spec/. Alternative kept for reference:
  #   spec.files = `git ls-files -z`.split("\x0")
  spec.files = %w(.yardopts README.md Rakefile mgnu.gemspec) +
               Dir.glob('lib/**/*.rb') +
               Dir.glob('spec/**/*')

  # spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  # --- runtime dependencies: gem name => version requirements ---
  {
    'naught'       => ['~> 0.0', '>= 0.0.1'],
    'moneta'       => ['~> 0.0', '>= 0.0.1'],
    'tokyocabinet' => ['~> 1.29', '>= 1.29.1'],
    'ox'           => ['~> 2.8', '>= 2.8.2'],
    'memoizable'   => ['~> 0.0', '>= 0.0.1'],
    'yard'         => ['~> 0.9', '>= 0.9.11'],
    'rake'         => ['~> 12.0', '>= 12.0.0']
  }.each do |gem_name, requirements|
    spec.add_dependency(gem_name, *requirements)
  end

  # --- development-only dependencies ---
  {
    'mime-types' => ['~> 0.0', '> 0.0.1'],
    'rspec'      => ['~> 0.0', '> 0.0.1'],
    'rubocop'    => ['~> 0.49', '>= 0.49.0'],
    'timecop'    => ['~> 0.0', '> 0.0.1'],
    'yardstick'  => ['~> 0.0', '> 0.0.1'],
    'simplecov'  => ['~> 0.0', '> 0.0.1']
  }.each do |gem_name, requirements|
    spec.add_development_dependency(gem_name, *requirements)
  end
end
|