mgnu 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +0 -0
- data/README.md +31 -0
- data/Rakefile +33 -0
- data/lib/mgnu.rb +9 -0
- data/lib/mgnu/alignment.rb +143 -0
- data/lib/mgnu/common.rb +68 -0
- data/lib/mgnu/genbank.rb +117 -0
- data/lib/mgnu/genbank/feature.rb +84 -0
- data/lib/mgnu/genbank/location.rb +150 -0
- data/lib/mgnu/genbank/qualifier.rb +45 -0
- data/lib/mgnu/genbank/reference.rb +114 -0
- data/lib/mgnu/genbank/source.rb +39 -0
- data/lib/mgnu/loggable.rb +61 -0
- data/lib/mgnu/parser.rb +50 -0
- data/lib/mgnu/parser/blast.rb +87 -0
- data/lib/mgnu/parser/blast/format0.rb +290 -0
- data/lib/mgnu/parser/blast/format7.rb +121 -0
- data/lib/mgnu/parser/blast/format8.rb +120 -0
- data/lib/mgnu/parser/blast/hsp.rb +75 -0
- data/lib/mgnu/parser/blast/query.rb +45 -0
- data/lib/mgnu/parser/blast/sbjct.rb +62 -0
- data/lib/mgnu/parser/clustalw.rb +72 -0
- data/lib/mgnu/parser/fasta.rb +61 -0
- data/lib/mgnu/parser/fasta_header_index.rb +39 -0
- data/lib/mgnu/parser/fasta_index.rb +57 -0
- data/lib/mgnu/parser/fastq.rb +61 -0
- data/lib/mgnu/parser/genbank.rb +187 -0
- data/lib/mgnu/parser/gff.rb +56 -0
- data/lib/mgnu/parser/iprscan/hit.rb +76 -0
- data/lib/mgnu/parser/iprscan_file.rb +39 -0
- data/lib/mgnu/parser/kegg_ontology_index.rb +163 -0
- data/lib/mgnu/parser/pilercr.rb +102 -0
- data/lib/mgnu/parser/prodigal.rb +170 -0
- data/lib/mgnu/parser/sam.rb +115 -0
- data/lib/mgnu/parser/sam/alignment.rb +22 -0
- data/lib/mgnu/parser/sam/header.rb +23 -0
- data/lib/mgnu/parser/sam/pair.rb +18 -0
- data/lib/mgnu/sequence.rb +207 -0
- data/lib/mgnu/sequence/fasta.rb +79 -0
- data/lib/mgnu/sequence/fastq.rb +43 -0
- data/lib/mgnu/version.rb +16 -0
- data/mgnu.gemspec +39 -0
- data/spec/mgnu/parser/blast_format0_spec.rb +114 -0
- data/spec/mgnu/parser/blast_format7_spec.rb +24 -0
- data/spec/mgnu/parser/blast_format8_spec.rb +26 -0
- data/spec/mgnu/parser/blast_multihsp_spec.rb +100 -0
- data/spec/mgnu/parser/blast_oof_spec.rb +53 -0
- data/spec/mgnu/parser/clustalw_spec.rb +90 -0
- data/spec/mgnu/parser/fasta_header_index_tc_parser_spec.rb +25 -0
- data/spec/mgnu/parser/fasta_index_tc_parser_spec.rb +25 -0
- data/spec/mgnu/parser/fasta_parser_spec.rb +53 -0
- data/spec/mgnu/parser_spec.rb +22 -0
- data/spec/mgnu/sequence/fasta_spec.rb +60 -0
- data/spec/mgnu/sequence/fastq_spec.rb +31 -0
- data/spec/mgnu/sequence_spec.rb +81 -0
- data/spec/mgnu_spec.rb +7 -0
- data/spec/spec_helper.rb +53 -0
- metadata +376 -0
@@ -0,0 +1,22 @@
|
|
1
|
+
module MgNu
  module Parser
    class Sam
      # Holder for the per-alignment fields exposed by the accessors below.
      class Alignment
        attr_accessor :name, :flag, :hit, :position, :quality, :cigar,
                      :mate_ref, :mate_pos, :distance, :sequence,
                      :query_qual, :other

        # Build an Alignment, optionally pre-populated from a hash.
        #
        # @param attributes [Hash] attribute name => value pairs
        def initialize(attributes = {})
          self.attributes = attributes
        end

        # Bulk-assign attributes. Keys without a matching writer method are
        # silently ignored.
        #
        # @param attributes [Hash] attribute name => value pairs
        def attributes=(attributes = {})
          attributes.each do |key, val|
            writer = "#{key}="
            next unless respond_to?(writer)
            send(writer, val)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module MgNu
  module Parser
    class Sam
      # Container for the header fields vn, so, sq, rg, pg and co.
      class Header
        include MgNu::Loggable

        attr_accessor :vn, :so, :sq, :rg, :pg, :co

        # create a new Header object
        #
        # Each recognised key is copied into the accessor of the same name;
        # keys that are absent leave the accessor nil. (Previously the values
        # were merged into a local hash and discarded, so every accessor
        # stayed nil.)
        #
        # @param options [Hash] values keyed by :vn, :so, :sq, :rg, :pg, :co
        def initialize(options = {})
          @vn = options[:vn]
          @so = options[:so]
          @sq = options[:sq]
          @rg = options[:rg]
          @pg = options[:pg]
          @co = options[:co]
        end
      end
    end
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module MgNu
  module Parser
    # Sam is opened as a class here, matching the Alignment and Header
    # definitions; reopening it as a module would raise TypeError once those
    # files are loaded.
    class Sam
      # Groups two records under a shared name.
      class Pair
        attr_accessor :name, :first, :second

        # create a new Pair object
        #
        # @param name [Object] identifier shared by both members
        # @param first [Object] first member of the pair
        # @param second [Object] second member of the pair
        def initialize(name, first, second)
          @name = name
          @first = first
          @second = second
        end
      end
    end
  end
end
|
@@ -0,0 +1,207 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
require_relative 'sequence/fasta'
|
3
|
+
require_relative 'sequence/fastq'
|
4
|
+
|
5
|
+
module MgNu
  # A sequence string with an optional type tag.
  #
  # The raw string lives in +value+ (also reachable as +sequence+). +type+ is
  # compared against 'dna', 'rna', 'aa', 'aminoacid' and 'protein' by the
  # predicate methods.
  class Sequence
    attr_accessor :type, :value

    # @param options [Hash] :value or :sequence (the sequence string;
    #   :sequence wins when both are present) and :type
    def initialize(options)
      options = {:value => nil, :type => nil}.merge! options
      @value = options[:value]
      @value = options[:sequence] if options.key?(:sequence)
      @type = options[:type]
    end

    alias_method :sequence, :value
    alias_method :sequence=, :value=

    # @return [Boolean] true when type is 'rna'
    def rna?
      @type == 'rna'
    end

    # @return [Boolean] true when type is 'dna'
    def dna?
      @type == 'dna'
    end

    # @return [Boolean] true when type is 'aa', 'aminoacid' or 'protein'
    def aa?
      @type == 'aa' || @type == 'aminoacid' || @type == 'protein'
    end

    alias_method :protein?, :aa?
    alias_method :aminoacid?, :aa?

    # @return [Integer, nil] length of the value, or nil when no value is set
    def length
      @value.nil? ? nil : @value.length
    end

    # Complement of the sequence (case preserved), without modifying it.
    #
    # For type 'rna' the result uses u instead of t; a and u pair with each
    # other and any t in the input is treated like u. (The previous RNA table
    # left 'a' untouched and turned 't' into 'u'.)
    #
    # @return [String]
    def complement
      if @type == 'rna'
        @value.tr('acgutrymkdhvbACGUTRYMKDHVB', 'ugcaayrkmhdbvUGCAAYRKMHDBV')
      else
        @value.tr('acgtrymkdhvbACGTRYMKDHVB', 'tgcayrkmhdbvTGCAYRKMHDBV')
      end
    end

    # changes sequence @value
    def complement!
      @value = complement
    end

    # @return [String] the complement, reversed
    def reverse_complement
      complement.reverse
    end
    alias_method :revcomp, :reverse_complement

    # Replaces @value with its reverse complement.
    def reverse_complement!
      @value = complement.reverse
    end
    alias_method :revcomp!, :reverse_complement!

    # Translate codons using a lookup table keyed by lower-case triplets.
    # Triplets missing from the table become 'X'; trailing bases that do not
    # fill a codon are dropped.
    #
    # @param frame [Integer] 1..3 read the value as-is; 4..6 and -1..-3 read
    #   the reverse complement; anything else warns on stderr and uses the
    #   first forward frame
    # @param cdn_table [Hash] codon => residue
    # @return [String]
    def translate(frame = 1, cdn_table = MgNu::BACTERIA_CODONS)
      offset, reversed =
        case frame
        when 1, 2, 3 then [frame - 1, false]
        when 4, 5, 6 then [frame - 4, true]
        when -1, -2, -3 then [-1 - frame, true]
        else
          $stderr.puts 'unknown frame - defaulting to zero (0)'
          [0, false]
        end

      template = reversed ? reverse_complement : @value
      usable = template.length - offset
      usable -= usable % 3
      template[offset, usable].downcase.gsub(/.{3}/) { |codon| cdn_table[codon] || 'X' }
    end

    # Replaces @value with its translation.
    def translate!(frame = 1, cdn_table = MgNu::BACTERIA_CODONS)
      @value = translate(frame, cdn_table)
    end

    # Value wrapped at +cols+ characters, each line newline-terminated.
    # A value shorter than +cols+ is returned unchanged (no newline).
    #
    # @param cols [Integer] line width
    # @return [String]
    def to_s(cols = 60)
      return @value if @value.length < cols

      seq = ''
      # Stop at the last valid index: stepping up to @value.length would add
      # an empty trailing line whenever the length is a multiple of cols.
      0.step(@value.length - 1, cols) { |segment| seq << @value[segment, cols] << "\n" }
      seq
    end

    # Genbank formatted sequence 6 cols w/10 letters each, right justified line numbers
    #         1 tcctgatctc ctttatagca ctttccgtga aaattgccaa gcgacctgca tgagttccgg
    #        61 gagcgagaac ttctgcattt aactcacgag gagtaacaat atccactcca ggcagattcc
    #
    # @return [String]
    def to_genbank
      position = 1
      @value.gsub(/.{1,60}/) do |row|
        # double quotes so "\n" is a real newline, not a backslash and an 'n'
        line = format("%9d%s\n", position, row.gsub(/.{1,10}/, ' \0'))
        position += 60
        line
      end
    end

    # returns an array of 1-based position ranges after splitting on runs of
    # N/n that are at least +length+ characters long
    #
    # @param length [Integer] minimum run length that causes a split
    # @return [Array<Range>]
    def nblocks(length = 10)
      gap = /[Nn]{#{length},}/
      pieces = []
      prev = 1
      seq = StringScanner.new(value)
      while seq.scan_until(gap)
        # seq.pos is the 0-based offset just past the run, so subtracting the
        # run length gives the 1-based position of the last base before it
        pieces << (prev..seq.pos - seq.matched.length)
        prev = seq.pos + 1
      end
      pieces << (prev..value.length) # add last piece
      pieces
    end

    # Case-insensitive Levenshtein edit distance to +other+.
    #
    # @param other [MgNu::Sequence, String] subclasses of either are accepted;
    #   any other object is compared as an empty string
    # @return [Integer] the distance, or -1 when either side is nil/empty
    def levenshtein_distance(other)
      return -1 if @value.nil? || @value == ''
      a = @value.upcase
      b = ''

      if other.is_a?(MgNu::Sequence)
        return -1 if other.value.nil? || other.value == ''
        b = other.value.upcase
      elsif other.is_a?(String)
        return -1 if other == ''
        b = other.upcase
      end
      return 0 if a == b

      # classic dynamic programme, keeping only the previous matrix row
      previous = (0..b.length).to_a
      a.each_char do |a_char|
        current = [previous.first + 1]
        1.upto(b.length) do |y|
          cost = a_char == b[y - 1] ? 0 : 1
          current << [previous[y] + 1, current[y - 1] + 1, previous[y - 1] + cost].min
        end
        previous = current
      end
      previous.last
    end # end of levenshtein_distance

    alias_method :distance, :levenshtein_distance

    # Fraction of positions that match +other+ case-insensitively, compared
    # position by position and divided by the longer of the two lengths.
    # Positions past the end of the shorter string count as mismatches.
    #
    # @param other [MgNu::Sequence, String] subclasses of either are accepted
    # @return [Float, Integer] identity in 0.0..1.0, or -1 when either side
    #   is nil/empty
    def percent_identity(other)
      return -1 if @value.nil? || @value == ''
      a = @value
      b = ''

      if other.is_a?(MgNu::Sequence)
        return -1 if other.value == '' || other.value.nil?
        b = other.value
      elsif other.is_a?(String)
        return -1 if other == ''
        b = other
      end
      return 1.0 if a == b

      if a.length != b.length
        warn('lengths differ - percent identity may is probably inaccurate')
      end

      # only the overlapping prefix can match; indexing past the end of the
      # shorter string used to raise NoMethodError on nil
      overlap = [a.length, b.length].min
      match = (0...overlap).count { |i| a[i].upcase == b[i].upcase }

      match / [a.length, b.length].max.to_f
    end # end of percent_identity

    alias_method :identity, :percent_identity

    # GC fraction formatted to four decimals.
    #
    # '*' and N/n are skipped; characters outside the counted IUPAC codes are
    # reported on stderr and skipped. The codes G, C, R, K, S, B, D and V are
    # all counted towards GC, as in the original implementation
    # (NOTE(review): several of these are ambiguity codes - confirm intent).
    #
    # @return [String, Integer] e.g. '0.5000'; -1 when the value is nil/empty
    #   or contains no countable characters
    def gc_content
      return -1 if @value == '' || @value.nil?
      base2count = {'A' => 0, 'C' => 0, 'G' => 0, 'T' => 0, 'U' => 0,
                    'R' => 0, 'Y' => 0, 'M' => 0, 'K' => 0, 'W' => 0,
                    'S' => 0, 'B' => 0, 'D' => 0, 'H' => 0, 'V' => 0}
      @value.each_char do |base|
        code = base.upcase
        next if code == '*' || code == 'N'
        if base2count.key?(code)
          base2count[code] += 1
        else
          $stderr.puts "Unknown character #{code}"
        end
      end
      gc = %w[G C R K S B D V].inject(0) { |sum, code| sum + base2count[code] }
      # the old block `a + e.nil? ? 0 : e` parsed as `(a + e.nil?) ? 0 : e`
      # and raised TypeError on every call
      total = base2count.values.inject(0) { |sum, count| sum + count }
      return -1 if total.zero?
      format('%.4f', gc.to_f / total)
    end
  end # end of Sequence class
end # end of MgNu module
|
@@ -0,0 +1,79 @@
|
|
1
|
+
#require 'mgnu/sequence'
|
2
|
+
|
3
|
+
module MgNu
  class Sequence
    # A sequence with a FASTA header line.
    class Fasta < Sequence
      attr_accessor :header, :header_name, :header_description

      # create a new MgNu::Sequence::Fasta object
      #
      # @param options [Hash] forwarded to Sequence#initialize; :header is the
      #   header text (to_s adds the leading '>'). The first whitespace-
      #   delimited token becomes header_name, the rest header_description.
      def initialize(options)
        super(options)
        options = {:header => nil}.merge! options
        @header = options[:header]
        # to_s so the documented nil default no longer raises on nil.split
        temp = @header.to_s.split
        @header_name = temp.shift
        @header_description = temp.length > 0 ? temp.join(' ') : nil
      end

      # split sequence into columns
      #
      # @param cols [Integer] line width
      # @return [String] the sequence unchanged when shorter than cols,
      #   otherwise newline-terminated lines of at most cols characters
      def sequence_by_columns(cols = 60)
        seq = ''
        if length < cols
          seq << sequence
        else
          # stop at the last valid index so a length that is an exact
          # multiple of cols does not append an extra empty line
          0.step(length - 1, cols) { |segment| seq << sequence[segment, cols] << "\n" }
        end
        seq
      end

      # override to_s string representation
      #
      # A sequence that looks like whitespace-separated numbers is treated as
      # quality scores and written 17 per line. Otherwise the sequence is
      # wrapped at cols, or left unbroken when cols is -1.
      #
      # @param cols [Integer] line width, or -1 for no wrapping
      # @return [String] '>' + header, a newline, then the formatted body
      def to_s(cols = 60)
        seq =
          if sequence =~ /\d+\s+\d+/
            # this is a fasta quality sequence
            sequence.split(/\s+/).each_slice(17).map { |row| "#{row.join(' ')}\n" }.join
          elsif cols == -1 # don't break the sequence up
            sequence
          else
            length < cols ? sequence : sequence_by_columns(cols)
          end
        ">#{@header}\n#{seq}"
      end

      # find runs of N characters in the sequence and split
      #
      # @param min_n [Integer] minimum run of n/N that causes a split
      # @return [String] one FASTA record per chunk, named
      #   "<header_name>_<index>", or plain to_s when there is nothing to split
      def split_on_n(min_n = 10)
        chunks = sequence.split(/[nN]{#{min_n},}/)
        return to_s unless chunks.length > 1

        chunks.each_with_index.map do |chunk, i|
          ">#{@header_name}_#{i + 1} #{@header_description}\n#{chunk}\n"
        end.join
      end
    end # end of Fasta class
  end # end of Sequence class
end # end of MgNu module
|
@@ -0,0 +1,43 @@
|
|
1
|
+
#require 'mgnu/sequence'
|
2
|
+
|
3
|
+
module MgNu
  class Sequence
    # A sequence with a FASTQ header and quality string.
    class Fastq < Sequence
      attr_accessor :header, :header_name, :header_description
      attr_accessor :quality, :qualhdr, :qualary, :offset

      # create a new MgNu::Sequence::Fastq object
      #
      # @param options [Hash] forwarded to Sequence#initialize, plus:
      #   :header  header text (to_s adds the leading '@')
      #   :quality encoded quality string
      #   :offset  value subtracted from each quality byte (default 64)
      #   :qualhdr stored only when truthy
      def initialize(options)
        super(options)
        options = {:offset => 64, :header => nil, :quality => nil}.merge! options
        @quality = options[:quality]
        @offset = options[:offset]
        @header = options[:header]
        # to_s so the documented nil default no longer raises on nil.split
        temp = @header.to_s.split
        @header_name = temp.shift
        @header_description = temp.length > 0 ? temp.join(' ') : nil
        @qualhdr = options[:qualhdr] if options[:qualhdr]
      end

      # @return [MgNu::Sequence::Fasta] a Fasta with the same header and sequence
      def to_fasta
        MgNu::Sequence::Fasta.new(:header => @header, :sequence => sequence)
      end

      # override to_s representation
      #
      # @return [String] four-line record: '@' + header, sequence, '+', quality
      def to_s
        "@#{@header}\n#{sequence}\n+\n#{@quality}\n"
      end

      # Unpack the quality string and return an array of
      # offset-corrected integers.
      #
      # The offset comes from the :offset option given at construction
      # (default 64) and is subtracted from each byte of the quality string.
      # The result is recomputed on every call; it is not cached.
      #
      # @return [Array<Integer>] the array of integers
      def unpack_quality
        @quality.unpack('C*').map! { |x| x - @offset }
      end
    end # end of Fastq class
  end # end of Sequence class
end # end of MgNu module
|
data/lib/mgnu/version.rb
ADDED
data/mgnu.gemspec
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'mgnu/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
  # identity and descriptive metadata
  spec.name        = "mgnu"
  spec.version     = MgNu::Version
  spec.authors     = ["Brian C. Thomas"]
  spec.email       = ["bct.x42@gmail.com"]
  spec.summary     = %q{MgNu Ruby Bioinformatics Library}
  spec.description = %q{Lightweight ruby bioinformatics library}
  spec.homepage    = "http://www.metagenomi.co"
  spec.license     = "MIT"

  # packaged files: fixed top-level files plus everything under lib/ and spec/
  # spec.files         = `git ls-files -z`.split("\x0")
  spec.files = %w(.yardopts README.md Rakefile mgnu.gemspec) +
               Dir.glob('lib/**/*.rb') +
               Dir.glob('spec/**/*')

  # spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # runtime dependencies: [gem name, version requirements...]
  [
    ['naught',       '~> 0.0',  '>= 0.0.1'],
    ['moneta',       '~> 0.0',  '>= 0.0.1'],
    ['tokyocabinet', '~> 1.29', '>= 1.29.1'],
    ['ox',           '~> 2.8',  '>= 2.8.2'],
    ['memoizable',   '~> 0.0',  '>= 0.0.1'],
    ['yard',         '~> 0.9',  '>= 0.9.11'],
    ['rake',         '~> 12.0', '>= 12.0.0']
  ].each do |gem_name, *requirements|
    spec.add_dependency(gem_name, *requirements)
  end

  # development-only dependencies
  [
    ['mime-types', '~> 0.0',  '> 0.0.1'],
    ['rspec',      '~> 0.0',  '> 0.0.1'],
    ['rubocop',    '~> 0.49', '>= 0.49.0'],
    ['timecop',    '~> 0.0',  '> 0.0.1'],
    ['yardstick',  '~> 0.0',  '> 0.0.1'],
    ['simplecov',  '~> 0.0',  '> 0.0.1']
  ].each do |gem_name, *requirements|
    spec.add_development_dependency(gem_name, *requirements)
  end
end
|