mgnu 2.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +0 -0
- data/README.md +31 -0
- data/Rakefile +33 -0
- data/lib/mgnu.rb +9 -0
- data/lib/mgnu/alignment.rb +143 -0
- data/lib/mgnu/common.rb +68 -0
- data/lib/mgnu/genbank.rb +117 -0
- data/lib/mgnu/genbank/feature.rb +84 -0
- data/lib/mgnu/genbank/location.rb +150 -0
- data/lib/mgnu/genbank/qualifier.rb +45 -0
- data/lib/mgnu/genbank/reference.rb +114 -0
- data/lib/mgnu/genbank/source.rb +39 -0
- data/lib/mgnu/loggable.rb +61 -0
- data/lib/mgnu/parser.rb +50 -0
- data/lib/mgnu/parser/blast.rb +87 -0
- data/lib/mgnu/parser/blast/format0.rb +290 -0
- data/lib/mgnu/parser/blast/format7.rb +121 -0
- data/lib/mgnu/parser/blast/format8.rb +120 -0
- data/lib/mgnu/parser/blast/hsp.rb +75 -0
- data/lib/mgnu/parser/blast/query.rb +45 -0
- data/lib/mgnu/parser/blast/sbjct.rb +62 -0
- data/lib/mgnu/parser/clustalw.rb +72 -0
- data/lib/mgnu/parser/fasta.rb +61 -0
- data/lib/mgnu/parser/fasta_header_index.rb +39 -0
- data/lib/mgnu/parser/fasta_index.rb +57 -0
- data/lib/mgnu/parser/fastq.rb +61 -0
- data/lib/mgnu/parser/genbank.rb +187 -0
- data/lib/mgnu/parser/gff.rb +56 -0
- data/lib/mgnu/parser/iprscan/hit.rb +76 -0
- data/lib/mgnu/parser/iprscan_file.rb +39 -0
- data/lib/mgnu/parser/kegg_ontology_index.rb +163 -0
- data/lib/mgnu/parser/pilercr.rb +102 -0
- data/lib/mgnu/parser/prodigal.rb +170 -0
- data/lib/mgnu/parser/sam.rb +115 -0
- data/lib/mgnu/parser/sam/alignment.rb +22 -0
- data/lib/mgnu/parser/sam/header.rb +23 -0
- data/lib/mgnu/parser/sam/pair.rb +18 -0
- data/lib/mgnu/sequence.rb +207 -0
- data/lib/mgnu/sequence/fasta.rb +79 -0
- data/lib/mgnu/sequence/fastq.rb +43 -0
- data/lib/mgnu/version.rb +16 -0
- data/mgnu.gemspec +39 -0
- data/spec/mgnu/parser/blast_format0_spec.rb +114 -0
- data/spec/mgnu/parser/blast_format7_spec.rb +24 -0
- data/spec/mgnu/parser/blast_format8_spec.rb +26 -0
- data/spec/mgnu/parser/blast_multihsp_spec.rb +100 -0
- data/spec/mgnu/parser/blast_oof_spec.rb +53 -0
- data/spec/mgnu/parser/clustalw_spec.rb +90 -0
- data/spec/mgnu/parser/fasta_header_index_tc_parser_spec.rb +25 -0
- data/spec/mgnu/parser/fasta_index_tc_parser_spec.rb +25 -0
- data/spec/mgnu/parser/fasta_parser_spec.rb +53 -0
- data/spec/mgnu/parser_spec.rb +22 -0
- data/spec/mgnu/sequence/fasta_spec.rb +60 -0
- data/spec/mgnu/sequence/fastq_spec.rb +31 -0
- data/spec/mgnu/sequence_spec.rb +81 -0
- data/spec/mgnu_spec.rb +7 -0
- data/spec/spec_helper.rb +53 -0
- metadata +376 -0
@@ -0,0 +1,120 @@
|
|
1
|
+
require 'mgnu/parser/blast/query'
|
2
|
+
require 'mgnu/parser/blast/sbjct'
|
3
|
+
require 'mgnu/parser/blast/hsp'
|
4
|
+
|
5
|
+
|
6
|
+
module MgNu
  module Parser
    class Blast
      # Parser for BLAST "format 8" reports: tab-delimited text with one HSP
      # per line and optional '#' comment lines.
      #
      # Columns are consumed in this order: query id, subject id, identity,
      # alignment length, mismatches, gap count, query from, query to,
      # subject from, subject to, e-value, bit score.
      #
      # Consecutive lines sharing a query id are grouped into one Query, and
      # within a query, consecutive lines sharing a subject id are grouped
      # into one Sbjct holding one Hsp per line.
      class Format8
        include Enumerable

        attr_accessor :queries

        # create a new Format8 parser object
        #
        # @param input [#each] line source (anything whose #each yields
        #   strings, e.g. an IO or an Array of lines)
        def initialize(input)
          @query = nil
          @sbjct = nil
          @queries = []

          @input = input
        end

        # Streams the report, yielding each Query as soon as all of its lines
        # have been read. Lines are split on tabs.
        #
        # The last query of the input is yielded too (it used to be dropped
        # because nothing flushed it after the final line).
        #
        # @yield [MgNu::Parser::Blast::Query] one fully populated query
        # @return [Enumerator] when called without a block
        # @return [self] otherwise
        def each(&block)
          return enum_for(:each) unless block_given?

          each_query(/\t/, &block)
          self
        end

        # Reads the whole report and appends every Query to #queries.
        # Lines are split on any whitespace.
        #
        # @return [Array<MgNu::Parser::Blast::Query>] the #queries array
        def parse
          each_query(nil) { |query| @queries << query }
          @queries
        end

        # Consumes the subject id from the front of +input+ and routes the
        # remaining fields to #extract_hsp. A change of subject id stores the
        # previous Sbjct on the current query and starts a new one.
        #
        # @param input [Array<String>] fields of one line, query id removed
        def extract_sbjct(input)
          sbjct_id = input.shift

          if @sbjct.nil? || @sbjct.sbjct_id != sbjct_id
            @query.sbjcts << @sbjct unless @sbjct.nil?
            @sbjct = Sbjct.new
            @sbjct.sbjct_id = sbjct_id
          end

          extract_hsp(input)
        end

        # Builds an Hsp from the ten numeric columns left in +input+ and
        # appends it to the current subject.
        #
        # @param input [Array<String>] fields of one line with the query and
        #   subject ids already removed
        def extract_hsp(input)
          hsp = Hsp.new
          hsp.identity   = input.shift.to_f
          hsp.length     = input.shift.to_i
          hsp.mismatches = input.shift.to_i
          hsp.gap_count  = input.shift.to_i
          hsp.query_from = input.shift.to_i
          hsp.query_to   = input.shift.to_i
          hsp.sbjct_from = input.shift.to_i
          hsp.sbjct_to   = input.shift.to_i
          hsp.evalue     = input.shift.to_f
          hsp.bit_score  = input.shift.to_f
          @sbjct.hsps << hsp
        end

        private

        # Shared driver behind #each and #parse: groups lines by query id and
        # yields each finished Query, including the last one.
        #
        # @param delimiter [Regexp, nil] field separator; nil splits on any
        #   whitespace
        def each_query(delimiter)
          # start from a clean slate so a previous run cannot leak into this one
          @query = nil
          @sbjct = nil

          @input.each do |line|
            next if line =~ /^#/ # skip comments
            next if line.strip.empty? # a blank line carries no hit

            fields = delimiter ? line.chomp.split(delimiter) : line.split
            query_id = fields.shift

            # a new query id closes the query collected so far
            yield finish_query if @query && @query.query_id != query_id

            if @query.nil?
              @query = Query.new
              @query.query_id = query_id
            end

            extract_sbjct(fields)
          end

          # grab the last one, if present
          yield finish_query unless @query.nil?
        end

        # Stores the pending subject on the current query, clears the parser
        # state and returns the completed query.
        def finish_query
          @query.sbjcts << @sbjct unless @sbjct.nil?
          finished = @query
          @query = nil
          @sbjct = nil
          finished
        end
      end # end of MgNu::Parser::Blast::Format8 class
    end # end of MgNu::Parser::Blast class
  end # end of MgNu::Parser module
end # end of MgNu module
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module MgNu
  module Parser
    class Blast
      # Plain data holder for one HSP of a BLAST hit: scores, coordinates,
      # frames, the aligned strings and back-references to the owning
      # subject and query. Every field is a read/write attribute.
      class Hsp
        attr_accessor :number, :bit_score, :score, :evalue,
                      :query_from, :query_to, :sbjct_from, :sbjct_to,
                      :query_frame, :sbjct_frame, :identity, :positive,
                      :length, :query_sequence, :sbjct_sequence, :midline,
                      :gap_count, :mismatches, :sbjct, :query

        # create a new Hsp object; the three alignment strings start out
        # empty, everything else starts out nil
        def initialize
          @number = @bit_score = @score = @evalue = nil
          @query_from = @query_to = @sbjct_from = @sbjct_to = nil
          @query_frame = @sbjct_frame = nil
          @identity = @positive = @length = nil
          @query_sequence = ""
          @sbjct_sequence = ""
          @midline = ""
          @gap_count = @mismatches = nil
          @sbjct = @query = nil
        end

        # Locates the frameshift markers ('/', '//', '\' and '\\') in the
        # aligned query string.
        #
        # The string is scanned with '-' and ' ' removed. For a marker found
        # at index i the reported location is query_from + (i * 3 - 1), or
        # query_from - (i * 3 - 1) when query_from is greater than query_to.
        #
        # NOTE(review): matching is delegated to Regexp#global_match, which
        # is not defined in this file - presumably a project extension;
        # confirm how it reports match offsets.
        #
        # @return [Hash, nil] location => frame (1, 2, -1 or -2), or nil when
        #   the query string contains no marker
        def query_frameshifts
          return nil unless @query_sequence =~ /(?:\/|\\)/

          marker_frames = { '/' => 1, '//' => 2, '\\' => -1, '\\\\' => -2 }
          shifts = {}
          ungapped = @query_sequence.gsub(/[- ]/,'')

          /[\/\\]{1,2}/.global_match(ungapped) do |hit|
            offset = hit.begin(0) * 3 - 1
            position =
              if @query_from > @query_to
                @query_from - offset
              else
                offset + @query_from
              end
            # any other two-character mix of markers maps to nil
            shifts[position] = marker_frames[hit[0]]
          end

          shifts
        end # end query_frameshifts
      end # end of MgNu::Parser::Blast::Hsp class
    end # end of MgNu::Parser::Blast class
  end # end of MgNu::Parser module
end # end of MgNu module
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module MgNu
  module Parser
    class Blast
      # One query of a BLAST report together with the subjects it hit and
      # the details of the database that was searched.
      class Query
        attr_accessor :number, :query_id, :definition, :length, :sbjcts
        attr_accessor :database, :database_sequence_count, :database_total_letters

        # create a new Query object
        def initialize
          @number = nil
          @query_id = ""
          @definition = ""
          @length = nil
          @sbjcts = []
          @best_hit = nil
          @database = nil
          @database_sequence_count = 0
          @database_total_letters = 0
        end

        # Returns the subject with the lowest e-value. The answer is cached
        # in @best_hit after the first successful lookup, so subjects added
        # afterwards are not considered.
        #
        # Subjects whose evalue is nil are left out of the comparison
        # (comparing nil used to raise NoMethodError). Ties keep the subject
        # that comes first. When no subject has an e-value, the first
        # subject is returned.
        #
        # @return [MgNu::Parser::Blast::Sbjct, nil] the best hit for this
        #   query, or nil when the query has no subjects
        def best_hit
          return @best_hit unless @best_hit.nil?

          scored = @sbjcts.reject { |s| s.evalue.nil? }
          @best_hit = scored.min_by { |s| s.evalue } || @sbjcts.first
        end
      end # end of MgNu::Parser::Blast::Query class
    end # end of MgNu::Parser::Blast class
  end # end of MgNu::Parser module
end # end of MgNu module
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module MgNu
  module Parser
    class Blast
      # One subject (database sequence) hit by a query, together with the
      # HSPs reported for it.
      class Sbjct
        attr_accessor :hsps
        attr_accessor :number, :sbjct_id, :definition, :length
        attr_accessor :accession, :query

        # create a new Sbjct object
        def initialize
          @number = nil
          @sbjct_id = ""
          @definition = ""
          @length = nil
          @accession = ""
          @hsps = []
          @best_hsp = nil
          @query = nil
        end

        # @return [Float, nil] e-value of the best HSP, nil without HSPs
        def evalue
          hsp = best_hsp
          hsp && hsp.evalue
        end # end of Sbjct#evalue

        # @return [Float, nil] bit score of the best HSP, nil without HSPs
        def bit_score
          hsp = best_hsp
          hsp && hsp.bit_score
        end # end of Sbjct#bit_score

        # @return [Float, nil] identity of the best HSP, nil without HSPs
        def identity
          hsp = best_hsp
          hsp && hsp.identity
        end # end of Sbjct#identity

        # Finds the HSP with the lowest e-value and caches it in @best_hsp,
        # so HSPs added after the first successful lookup are not considered.
        #
        # HSPs whose evalue is nil are left out of the comparison (comparing
        # nil used to raise NoMethodError). Ties keep the HSP that comes
        # first. When no HSP has an e-value, the first HSP is returned.
        #
        # @return [MgNu::Parser::Blast::Hsp, nil] nil when there are no HSPs
        def best_hsp
          @best_hsp ||= @hsps.reject { |h| h.evalue.nil? }.min_by { |h| h.evalue } || @hsps.first
        end # end of Sbjct#best_hsp
      end # end of MgNu::Parser::Blast::Sbjct class
    end # end of MgNu::Parser::Blast class
  end # end of MgNu::Parser module
end # end of MgNu module
|
@@ -0,0 +1,72 @@
|
|
1
|
+
module MgNu
  module Parser
    # ClustalW is the class used for parsing clustalw multiple alignment output.
    #
    # The raw text is cut into sections at blank lines. The first section is
    # treated as the header and dropped; in every other section the last
    # line is dropped (the match line) and each remaining line is split into
    # a sequence name and a sequence chunk. Chunks of the same name are
    # concatenated across sections.
    class ClustalW
      attr_accessor :buffer, :raw
      attr_reader :file, :alignment

      # Reads the alignment and parses it straight away.
      #
      # @param input [String] alignment file (*.aln), or the alignment text
      #   itself when +file+ is false
      # @param file [Boolean] is +input+ a file name (default), or a string
      #   holding the alignment?
      # @raise [RuntimeError] when the file cannot be read, or when the text
      #   holds no alignment section after the header
      def initialize(input = nil, file = true)
        if input
          if file
            # File.exists? was removed in Ruby 3.2; File.exist? works everywhere
            unless File.exist?(input) && File.readable?(input)
              raise "MgNu::Parser::ClustalW.new(): cannot read alignment file #{input}"
            end
            @raw = File.read(input)
          else # file is false, so this must be a string with input
            @raw = input
          end

          @buffer = @raw.split(/\r?\n\r?\n/)
          # header plus at least one alignment section are required; checking
          # here gives a clear error instead of a NoMethodError inside #parse
          if @buffer.length < 2
            source = file ? "file #{input}" : "input"
            raise "ClustalW alignment #{source} did not parse!"
          end

          @alignment = nil
          parse
        else
          error("MgNu::Parser::ClustalW.new(): need an existing file")
        end
      end

      # process the input multiple alignment
      #
      # Calling it again after a successful parse returns the existing
      # alignment instead of replacing it with an empty one.
      #
      # @return [MgNu::Alignment] built from a name => sequence hash and the
      #   list of names in order of first appearance
      def parse
        return @alignment unless @alignment.nil?

        @buffer.shift # the header section is not needed
        @buffer[0].gsub!(/^(\r?\n)+/, '') # drop newline at start of section
        @buffer.collect! { |section| section.split(/\r?\n/) }

        @buffer.each do |section|
          # drop numbers if the alignment was run with "-SEQNOS=on"
          section.each { |line| line.sub!(/\s+\d+\s*$/, '') }
          section.pop # last line of a section is the match line
        end

        # get the 1st position of a space from the right using
        # rindex. Increment this by 1 to get the seq_start
        seq_start = (@buffer[0][0].rindex(/\s/) || -1) + 1

        # seqname => sequence, plus the names in order of first appearance
        order = []
        sequences = {}
        @buffer.each do |section|
          section.each do |line|
            name = line[0, seq_start].sub(/\s+\z/, '')
            chunk = line[seq_start..-1]
            if sequences.key?(name)
              sequences[name] += chunk
            else
              order << name
              sequences[name] = chunk
            end
          end
        end

        @alignment = MgNu::Alignment.new(sequences, order)
      end # end of #parse method
    end # end of MgNu::Parser::ClustalW class
  end # end of MgNu::Parser module
end # end of MgNu module
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module MgNu
  module Parser
    # Line-oriented FASTA reader that yields one MgNu::Sequence::Fasta per
    # record. With quality_file set, the lines of a record are joined with a
    # single space instead of being concatenated.
    class Fasta
      include Enumerable

      attr_reader :file, :filename

      # create a new Fasta parser
      #
      # @param filename [String] path of the file to read
      # @param quality_file [Boolean] join record lines with " " rather
      #   than ""
      # @raise [RuntimeError] when the file is missing or unreadable
      def initialize(filename = nil, quality_file = false)
        @quality_file = quality_file
        @filename = filename
        if filename
          # File.exists? was removed in Ruby 3.2; File.exist? works everywhere
          unless File.exist?(filename) && File.readable?(filename)
            raise "\n\n -- No file by that name (#{filename}). Exiting\n\n"
          end
          @file = File.open(filename)
        else
          error("MgNu::Parser::Fasta.new(): need a filename or an existing file")
        end
      end

      # override enumerables
      #
      # A record starts at a line beginning with '>'; the rest of that line
      # is the header. Lines before the first header are ignored. The file
      # handle is closed when iteration ends - also when the caller stops
      # early - and is reopened on the next call, so the parser can be
      # iterated more than once.
      #
      # @yield [MgNu::Sequence::Fasta] one object per record
      # @return [Enumerator] when called without a block
      # @return [self] otherwise
      def each
        return enum_for(:each) unless block_given?

        @file = File.open(@filename) if @file.closed?
        header = nil
        chunks = []
        begin
          @file.each_line do |line|
            line.chomp!
            if line =~ /\A>(.*)/ # got a header line
              next_header = $1 # capture before yielding
              yield build_record(header, chunks) unless header.nil?
              header = next_header
              chunks = []
            elsif header # got a sequence line
              chunks << line
            end
          end
        ensure
          @file.close
        end

        # don't forget to yield the last one
        yield build_record(header, chunks) unless header.nil?
        self
      end # end of #each

      private

      # Wraps one header and its lines in a MgNu::Sequence::Fasta.
      def build_record(header, chunks)
        MgNu::Sequence::Fasta.new(:header => header,
                                  :sequence => chunks.join(@quality_file ? " " : ""))
      end
    end # end of MgNu::Parser::Fasta class
  end # end of MgNu::Parser module
end # end of MgNu module
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'moneta'
|
2
|
+
module MgNu
  module Parser
    # Key/value index from FASTA header name to header description, kept in
    # a Moneta store opened with the :TokyoCabinet adapter.
    #
    # The store file is "<filename>.hdr.tch", or the filename itself when it
    # already ends in ".hdr.tch". If the store file does not exist yet, it
    # is filled from the FASTA file when the object is created.
    class FastaHeaderIndex
      attr_reader :filename, :db_name, :db

      # @param filename [String] FASTA file, or an existing *.hdr.tch store
      def initialize(filename)
        @filename = filename
        @db_name = (filename =~ /^.+\.hdr\.tch$/) ? @filename : @filename + ".hdr.tch"

        # decide before opening: the existence check is only meaningful while
        # the store has not been opened yet
        needs_build = !File.exist?(@db_name)
        @db = Moneta.new(:TokyoCabinet, file: @db_name)
        parse if needs_build
      end

      # setup parse method for creating tokyo cabinet
      #
      # Stores header_name => header_description for every record that
      # MgNu::Parser::Fasta yields for the file.
      def parse
        MgNu::Parser::Fasta.new(@filename).each do |f|
          @db[f.header_name] = f.header_description
        end
      end # end of #parse

      # Looks up a header description with a single read of the store.
      #
      # @param name [String] header name
      # @return [String, nil] the stored description, or nil when the store
      #   returns nothing (or false) for the name
      def [](name)
        @db[name] || nil
      end

      # Closes the underlying store, if there is one.
      def close
        @db.close unless @db.nil?
      end
    end # end of MgNu::Parser::FastaHeaderIndex class
  end # end of MgNu::Parser module
end # end of MgNu module
|