mgnu 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +0 -0
- data/README.md +31 -0
- data/Rakefile +33 -0
- data/lib/mgnu.rb +9 -0
- data/lib/mgnu/alignment.rb +143 -0
- data/lib/mgnu/common.rb +68 -0
- data/lib/mgnu/genbank.rb +117 -0
- data/lib/mgnu/genbank/feature.rb +84 -0
- data/lib/mgnu/genbank/location.rb +150 -0
- data/lib/mgnu/genbank/qualifier.rb +45 -0
- data/lib/mgnu/genbank/reference.rb +114 -0
- data/lib/mgnu/genbank/source.rb +39 -0
- data/lib/mgnu/loggable.rb +61 -0
- data/lib/mgnu/parser.rb +50 -0
- data/lib/mgnu/parser/blast.rb +87 -0
- data/lib/mgnu/parser/blast/format0.rb +290 -0
- data/lib/mgnu/parser/blast/format7.rb +121 -0
- data/lib/mgnu/parser/blast/format8.rb +120 -0
- data/lib/mgnu/parser/blast/hsp.rb +75 -0
- data/lib/mgnu/parser/blast/query.rb +45 -0
- data/lib/mgnu/parser/blast/sbjct.rb +62 -0
- data/lib/mgnu/parser/clustalw.rb +72 -0
- data/lib/mgnu/parser/fasta.rb +61 -0
- data/lib/mgnu/parser/fasta_header_index.rb +39 -0
- data/lib/mgnu/parser/fasta_index.rb +57 -0
- data/lib/mgnu/parser/fastq.rb +61 -0
- data/lib/mgnu/parser/genbank.rb +187 -0
- data/lib/mgnu/parser/gff.rb +56 -0
- data/lib/mgnu/parser/iprscan/hit.rb +76 -0
- data/lib/mgnu/parser/iprscan_file.rb +39 -0
- data/lib/mgnu/parser/kegg_ontology_index.rb +163 -0
- data/lib/mgnu/parser/pilercr.rb +102 -0
- data/lib/mgnu/parser/prodigal.rb +170 -0
- data/lib/mgnu/parser/sam.rb +115 -0
- data/lib/mgnu/parser/sam/alignment.rb +22 -0
- data/lib/mgnu/parser/sam/header.rb +23 -0
- data/lib/mgnu/parser/sam/pair.rb +18 -0
- data/lib/mgnu/sequence.rb +207 -0
- data/lib/mgnu/sequence/fasta.rb +79 -0
- data/lib/mgnu/sequence/fastq.rb +43 -0
- data/lib/mgnu/version.rb +16 -0
- data/mgnu.gemspec +39 -0
- data/spec/mgnu/parser/blast_format0_spec.rb +114 -0
- data/spec/mgnu/parser/blast_format7_spec.rb +24 -0
- data/spec/mgnu/parser/blast_format8_spec.rb +26 -0
- data/spec/mgnu/parser/blast_multihsp_spec.rb +100 -0
- data/spec/mgnu/parser/blast_oof_spec.rb +53 -0
- data/spec/mgnu/parser/clustalw_spec.rb +90 -0
- data/spec/mgnu/parser/fasta_header_index_tc_parser_spec.rb +25 -0
- data/spec/mgnu/parser/fasta_index_tc_parser_spec.rb +25 -0
- data/spec/mgnu/parser/fasta_parser_spec.rb +53 -0
- data/spec/mgnu/parser_spec.rb +22 -0
- data/spec/mgnu/sequence/fasta_spec.rb +60 -0
- data/spec/mgnu/sequence/fastq_spec.rb +31 -0
- data/spec/mgnu/sequence_spec.rb +81 -0
- data/spec/mgnu_spec.rb +7 -0
- data/spec/spec_helper.rb +53 -0
- metadata +376 -0
@@ -0,0 +1,120 @@
|
|
1
|
+
require 'mgnu/parser/blast/query'
|
2
|
+
require 'mgnu/parser/blast/sbjct'
|
3
|
+
require 'mgnu/parser/blast/hsp'
|
4
|
+
|
5
|
+
|
6
|
+
module MgNu
  module Parser
    class Blast
      # Parser for BLAST "format 8" hit tables: one HSP per line, with the
      # columns query id, subject id, identity, alignment length,
      # mismatches, gap count, query from/to, subject from/to, e-value and
      # bit score. Lines starting with '#' are treated as comments.
      #
      # Consecutive lines sharing a query id are grouped into one Query;
      # within a query, consecutive lines sharing a subject id are grouped
      # into one Sbjct holding one Hsp per line.
      class Format8
        include Enumerable

        attr_accessor :queries

        # create a new Format8 parser object
        #
        # @param input [#each] any object yielding one line of text per
        #   iteration (an open File, an Array of Strings, ...)
        def initialize(input)
          @query = nil
          @sbjct = nil
          @queries = []

          @input = input
        end

        # Streams the input, yielding each completed Query as soon as the
        # next query id is seen, and yielding the final Query once the
        # input is exhausted. Fields are split on tabs.
        #
        # @yieldparam query [MgNu::Parser::Blast::Query]
        # @return [self, Enumerator] an Enumerator when no block is given
        def each
          return enum_for(:each) unless block_given?

          reset_state
          each_record(/\t/) do |query_id, fields|
            yield finish_query if query_changed?(query_id)
            start_query(query_id) if @query.nil?
            extract_sbjct(fields)
          end

          # the last query has no following line to trigger its yield
          yield finish_query unless @query.nil?
          self
        end

        # Reads the whole input and appends every Query to #queries.
        # Fields are split on any run of whitespace.
        #
        # @return [Array<MgNu::Parser::Blast::Query>, nil] #queries, or
        #   nil when the input contained no hit lines
        def parse
          reset_state
          each_record(' ') do |query_id, fields|
            @queries << finish_query if query_changed?(query_id)
            start_query(query_id) if @query.nil?
            extract_sbjct(fields)
          end

          # grab the last one, if present
          @queries << finish_query unless @query.nil?
        end

        # Consumes the subject id from the front of +input+ and attaches
        # the remaining fields as an Hsp to the matching Sbjct, starting a
        # new Sbjct (and storing the previous one on the current query)
        # whenever the subject id changes.
        #
        # @param input [Array<String>] fields of one line, query id
        #   already removed; the array is consumed
        def extract_sbjct(input)
          sbjct_id = input.shift

          if @sbjct.nil?
            start_sbjct(sbjct_id)
          elsif @sbjct.sbjct_id != sbjct_id
            @query.sbjcts << @sbjct
            start_sbjct(sbjct_id)
          end

          extract_hsp(input)
        end

        # Builds an Hsp from the ten numeric columns left in +input+ and
        # appends it to the current Sbjct.
        #
        # @param input [Array<String>] remaining fields of one line; the
        #   array is consumed
        def extract_hsp(input)
          hsp = Hsp.new
          hsp.identity   = input.shift.to_f
          hsp.length     = input.shift.to_i
          hsp.mismatches = input.shift.to_i
          hsp.gap_count  = input.shift.to_i
          hsp.query_from = input.shift.to_i
          hsp.query_to   = input.shift.to_i
          hsp.sbjct_from = input.shift.to_i
          hsp.sbjct_to   = input.shift.to_i
          hsp.evalue     = input.shift.to_f
          hsp.bit_score  = input.shift.to_f
          @sbjct.hsps << hsp
        end

        private

        # Forget any query/subject left over from an earlier (possibly
        # interrupted) pass so that each pass starts clean.
        def reset_state
          @query = nil
          @sbjct = nil
        end

        # Yields the query id and the remaining fields of every hit line,
        # skipping comment lines and blank lines.
        def each_record(delimiter)
          @input.each do |line|
            next if line =~ /^#/ # skip comments
            next if line.strip.empty? # a blank line is not a hit

            fields = line.chomp.split(delimiter)
            yield fields.shift, fields
          end
        end

        # True when a query is in progress and +query_id+ belongs to a
        # different one.
        def query_changed?(query_id)
          !@query.nil? && @query.query_id != query_id
        end

        # Begin collecting hits for +query_id+.
        def start_query(query_id)
          @query = Query.new
          @query.query_id = query_id
        end

        # Begin collecting HSPs for +sbjct_id+.
        def start_sbjct(sbjct_id)
          @sbjct = Sbjct.new
          @sbjct.sbjct_id = sbjct_id
        end

        # Stores the pending Sbjct on the current Query, clears the parser
        # state and returns the completed Query.
        def finish_query
          finished = @query
          finished.sbjcts << @sbjct unless @sbjct.nil?
          reset_state
          finished
        end
      end # end of MgNu::Parser::Blast::Format8 class
    end # end of MgNu::Parser::Blast class
  end # end of MgNu::Parser module
end # end of MgNu module
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module MgNu
  module Parser
    class Blast
      # A single high-scoring segment pair (HSP) of a BLAST hit. Plain
      # value holder; the parsers fill in whichever attributes their
      # report format provides.
      class Hsp
        attr_accessor :number, :bit_score, :score, :evalue,
                      :query_from, :query_to, :sbjct_from, :sbjct_to,
                      :query_frame, :sbjct_frame, :identity, :positive,
                      :length, :query_sequence, :sbjct_sequence, :midline,
                      :gap_count, :mismatches, :sbjct, :query

        # create a new Hsp object; every attribute starts as nil except
        # the three alignment strings, which start empty
        def initialize
          @number = @bit_score = @score = @evalue = nil
          @query_from = @query_to = @sbjct_from = @sbjct_to = nil
          @query_frame = @sbjct_frame = @identity = @positive = nil
          @length = nil
          # three separate String objects, so appending to one never
          # affects the others
          @query_sequence = ""
          @sbjct_sequence = ""
          @midline = ""
          @gap_count = @mismatches = nil
          @sbjct = @query = nil
        end

        # Maps each frameshift marker found in the query sequence to a
        # frame offset: '/' => 1, '//' => 2, '\' => -1, '\\' => -2 (any
        # other one- or two-character mix of slashes maps to nil).
        #
        # The key is a query position computed from the marker's index in
        # the sequence with '-' and ' ' removed:
        # (index * 3 - 1), added to query_from, or subtracted from it when
        # query_from > query_to.
        # NOTE(review): the original comments described the offset as
        # (index - 1) * 3 while the code uses index * 3 - 1 -- confirm
        # which was intended.
        #
        # Matching is delegated to Regexp#global_match, which is not part
        # of core Ruby; presumably defined elsewhere in this project.
        #
        # @return [Hash{Integer => Integer, nil}, nil] position => frame,
        #   or nil when the query sequence holds no '/' or '\'
        def query_frameshifts
          return nil unless @query_sequence =~ /(?:\/|\\)/

          frame_for = { '/' => 1, '//' => 2, '\\' => -1, '\\\\' => -2 }
          positions = {}
          ungapped = @query_sequence.gsub(/[- ]/,'')

          /[\/\\]{1,2}/.global_match(ungapped) do |hit|
            offset = hit.begin(0) * 3 - 1
            position = if @query_from > @query_to
                         @query_from - offset
                       else
                         offset + @query_from
                       end
            positions[position] = frame_for[hit[0]]
          end

          positions
        end # end query_frameshifts
      end # end of MgNu::Parser::Blast::Hsp class
    end # end of MgNu::Parser::Blast class
  end # end of MgNu::Parser module
end # end of MgNu module
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module MgNu
  module Parser
    class Blast
      # One query sequence of a BLAST report together with the subjects
      # (hits) found for it and some details of the searched database.
      class Query
        attr_accessor :number, :query_id, :definition, :length, :sbjcts
        attr_accessor :database, :database_sequence_count, :database_total_letters

        # create a new Query object
        def initialize
          @number = nil
          @query_id = ""
          @definition = ""
          @length = nil
          @sbjcts = []
          @best_hit = nil
          @database = nil
          @database_sequence_count = 0
          @database_total_letters = 0
        end

        # Returns the subject with the lowest e-value, remembering it once
        # found. When several subjects tie, the earliest one wins.
        # Subjects that report no e-value (for example because they hold
        # no HSPs) cannot be ranked and are ignored instead of raising.
        #
        # @return [MgNu::Parser::Blast::Sbjct, nil] the best hit for this
        #   query, or nil when no subject has an e-value
        def best_hit
          return @best_hit unless @best_hit.nil?

          ranked = @sbjcts.reject { |sbjct| sbjct.evalue.nil? }
          @best_hit = ranked.min_by { |sbjct| sbjct.evalue }
        end
      end # end of MgNu::Parser::Blast::Query class
    end # end of MgNu::Parser::Blast class
  end # end of MgNu::Parser module
end # end of MgNu module
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module MgNu
  module Parser
    class Blast
      # One subject (database hit) of a BLAST query together with its
      # HSPs. The evalue, bit_score and identity readers report the
      # values of the best HSP.
      class Sbjct
        attr_accessor :hsps
        attr_accessor :number, :sbjct_id, :definition, :length
        attr_accessor :accession, :query

        # create a new Sbjct object
        def initialize
          @number = nil
          @sbjct_id = ""
          @definition = ""
          @length = nil
          @accession = ""
          @hsps = []
          @best_hsp = nil
          @query = nil
        end

        # @return [Object, nil] the e-value of the best HSP, or nil when
        #   there is no best HSP
        def evalue
          hsp = best_hsp
          hsp.nil? ? nil : hsp.evalue
        end # end of Sbjct#evalue

        # @return [Object, nil] the bit score of the best HSP, or nil when
        #   there is no best HSP
        def bit_score
          hsp = best_hsp
          hsp.nil? ? nil : hsp.bit_score
        end # end of Sbjct#bit_score

        # @return [Object, nil] the identity of the best HSP, or nil when
        #   there is no best HSP
        def identity
          hsp = best_hsp
          hsp.nil? ? nil : hsp.identity
        end # end of Sbjct#identity

        # Finds the HSP with the lowest e-value and remembers it. When
        # several HSPs tie, the earliest one wins. HSPs without an e-value
        # cannot be ranked and are ignored instead of raising.
        #
        # @return [MgNu::Parser::Blast::Hsp, nil] the best HSP, or nil
        #   when no HSP has an e-value
        def best_hsp
          @best_hsp ||= @hsps.reject { |hsp| hsp.evalue.nil? }
                             .min_by { |hsp| hsp.evalue }
        end # end of Sbjct#best_hsp
      end # end of MgNu::Parser::Blast::Sbjct class
    end # end of MgNu::Parser::Blast class
  end # end of MgNu::Parser module
end # end of MgNu module
|
@@ -0,0 +1,72 @@
|
|
1
|
+
module MgNu
  module Parser
    # ClustalW is the class used for parsing clustalw multiple alignment output.
    class ClustalW
      attr_accessor :buffer, :raw
      attr_reader :file, :alignment

      # Reads the alignment text and parses it immediately; the result is
      # available through #alignment.
      #
      # @param input [String] path of an alignment file (*.aln), or the
      #   alignment text itself when +file+ is false
      # @param file [Boolean] is +input+ a file name (default), or a
      #   string holding the alignment?
      # @raise [RuntimeError] when the file does not exist or is not
      #   readable, or when the text has no alignment section after the
      #   header
      def initialize(input = nil, file = true)
        if input
          @raw = file ? read_alignment_file(input) : input
          # sections are separated by blank lines; the first one is the
          # program header
          @buffer = @raw.split(/\r?\n\r?\n/)
          @alignment = nil

          # must be checked before parsing: #parse needs the header plus
          # at least one alignment section
          if @buffer.length < 2
            source = file ? "file #{input}" : "string"
            raise "ClustalW alignment #{source} did not parse!"
          end

          parse
        else
          # NOTE(review): `error` is not defined in this class; presumably
          # it is provided elsewhere in the project -- confirm.
          error("MgNu::Parser::ClustalW.new(): need an existing file")
        end
      end

      # Process the input multiple alignment. Runs only once; later calls
      # return the alignment that was already built.
      #
      # @return [MgNu::Alignment]
      def parse
        return @alignment unless @alignment.nil?

        @buffer.shift # discard the header section
        @buffer[0].gsub!(/^(\r?\n)+/, '') # drop newline at start of section
        @buffer.collect! { |section| section.split(/\r?\n/) }

        @buffer.each do |section|
          # drop numbers if the alignment was run with "-SEQNOS=on"
          section.each { |line| line.sub!(/\s+\d+\s*$/, '') }
          # the last line of every section is not a sequence line
          # (presumably the conservation/match line) and is discarded
          section.pop
        end

        # get the 1st position of a space from the right using
        # rindex. Increment this by 1 to get the seq_start
        seq_start = (@buffer[0][0].rindex(/\s/) || -1) + 1

        # collect seqname => sequence, and remember the order in which
        # the sequence names first appear
        order = []
        sequences = {}
        @buffer.each do |section|
          section.each do |line|
            name = line[0, seq_start].sub(/\s+\z/, '')
            residues = line[seq_start..-1]
            if sequences.key?(name)
              sequences[name] << residues
            else
              order << name
              sequences[name] = residues
            end
          end
        end

        @alignment = MgNu::Alignment.new(sequences, order)
      end # end of #parse method

      private

      # Returns the content of +path+, raising a descriptive error
      # instead of letting a nil @raw fail later.
      def read_alignment_file(path)
        unless File.exist?(path) && File.readable?(path)
          raise "ClustalW alignment file #{path} does not exist or is not readable"
        end

        File.read(path)
      end
    end # end of MgNu::Parser::ClustalW class
  end # end of MgNu::Parser module
end # end of MgNu module
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module MgNu
  module Parser
    # Reads a FASTA file record by record. Every line containing '>'
    # starts a new record whose header is the text after the '>'; all
    # other lines are collected as that record's sequence.
    class Fasta
      include Enumerable

      attr_reader :file, :filename

      # create a new Fasta parser
      #
      # @param filename [String] path of the file to read
      # @param quality_file [Boolean] when true the lines of a record are
      #   joined with a single space instead of being concatenated
      # @raise [RuntimeError] when +filename+ does not exist or is not
      #   readable
      def initialize(filename = nil, quality_file = false)
        @quality_file = quality_file
        @filename = filename
        if filename
          unless File.exist?(filename) && File.readable?(filename)
            raise "\n\n -- No file by that name (#{filename}). Exiting\n\n"
          end

          @file = File.open(filename)
        else
          # NOTE(review): `error` is not defined in this class; presumably
          # it is provided elsewhere in the project -- confirm.
          error("MgNu::Parser::Fasta.new(): need a filename or an existing file")
        end
      end

      # Yields one MgNu::Sequence::Fasta per record. The file is closed
      # when iteration ends, including when the caller breaks out early,
      # and is reopened on the next call, so the parser can be enumerated
      # more than once and every pass starts at the first record.
      #
      # @yieldparam record [MgNu::Sequence::Fasta]
      # @return [Enumerator] when no block is given
      def each
        return enum_for(:each) unless block_given?

        @file = File.open(@filename) if @file.closed?
        pending = [] # header followed by the lines of the current record

        begin
          @file.each_line do |line|
            line.chomp!
            if line =~ />(.*)/ # got a header line
              header = Regexp.last_match(1)
              yield build_record(pending) unless pending.empty?
              pending = [header]
            else # got a sequence line
              pending << line
            end
          end
        ensure
          @file.close
        end

        # don't forget to yield the last one
        yield build_record(pending) unless pending.empty?
      end # end of #each

      private

      # Turns the collected lines (header first) into a sequence object.
      def build_record(lines)
        header = lines.shift
        separator = @quality_file ? " " : ""
        MgNu::Sequence::Fasta.new(:header => header,
                                  :sequence => lines.join(separator))
      end
    end # end of MgNu::Parser::Fasta class
  end # end of MgNu::Parser module
end # end of MgNu module
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'moneta'
|
2
|
+
module MgNu
  module Parser
    # Key/value index from FASTA header name to header description, kept
    # in a Moneta store opened with the :TokyoCabinet adapter. The store
    # file is the FASTA file name plus ".hdr.tch", unless the given name
    # already ends that way.
    class FastaHeaderIndex
      attr_reader :filename, :db_name, :db

      # Opens the index for +filename+ and fills it from the FASTA file
      # when the store file was not there before it was opened.
      #
      # @param filename [String] FASTA file, or an existing *.hdr.tch file
      def initialize(filename)
        @filename = filename
        @db_name = (filename =~ /^.+\.hdr\.tch$/) ? @filename : @filename + ".hdr.tch"

        # must be decided before the store is opened
        needs_build = !File.exist?(@db_name)
        @db = Moneta.new(:TokyoCabinet, file: @db_name)
        parse if needs_build
      end

      # Stores header_name => header_description for every record of the
      # FASTA file.
      def parse
        MgNu::Parser::Fasta.new(@filename).each do |record|
          @db[record.header_name] = record.header_description
        end
      end # end of #parse

      # @param name [String] header name to look up
      # @return [Object, nil] the stored description, or nil when the
      #   store has no (truthy) value for +name+
      def [](name)
        description = @db[name]
        description ? description : nil
      end

      # Closes the underlying store, if there is one.
      def close
        return if @db.nil?

        @db.close
      end
    end # end of MgNu::Parser::FastaHeaderIndex class
  end # end of MgNu::Parser module
end # end of MgNu module
|