mgnu 2.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.yardopts +0 -0
- data/README.md +31 -0
- data/Rakefile +33 -0
- data/lib/mgnu.rb +9 -0
- data/lib/mgnu/alignment.rb +143 -0
- data/lib/mgnu/common.rb +68 -0
- data/lib/mgnu/genbank.rb +117 -0
- data/lib/mgnu/genbank/feature.rb +84 -0
- data/lib/mgnu/genbank/location.rb +150 -0
- data/lib/mgnu/genbank/qualifier.rb +45 -0
- data/lib/mgnu/genbank/reference.rb +114 -0
- data/lib/mgnu/genbank/source.rb +39 -0
- data/lib/mgnu/loggable.rb +61 -0
- data/lib/mgnu/parser.rb +50 -0
- data/lib/mgnu/parser/blast.rb +87 -0
- data/lib/mgnu/parser/blast/format0.rb +290 -0
- data/lib/mgnu/parser/blast/format7.rb +121 -0
- data/lib/mgnu/parser/blast/format8.rb +120 -0
- data/lib/mgnu/parser/blast/hsp.rb +75 -0
- data/lib/mgnu/parser/blast/query.rb +45 -0
- data/lib/mgnu/parser/blast/sbjct.rb +62 -0
- data/lib/mgnu/parser/clustalw.rb +72 -0
- data/lib/mgnu/parser/fasta.rb +61 -0
- data/lib/mgnu/parser/fasta_header_index.rb +39 -0
- data/lib/mgnu/parser/fasta_index.rb +57 -0
- data/lib/mgnu/parser/fastq.rb +61 -0
- data/lib/mgnu/parser/genbank.rb +187 -0
- data/lib/mgnu/parser/gff.rb +56 -0
- data/lib/mgnu/parser/iprscan/hit.rb +76 -0
- data/lib/mgnu/parser/iprscan_file.rb +39 -0
- data/lib/mgnu/parser/kegg_ontology_index.rb +163 -0
- data/lib/mgnu/parser/pilercr.rb +102 -0
- data/lib/mgnu/parser/prodigal.rb +170 -0
- data/lib/mgnu/parser/sam.rb +115 -0
- data/lib/mgnu/parser/sam/alignment.rb +22 -0
- data/lib/mgnu/parser/sam/header.rb +23 -0
- data/lib/mgnu/parser/sam/pair.rb +18 -0
- data/lib/mgnu/sequence.rb +207 -0
- data/lib/mgnu/sequence/fasta.rb +79 -0
- data/lib/mgnu/sequence/fastq.rb +43 -0
- data/lib/mgnu/version.rb +16 -0
- data/mgnu.gemspec +39 -0
- data/spec/mgnu/parser/blast_format0_spec.rb +114 -0
- data/spec/mgnu/parser/blast_format7_spec.rb +24 -0
- data/spec/mgnu/parser/blast_format8_spec.rb +26 -0
- data/spec/mgnu/parser/blast_multihsp_spec.rb +100 -0
- data/spec/mgnu/parser/blast_oof_spec.rb +53 -0
- data/spec/mgnu/parser/clustalw_spec.rb +90 -0
- data/spec/mgnu/parser/fasta_header_index_tc_parser_spec.rb +25 -0
- data/spec/mgnu/parser/fasta_index_tc_parser_spec.rb +25 -0
- data/spec/mgnu/parser/fasta_parser_spec.rb +53 -0
- data/spec/mgnu/parser_spec.rb +22 -0
- data/spec/mgnu/sequence/fasta_spec.rb +60 -0
- data/spec/mgnu/sequence/fastq_spec.rb +31 -0
- data/spec/mgnu/sequence_spec.rb +81 -0
- data/spec/mgnu_spec.rb +7 -0
- data/spec/spec_helper.rb +53 -0
- metadata +376 -0
@@ -0,0 +1,163 @@
|
|
1
|
+
module MgNu
|
2
|
+
module Parser
|
3
|
+
class KeggOntologyIndex
|
4
|
+
include TokyoCabinet
|
5
|
+
include Enumerable
|
6
|
+
|
7
|
+
attr_reader :filename, :db_name, :db
|
8
|
+
alias :ontologies :db
|
9
|
+
|
10
|
+
# create a new KeggOntologyIndex
|
11
|
+
# Create a new KeggOntologyIndex backed by a TokyoCabinet hash database.
#
# The index is stored next to the KEGG flat file as "<filename>.tch".
# When that index already exists and is readable it is opened read-only;
# otherwise it is created, opened for writing and filled by calling #parse.
#
# If the database cannot be opened, an error is written to $stderr and the
# process exits with status 1.
#
# @param filename [String] path to the KEGG ontology flat file
# @raise [RuntimeError] when +filename+ does not exist or is not readable
def initialize(filename="/work/blastdb/kegg/ko")
  @filename = filename
  @db_name = @filename + ".tch"
  @db = HDB.new

  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  unless File.exist?(@filename) && File.readable?(@filename)
    raise "\n\n ERROR -- No file by name (#{@filename}). Exiting.\n\n"
  end

  if File.exist?(@db_name) && File.readable?(@db_name)
    open_db_or_exit(HDB::OREADER) # open the database read-only
  else
    # create and open the database rw, then build the index
    open_db_or_exit(HDB::OWRITER | HDB::OCREAT | HDB::OLCKNB | HDB::OTSYNC)
    parse
  end
end

# Open @db_name with the given HDB mode flags; on failure report the
# database's own error message on $stderr and exit with status 1.
def open_db_or_exit(mode)
  return if @db.open(@db_name, mode)
  # The error code and message live on the database handle itself (@db).
  ecode = @db.ecode
  $stderr.puts "ERROR: could not open #{@db_name} (code: #{@db.errmsg(ecode)})"
  exit(1)
end
private :open_db_or_exit
|
35
|
+
|
36
|
+
# Iterate over every ontology stored in the index.
#
# Each stored value is passed through MgNu::Kegg::Ontology.from_json
# before being yielded.
#
# @yield [ontology] the object returned by MgNu::Kegg::Ontology.from_json
# @return [Enumerator] when called without a block
def each
  # Without this guard a block-less call would fail at the first yield.
  return enum_for(:each) unless block_given?
  @db.keys.each do |key|
    yield MgNu::Kegg::Ontology.from_json(@db[key])
  end
end
|
41
|
+
|
42
|
+
# Look up a single ontology by its key.
#
# @param k [String] key to look up in the database
# @return the object built by MgNu::Kegg::Ontology.from_json, or nil
#   (after printing a warning on $stderr) when the key is not stored
def [](k)
  json = @db[k]
  return MgNu::Kegg::Ontology.from_json(json) if json
  $stderr.puts "warning - #{k} wasn't in the file, ko is nil!"
  nil
end
|
51
|
+
|
52
|
+
# setup parse method for creating TC
|
53
|
+
# Read the KEGG flat file and store every record in the database.
#
# Lines are collected into a buffer until a line containing "///" is
# seen; the buffer is then handed to #parse_ko_buffer and the resulting
# object is stored as JSON under its kegg_id. Any lines left after the
# last "///" are treated as one final record.
def parse
  buffer = []
  # File.foreach closes the file when iteration ends (or raises), unlike
  # a bare File.new(...).each which leaves the handle open.
  File.foreach(@filename) do |line|
    line.chomp!
    if line =~ /\/\/\//
      ko = parse_ko_buffer(buffer)
      @db[ko.kegg_id] = ko.to_json
      buffer.clear
    else
      buffer << line
    end
  end

  # trailing record without a terminating "///"
  if buffer.length > 0
    ko = parse_ko_buffer(buffer)
    @db[ko.kegg_id] = ko.to_json
  end
end
|
71
|
+
|
72
|
+
# Build one MgNu::Kegg::Ontology from the lines of a single KEGG record.
#
# The buffer is consumed destructively: lines are shifted off the front
# until it is empty. ENTRY, NAME, DEFINITION, CLASS, DBLINKS and GENES
# sections are recognised; the multi-line sections keep consuming lines
# until the next recognised section keyword (GENES runs to the end).
#
# @param buffer [Array<String>] chomped lines of one record (emptied here)
# @return [MgNu::Kegg::Ontology] the populated ontology object
def parse_ko_buffer(buffer)
  ko = MgNu::Kegg::Ontology.new

  # Append one Dblink per non-empty name for the given database.
  add_dblinks = lambda do |database, names|
    names.each do |n|
      next if n == ""
      ko.dblinks << MgNu::Kegg::Ontology::Dblink.new(:name => n, :database => database)
    end
  end

  # Append one Gene per non-empty name for the given organism.
  add_genes = lambda do |org, names|
    names.each do |n|
      next if n == ""
      ko.genes << MgNu::Kegg::Ontology::Gene.new(:name => n, :organism => org)
    end
  end

  until buffer.empty?
    line = buffer.shift
    case line
    when /^ENTRY\s+(\S+)\s/
      ko.kegg_id = $1
    when /^NAME\s+(.+)/
      ko.name = $1
    when /^DEFINITION\s+(.+)/
      ko.definition = $1
      # continuation lines are appended verbatim until the next section
      until buffer.empty? || buffer.first =~ /^(?:CLASS|DBLINKS|GENES)/
        ko.definition += buffer.shift
      end
    when /^CLASS\s+(.+)/
      class_str = $1 + " "
      until buffer.empty? || buffer.first =~ /^(?:DBLINKS|GENES)/
        class_str += buffer.shift + " "
      end

      re = /\s*(.+?)\[PATH:(ko\d+)\]\s*/
      re.global_match(class_str) do |m|
        ko.classes << MgNu::Kegg::Ontology::KeggClass.new(:pathway => m[2], :description => m[1])
      end
      # no "[PATH:ko...]" entry found: keep the raw text as a single class
      if ko.classes.length == 0
        ko.classes << MgNu::Kegg::Ontology::KeggClass.new(:pathway => "unknown", :description => class_str)
      end
    when /^DBLINKS\s+(.+):\s(.+)/
      database = $1
      names = $2.split(/\s+/)
      until buffer.empty?
        dline = buffer.shift
        if dline =~ /^GENES\s+(.+):\s(.+)/
          # hand the GENES line back to the outer loop
          buffer.unshift(dline)
          break
        elsif dline =~ /\s+(.+):\s(.+)/ # new db
          next_database = $1
          next_names = $2.split(/\s+/)
          add_dblinks.call(database, names)
          database = next_database
          names = next_names
        else
          # continuation of the current database's name list
          names.concat(dline.split(/\s+/))
        end
      end
      add_dblinks.call(database, names)
    when /^GENES\s+(.+):\s(.+)/
      org = $1
      names = $2.split(/\s+/)
      until buffer.empty?
        gline = buffer.shift
        if gline =~ /\s+(.+):\s(.+)/ # new organism
          next_org = $1
          next_names = $2.split(/\s+/)
          add_genes.call(org, names)
          org = next_org
          names = next_names
        else
          # continuation of the current organism's gene list
          names.concat(gline.split(/\s+/))
        end
      end
      add_genes.call(org, names)
    end
  end

  ko # return the ko object
end
|
157
|
+
|
158
|
+
# Close the underlying database handle, if there is one.
#
# @return whatever @db.close returns, or nil when @db is nil
def close
  return if @db.nil?
  @db.close
end
|
161
|
+
end # end of MgNu::Parser::KeggOntologyIndex class
|
162
|
+
end # end of MgNu::Parser module
|
163
|
+
end # end of MgNu module
|
@@ -0,0 +1,102 @@
|
|
1
|
+
module MgNu
|
2
|
+
module Parser
|
3
|
+
class Pilercr
|
4
|
+
include Enumerable
|
5
|
+
|
6
|
+
attr_reader :file, :filename
|
7
|
+
|
8
|
+
# create a new Pilercr parser
|
9
|
+
# Create a new Pilercr parser and open the given file for reading.
#
# When no filename is given a message is written to $stderr and the
# process exits with status 1.
#
# @param filename [String, nil] path to a pilercr output file
# @raise [RuntimeError] when the file does not exist or is not readable
def initialize(filename = nil)
  @filename = filename

  unless filename
    $stderr.puts "MgNu::Parser::Pilercr.new(): need a filename or an existing file"
    exit(1)
  end

  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  unless File.exist?(filename) && File.readable?(filename)
    raise "\n\n -- No file by that name (#{filename}). Exiting\n\n"
  end

  @file = File.open(filename)
end
|
23
|
+
|
24
|
+
# override enumerables
|
25
|
+
# Iterate over the array records of the file (Enumerable hook).
#
# The file is read in chunks separated by the text "Array ". A chunk
# matching /^pilercr/ is the header: the number in front of "putative"
# is remembered as the expected record count and the chunk is skipped.
# Every later chunk is split into lines and passed to #process_buffer,
# and iteration stops once the expected number of records was yielded.
#
# @yield [record] the object returned by #process_buffer for each chunk
# @return [Enumerator] when called without a block
def each
  return enum_for(:each) unless block_given?

  # Start from the top so the parser can be iterated more than once;
  # otherwise a second pass would resume after the last yielded chunk.
  @file.rewind

  array_count = 0
  processed = 0
  @file.each_line("Array ") do |block|
    # processed all the blocks in this file
    break if array_count > 0 && processed == array_count

    if block =~ /^pilercr/
      array_count = $1.to_i if block =~ /(\d+) putative/
      next
    end

    yield process_buffer(block.split(/\n/))
    processed += 1
  end
end
|
45
|
+
|
46
|
+
# Turn the lines of one "Array" chunk into a MgNu::Pilercr object.
#
# Blank lines, all-digit lines, the column header, "Array" marker lines
# and "====" rulers are skipped. A line containing ">" supplies the
# header; every other line is split on whitespace and interpreted by its
# field count (4 = totals line, 6 or 7 = one repeat). Processing stops at
# the first line containing "SUMMARY". The buffer itself is not modified.
#
# @param buffer [Array<String>] lines of one chunk
# @return [MgNu::Pilercr] the populated record
def process_buffer(buffer)
  pilercr = MgNu::Pilercr.new

  buffer.each do |line|
    # Stop at the summary section instead of clearing the array that is
    # being iterated (which also emptied the caller's array).
    break if line =~ /SUMMARY/
    next if line =~ /^\s*$/
    next if line =~ /^\d+$/
    next if line =~ /^\s+Pos/
    next if line =~ /^Array\s*$/
    next if line =~ /^=+/

    if line =~ />(.+)/
      # first word is the name; the rest (or the name itself) the header
      name, *rest = $1.split(/\s+/)
      pilercr.header_name = name
      pilercr.header = rest.empty? ? name : rest.join(" ")
      next
    end

    temp = line.split(/\s+/)
    temp.shift # drop the empty string produced by the leading whitespace

    case temp.length
    when 4 # final line with repeat sequence
      pilercr.total_repeats = temp[0].to_i
      pilercr.repeat_length = temp[1].to_i
      pilercr.total_spacers = temp[2].to_i
      pilercr.repeat_sequence = temp[3]
    when 6 # line with unknown spacer length
      pilercr.repeats << MgNu::Pilercr::Repeat.new(:position => temp[0].to_i,
                                                   :length => temp[1].to_i,
                                                   :identity => temp[2].to_f,
                                                   :spacer_length => temp[3].to_i,
                                                   :match_line => temp[4],
                                                   :spacer => temp[5])
    when 7 # normal repeat line
      pilercr.repeats << MgNu::Pilercr::Repeat.new(:position => temp[0].to_i,
                                                   :length => temp[1].to_i,
                                                   :identity => temp[2].to_f,
                                                   :spacer_length => temp[3].to_i,
                                                   :left_flank => temp[4],
                                                   :match_line => temp[5],
                                                   :spacer => temp[6])
    else
      $stderr.puts "WARN: Unknown line format"
      $stderr.puts line
    end
  end

  pilercr
end
|
99
|
+
|
100
|
+
end # end of MgNu::Parser::Pilercr class
|
101
|
+
end # end of MgNu::Parser module
|
102
|
+
end # end of MgNu module
|
@@ -0,0 +1,170 @@
|
|
1
|
+
# this is a hack of MgNu::Parser::Genbank to deal specifically with
|
2
|
+
# prodigal's limited GFF support
|
3
|
+
require 'mgnu/genbank/feature'
|
4
|
+
require 'mgnu/genbank/location'
|
5
|
+
require 'mgnu/genbank/qualifier'
|
6
|
+
|
7
|
+
module MgNu
|
8
|
+
module Parser
|
9
|
+
class Prodigal
|
10
|
+
|
11
|
+
attr_reader :file
|
12
|
+
attr_accessor :name, :length, :definition, :features
|
13
|
+
|
14
|
+
include MgNu::Loggable
|
15
|
+
include MgNu::Parser
|
16
|
+
include Enumerable
|
17
|
+
|
18
|
+
# create a new prodigal parser
|
19
|
+
# Create a new prodigal parser and open the given file for reading.
#
# Problems are reported through #error before the exception is raised.
#
# @param filename [String, nil] path to the prodigal output file
# @param debug [Boolean] stored in @debug
# @raise [RuntimeError] when no filename is given, or when the file does
#   not exist or is not readable
def initialize(filename = nil,debug=false)
  @debug = debug

  unless filename
    error("MgNu::Parser::Prodigal.new(): need a filename")
    raise "no filename given!"
  end

  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  unless File.exist?(filename) && File.readable?(filename)
    error("MgNu::Parser::Prodigal.new(): problems with filename")
    raise "File doesn't exist or is not readable!"
  end

  @file = File.open(filename)
end
|
33
|
+
|
34
|
+
# Yield one parsed record per "//"-delimited chunk of the file.
#
# Chunks are fetched with parse_until(@file, /^\/\//, false) until an
# empty one comes back. A leading "//" line is dropped before the chunk
# is handed to #parse.
#
# @yield [record] the value returned by #parse for each chunk
def each
  loop do
    chunk = parse_until(@file,/^\/\//,false)
    break unless chunk.length > 0
    chunk.shift if chunk[0] =~ /^\/\//
    yield parse(chunk)
  end
end
|
42
|
+
|
43
|
+
# Parse the lines of one record into a MgNu::Parser::Prodigal::Sequence.
#
# The first line must be a DEFINITION line carrying the seqnum, seqlen,
# seqhdr, gc_cont and transl_table fields; it is removed from the buffer
# and the remaining lines are handed to Sequence#parse_features. If the
# first line is not in that form, the problem is printed on $stderr and
# the process exits with status 1.
#
# @param buffer [Array<String>, nil] lines of one record (first line is consumed)
# @return [MgNu::Parser::Prodigal::Sequence, nil] nil for a nil or empty buffer
def parse(buffer)
  return if buffer.nil? || buffer.length == 0

  unless buffer[0] =~ /^DEFINITION\s+(.+)$/
    $stderr.puts "ERROR: buffer didn't begin with DEFINITION"
    $stderr.puts buffer[0]
    exit(1)
  end
  pseq = MgNu::Parser::Prodigal::Sequence.new(:definition => $1)

  unless buffer[0] =~ /^DEFINITION\s+seqnum=(\d+);seqlen=(\d+);seqhdr="(.+)\s*";gc_cont=([0-9\.]+);transl_table=(\d+).*$/
    $stderr.puts "ERROR: unknown format for DEFINITION line"
    $stderr.puts buffer[0]
    exit(1)
  end
  seqnum, seqlen, seqhdr, gc_cont, transl_table = $1, $2, $3, $4, $5

  buffer.shift
  pseq.seqnum = seqnum.to_i
  pseq.length = seqlen.to_i
  pseq.seqhdr = seqhdr
  pseq.gc_cont = gc_cont.to_f
  pseq.transl_table = transl_table.to_i
  # the name is the first whitespace-separated word of the header
  pseq.name = pseq.seqhdr.split(/\s+/)[0]
  pseq.parse_features(buffer)
  pseq
end
|
70
|
+
|
71
|
+
# yielded from MgNu::Parser::Prodigal
|
72
|
+
# One record of prodigal output; yielded from MgNu::Parser::Prodigal.
class Sequence
  attr_accessor :name, :length, :definition, :features
  attr_accessor :seqnum, :seqhdr, :gc_cont, :transl_table

  # Build a sequence from an options hash. Every recognised key that is
  # absent defaults to an empty string; features start as an empty array.
  #
  # @param options [Hash] may contain :name, :length, :definition,
  #   :seqnum, :seqhdr, :gc_cont and :transl_table
  def initialize(options = {})
    [:name, :length, :definition, :seqnum, :seqhdr, :gc_cont, :transl_table].each do |key|
      instance_variable_set("@#{key}", options.fetch(key, ""))
    end
    @features = Array.new
  end

  # Parse the feature lines of a record and append the results to @features.
  #
  # A leading line starting with FEATURES is removed from the buffer first.
  # Each feature chunk is parsed with MgNu::Genbank::Feature.parse.
  #
  # @param buffer [Array<String>] lines following the DEFINITION line
  # @return [Array<String>] the feature chunks that were parsed
  def parse_features(buffer)
    buffer.shift if buffer[0] =~ /^FEATURES/
    chunks = split_at_features(buffer.join("\n"))
    chunks.each { |chunk| @features << MgNu::Genbank::Feature.parse(chunk) }
  end # end parse_features

  # Split a feature table into one string per feature. A new feature
  # starts at every line that has exactly five whitespace characters
  # followed by a non-whitespace character.
  #
  # @param str [String] the feature table as a single string
  # @return [Array<String>] one string per feature
  def split_at_features(str)
    marker = "\001"
    marked = str.gsub(/\n(\s{5}\S)/, "\n#{marker}\\1")
    marked.split(marker)
  end

  # Render the record: a DEFINITION line, a FEATURES line, one line per
  # feature (its #to_s) and a closing "//".
  #
  # @return [String]
  def to_s
    definition_line = "DEFINITION seqnum=#{@seqnum};seqlen=#{@length};seqhdr=\"#{@seqhdr}\";gc_cont=#{@gc_cont};transl_table=#{@transl_table}\n"
    feature_lines = @features.map { |f| "#{f.to_s}\n" }.join
    definition_line + "FEATURES Location/Qualifiers\n" + feature_lines + '//'
  end

end # end of MgNu::Parser::Prodigal::Sequence class
|
113
|
+
end # end of MgNu::Parser::Prodigal class
|
114
|
+
end # end of MgNu::Parser module
|
115
|
+
end # end of MgNu module
|
116
|
+
|
117
|
+
__END__
|
118
|
+
|
119
|
+
DEFINITION seqnum=1;seqlen=252779;seqhdr="cn_combo_scaffold_29 length_252779 read_count_231853";gc_cont=66.10;transl_table=11;uses_sd=1
|
120
|
+
FEATURES Location/Qualifiers
|
121
|
+
CDS complement(<2..85)
|
122
|
+
/note=";gc_cont=0.619;tscore=4.54;"
|
123
|
+
CDS 529..1245
|
124
|
+
/note=";gc_cont=0.646;tscore=4.54;"
|
125
|
+
CDS 1322..1747
|
126
|
+
/note=";gc_cont=0.688;tscore=4.54;"
|
127
|
+
|
128
|
+
|
129
|
+
def to_s
|
130
|
+
str = ""
|
131
|
+
str += ">Feature #{@name}\n"
|
132
|
+
@features.each do |f|
|
133
|
+
locstr = ""
|
134
|
+
if f.location.complement
|
135
|
+
if f.location.stop_continues
|
136
|
+
locstr += "<#{f.location.stop}\t"
|
137
|
+
else
|
138
|
+
locstr += "#{f.location.stop}\t"
|
139
|
+
end
|
140
|
+
if f.location.start_continues
|
141
|
+
locstr += ">#{f.location.start}\t"
|
142
|
+
else
|
143
|
+
locstr += "#{f.location.start}\t"
|
144
|
+
end
|
145
|
+
else
|
146
|
+
if f.location.start_continues
|
147
|
+
locstr += "<#{f.location.start}\t"
|
148
|
+
else
|
149
|
+
locstr += "#{f.location.start}\t"
|
150
|
+
end
|
151
|
+
if f.location.stop_continues
|
152
|
+
locstr += ">#{f.location.stop}\t"
|
153
|
+
else
|
154
|
+
locstr += "#{f.location.stop}\t"
|
155
|
+
end
|
156
|
+
end
|
157
|
+
str += "#{locstr}gene\n"
|
158
|
+
str += "\t\t\tgene\tgene#{count}\n"
|
159
|
+
f.qualifiers.sort.each do |qualifier,q|
|
160
|
+
str += "\t\t\t#{qualifier}\t#{q.value}\n"
|
161
|
+
end
|
162
|
+
str += "#{locstr}CDS\n"
|
163
|
+
f.qualifiers.sort.each do |qualifier,q|
|
164
|
+
str += "\t\t\t#{qualifier}\t#{q.value}\n"
|
165
|
+
end
|
166
|
+
str += "\t\t\tproduct\tgene_#{@number}p\n"
|
167
|
+
str += "\t\t\ttransl_table\t#{@transl_table}\n"
|
168
|
+
end # end of features.each
|
169
|
+
return str
|
170
|
+
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
require 'mgnu/parser/sam/header'
|
2
|
+
require 'mgnu/parser/sam/alignment'
|
3
|
+
|
4
|
+
module MgNu
|
5
|
+
module Parser
|
6
|
+
class Sam
|
7
|
+
include Enumerable
|
8
|
+
include MgNu::Loggable
|
9
|
+
include MgNu::Parser
|
10
|
+
|
11
|
+
FIELDS = [:name, :flag, :hit, :position, :quality, :cigar, :mate_ref,
|
12
|
+
:mate_pos, :distance, :sequence, :query_qual, :other]
|
13
|
+
|
14
|
+
attr_reader :file, :header #, :alignments
|
15
|
+
|
16
|
+
# create a new SAM file parser
|
17
|
+
# Create a new SAM file parser and open the given file for reading.
#
# When no filename is given the problem is reported through #error and
# the process exits with status 1.
#
# @param filename [String, nil] path to a SAM file
# @raise [RuntimeError] when the file does not exist or is not readable
def initialize(filename = nil)
  @header = nil
  # Filled by #process_header, keyed by reference name. A former
  # pre-scan of the alignments for reference names (egrep/cut/uniq over
  # the 3rd column) is disabled, so the hash starts out empty.
  @references = {}

  unless filename
    error "MgNu::Parser::Sam.new(): need a SAM file"
    exit(1)
  end

  # Fail here rather than leaving @file nil and crashing later in #each.
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  unless File.exist?(filename) && File.readable?(filename)
    error "MgNu::Parser::Sam.new(): problems with filename"
    raise "File doesn't exist or is not readable!"
  end

  @file = File.open(filename)
end
|
41
|
+
|
42
|
+
# override enumerables
|
43
|
+
# MgNu::Parser::Sam will emit a reference-object with every
|
44
|
+
# iteration. Iteration happens with file-reading.
|
45
|
+
# Iterate over the file and yield one MgNu::Parser::Sam::Pair per pair of
# alignments (Enumerable hook). Iteration happens with file-reading.
#
# Header lines (starting with "@") are collected and passed to
# #process_header when the first alignment line is reached; the result is
# stored in @header. Alignments for which matched_and_paired? is false
# are skipped. The first alignment seen for a basename is buffered; when
# the second one arrives the pair is yielded and the buffer entry removed.
#
# @yield [pair] a MgNu::Parser::Sam::Pair built from the basename and the
#   :first / :second alignments
# @return [Enumerator] when called without a block
def each
  return enum_for(:each) unless block_given?

  header_buffer = Array.new
  # short-term buffer hash: basename => { :first / :second => alignment }
  alignment_buffer = Hash.new

  @file.each do |line|
    next if line =~ /^\s*$/
    line.chomp!

    if line =~ /^@/
      header_buffer << line
      next
    end

    if header_buffer.length > 0
      @header = process_header(header_buffer)
      header_buffer.clear
    end

    # Hash[] takes the array of [field, value] pairs directly; splatting
    # it would turn the pairs themselves into keys and values.
    # TODO last field needs to be globbed into array
    alignment_attrs = Hash[FIELDS.zip(line.split("\t"))]
    alignment = MgNu::Parser::Sam::Alignment.new(alignment_attrs)
    next unless alignment.matched_and_paired?

    key = alignment.first_read? ? :first : :second
    basename = alignment.basename
    read_pair = alignment_buffer[basename]

    if !read_pair
      alignment_buffer[basename] = { key => alignment }
    else
      read_pair[key] = alignment
      # The pair is named after the basename shared by both reads.
      yield MgNu::Parser::Sam::Pair.new(basename, read_pair[:first], read_pair[:second])
      alignment_buffer.delete(basename)
    end
  end
end
|
76
|
+
|
77
|
+
# Build a MgNu::Parser::Sam::Header from the collected "@" header lines.
#
# @HD lines supply the VN and SO values of the header object. For every
# @SQ line the SN value is looked up in (or added to) the @references
# hash, and the LN value, when present, is stored on that reference as an
# Integer.
#
# NOTE(review): MgNu::Parser::Sam::Reference is not defined in the code
# visible here - confirm it is loaded before @SQ lines are processed.
#
# @param buffer [Array<String>] header lines (already chomped)
# @return [MgNu::Parser::Sam::Header]
def process_header(buffer)
  # the hash may not have been set up before the first header is seen
  @references ||= {}
  hdr = MgNu::Parser::Sam::Header.new

  buffer.each do |line|
    case line
    when /^@HD/
      # \S+ stops at the next tab and also matches a tag that ends the
      # line; the lines arrive chomped, so no trailing whitespace exists.
      hdr.vn = $1 if line =~ /\bVN:(\S+)/
      hdr.so = $1 if line =~ /\bSO:(\S+)/
    when /^@SQ/
      ref = nil
      if line =~ /\bSN:(\S+)/
        ref_name = $1
        if @references.has_key?(ref_name)
          ref = @references[ref_name]
        else
          $stderr.puts "WARNING: reference from header not found in alignments"
          # create a ref
          ref = MgNu::Parser::Sam::Reference.new(:name => ref_name)
          @references[ref_name] = ref
        end
      end

      if ref && line =~ /\bLN:(\d+)/
        ref.ln = $1.to_i
      end
    end
  end

  hdr
end
|
112
|
+
end # end of MgNu::Parser::Sam class
|
113
|
+
end # end of MgNu::Parser module
|
114
|
+
end # end of MgNu module
|
115
|
+
__END__
|