mgnu 2.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.yardopts +0 -0
- data/README.md +31 -0
- data/Rakefile +33 -0
- data/lib/mgnu.rb +9 -0
- data/lib/mgnu/alignment.rb +143 -0
- data/lib/mgnu/common.rb +68 -0
- data/lib/mgnu/genbank.rb +117 -0
- data/lib/mgnu/genbank/feature.rb +84 -0
- data/lib/mgnu/genbank/location.rb +150 -0
- data/lib/mgnu/genbank/qualifier.rb +45 -0
- data/lib/mgnu/genbank/reference.rb +114 -0
- data/lib/mgnu/genbank/source.rb +39 -0
- data/lib/mgnu/loggable.rb +61 -0
- data/lib/mgnu/parser.rb +50 -0
- data/lib/mgnu/parser/blast.rb +87 -0
- data/lib/mgnu/parser/blast/format0.rb +290 -0
- data/lib/mgnu/parser/blast/format7.rb +121 -0
- data/lib/mgnu/parser/blast/format8.rb +120 -0
- data/lib/mgnu/parser/blast/hsp.rb +75 -0
- data/lib/mgnu/parser/blast/query.rb +45 -0
- data/lib/mgnu/parser/blast/sbjct.rb +62 -0
- data/lib/mgnu/parser/clustalw.rb +72 -0
- data/lib/mgnu/parser/fasta.rb +61 -0
- data/lib/mgnu/parser/fasta_header_index.rb +39 -0
- data/lib/mgnu/parser/fasta_index.rb +57 -0
- data/lib/mgnu/parser/fastq.rb +61 -0
- data/lib/mgnu/parser/genbank.rb +187 -0
- data/lib/mgnu/parser/gff.rb +56 -0
- data/lib/mgnu/parser/iprscan/hit.rb +76 -0
- data/lib/mgnu/parser/iprscan_file.rb +39 -0
- data/lib/mgnu/parser/kegg_ontology_index.rb +163 -0
- data/lib/mgnu/parser/pilercr.rb +102 -0
- data/lib/mgnu/parser/prodigal.rb +170 -0
- data/lib/mgnu/parser/sam.rb +115 -0
- data/lib/mgnu/parser/sam/alignment.rb +22 -0
- data/lib/mgnu/parser/sam/header.rb +23 -0
- data/lib/mgnu/parser/sam/pair.rb +18 -0
- data/lib/mgnu/sequence.rb +207 -0
- data/lib/mgnu/sequence/fasta.rb +79 -0
- data/lib/mgnu/sequence/fastq.rb +43 -0
- data/lib/mgnu/version.rb +16 -0
- data/mgnu.gemspec +39 -0
- data/spec/mgnu/parser/blast_format0_spec.rb +114 -0
- data/spec/mgnu/parser/blast_format7_spec.rb +24 -0
- data/spec/mgnu/parser/blast_format8_spec.rb +26 -0
- data/spec/mgnu/parser/blast_multihsp_spec.rb +100 -0
- data/spec/mgnu/parser/blast_oof_spec.rb +53 -0
- data/spec/mgnu/parser/clustalw_spec.rb +90 -0
- data/spec/mgnu/parser/fasta_header_index_tc_parser_spec.rb +25 -0
- data/spec/mgnu/parser/fasta_index_tc_parser_spec.rb +25 -0
- data/spec/mgnu/parser/fasta_parser_spec.rb +53 -0
- data/spec/mgnu/parser_spec.rb +22 -0
- data/spec/mgnu/sequence/fasta_spec.rb +60 -0
- data/spec/mgnu/sequence/fastq_spec.rb +31 -0
- data/spec/mgnu/sequence_spec.rb +81 -0
- data/spec/mgnu_spec.rb +7 -0
- data/spec/spec_helper.rb +53 -0
- metadata +376 -0
@@ -0,0 +1,163 @@
|
|
1
|
+
module MgNu
  module Parser
    # Index of KEGG ontology (KO) records backed by a Tokyo Cabinet hash
    # database that lives next to the flat file as "<filename>.tch".
    #
    # When the database file does not exist yet it is created and filled by
    # parsing the flat file; otherwise the existing database is opened
    # read-only. Records are stored as JSON keyed by their KEGG id.
    class KeggOntologyIndex
      include TokyoCabinet
      include Enumerable

      attr_reader :filename, :db_name, :db
      alias :ontologies :db

      # create a new KeggOntologyIndex
      #
      # @param filename [String] path to the KEGG ontology flat file
      # @raise [RuntimeError] when the flat file is missing or unreadable
      def initialize(filename="/work/blastdb/kegg/ko")
        @filename = filename
        @db_name = @filename + ".tch"
        @db = HDB.new

        unless File.exist?(@filename) && File.readable?(@filename)
          raise "\n\n ERROR -- No file by name (#{@filename}). Exiting.\n\n"
        end

        if File.exist?(@db_name) && File.readable?(@db_name)
          open_db(HDB::OREADER) # open the database read-only
        else
          # create and open the database rw, then populate it
          open_db(HDB::OWRITER | HDB::OCREAT | HDB::OLCKNB | HDB::OTSYNC)
          parse
        end
      end

      # Yields every stored record, rebuilt from its JSON form.
      # Returns an Enumerator when called without a block.
      def each
        return enum_for(:each) unless block_given?

        @db.keys.each do |k|
          yield MgNu::Kegg::Ontology.from_json(@db[k])
        end
      end

      # Look up a single record by key.
      #
      # @param k [String] database key (the KEGG id used when storing)
      # @return the rebuilt record, or nil (with a warning on stderr) when
      #   the key is not in the database
      def [](k)
        json = @db[k] # single lookup instead of hitting the database twice
        unless json
          $stderr.puts "warning - #{k} wasn't in the file, ko is nil!"
          return nil
        end
        MgNu::Kegg::Ontology.from_json(json)
      end

      # setup parse method for creating TC
      #
      # Reads the flat file record by record (records end at a line
      # containing "///") and stores each one in the database.
      def parse
        buffer = []
        # File.foreach closes the handle when done (File.new(...).each did not)
        File.foreach(@filename) do |line|
          line.chomp!
          if line =~ /\/\/\//
            ko = parse_ko_buffer(buffer)
            @db[ko.kegg_id] = ko.to_json
            buffer.clear
          else
            buffer << line
          end
        end

        # A final record without a terminator is still stored, but a trailer
        # made only of blank lines is not a record and would have no id.
        unless buffer.all? { |l| l.strip.empty? }
          ko = parse_ko_buffer(buffer)
          @db[ko.kegg_id] = ko.to_json
        end
      end # end parse method

      # Build one record from the lines of a single flat-file entry.
      # The buffer is consumed (emptied) in the process.
      #
      # @param buffer [Array<String>] chomped lines of one entry
      # @return [MgNu::Kegg::Ontology]
      def parse_ko_buffer(buffer)
        ko = MgNu::Kegg::Ontology.new
        while buffer.length > 0
          line = buffer.shift
          if line =~ /^ENTRY\s+(\S+)\s/
            ko.kegg_id = $1
          elsif line =~ /^NAME\s+(.+)/
            ko.name = $1
          elsif line =~ /^DEFINITION\s+(.+)/
            ko.definition = $1
            # following lines belong to the definition until the next section
            while buffer.length > 0
              dline = buffer.shift
              if dline =~ /^(?:CLASS|DBLINKS|GENES)/
                buffer.unshift(dline)
                break
              else
                ko.definition += dline
              end
            end
          elsif line =~ /^CLASS\s+(.+)/
            class_str = $1 + " "
            while buffer.length > 0
              cline = buffer.shift
              if cline =~ /^(?:DBLINKS|GENES)/
                buffer.unshift(cline)
                break
              else
                class_str += cline + " "
              end
            end

            # NOTE(review): Regexp#global_match is not core Ruby; presumably
            # it is added elsewhere in this gem -- confirm.
            re = /\s*(.+?)\[PATH:(ko\d+)\]\s*/
            re.global_match(class_str) do |m|
              ko.classes << MgNu::Kegg::Ontology::KeggClass.new(:pathway => m[2], :description => m[1])
            end
            if ko.classes.length == 0
              ko.classes << MgNu::Kegg::Ontology::KeggClass.new(:pathway => "unknown", :description => class_str)
            end
          elsif line =~ /^DBLINKS\s+(.+):\s(.+)/
            database = $1
            names = $2.split(/\s+/)
            while buffer.length > 0
              dline = buffer.shift
              if dline =~ /^GENES\s+(.+):\s(.+)/
                buffer.unshift(dline)
                break
              elsif dline =~ /\s+(.+):\s(.+)/ # new db
                next_database = $1
                next_names = $2.split(/\s+/)
                add_dblinks(ko, database, names)
                database = next_database
                names = next_names
              else
                names.concat(dline.split(/\s+/))
              end
            end
            add_dblinks(ko, database, names)
          elsif line =~ /^GENES\s+(.+):\s(.+)/
            org = $1
            names = $2.split(/\s+/)
            while buffer.length > 0
              gline = buffer.shift
              if gline =~ /\s+(.+):\s(.+)/ # new organism
                next_org = $1
                next_names = $2.split(/\s+/)
                add_genes(ko, org, names)
                org = next_org
                names = next_names
              else
                names.concat(gline.split(/\s+/))
              end
            end
            add_genes(ko, org, names)
          end # end if /ENTRY/
        end # end buffer.each line
        ko # return the ko object
      end # end of #parse_ko_buffer(buffer)

      # Close the underlying database handle, if there is one.
      def close
        @db.close if @db
      end

      private

      # Open @db_name with the given HDB mode flags; report the Tokyo Cabinet
      # error message and exit(1) on failure.
      def open_db(mode)
        return if @db.open(@db_name, mode)

        $stderr.puts "ERROR: could not open #{@db_name} (code: #{@db.errmsg(@db.ecode)})"
        exit(1)
      end

      # Append one Dblink per non-empty name, all tagged with +database+.
      def add_dblinks(ko, database, names)
        names.each do |n|
          next if n == ""
          ko.dblinks << MgNu::Kegg::Ontology::Dblink.new(:name => n, :database => database)
        end
      end

      # Append one Gene per non-empty name, all tagged with organism +org+.
      def add_genes(ko, org, names)
        names.each do |n|
          next if n == ""
          ko.genes << MgNu::Kegg::Ontology::Gene.new(:name => n, :organism => org)
        end
      end
    end # end of MgNu::Parser::KeggOntologyIndex class
  end # end of MgNu::Parser module
end # end of MgNu module
|
@@ -0,0 +1,102 @@
|
|
1
|
+
module MgNu
  module Parser
    # Parser for pilercr report files. Iterating yields one MgNu::Pilercr
    # object per "Array " block found in the report.
    class Pilercr
      include Enumerable

      attr_reader :file, :filename

      # create a new Pilercr parser
      #
      # @param filename [String] path to a pilercr report
      # @raise [RuntimeError] when the file is missing or unreadable
      # Without a filename a message is printed to stderr and the process
      # exits with status 1.
      def initialize(filename = nil)
        @filename = filename
        unless filename
          $stderr.puts "MgNu::Parser::Pilercr.new(): need a filename or an existing file"
          exit(1)
        end

        unless File.exist?(filename) && File.readable?(filename)
          raise "\n\n -- No file by that name (#{filename}). Exiting\n\n"
        end
        @file = File.open(filename)
      end

      # override enumerables
      #
      # Reads the report in chunks separated by "Array ". The leading chunk
      # (the one with a line starting with "pilercr") only supplies the
      # number of putative arrays; every following chunk is parsed and
      # yielded until that many have been processed.
      # Returns an Enumerator when called without a block.
      def each
        return enum_for(:each) unless block_given?

        array_count = 0
        processed = 0
        @file.each_line("Array ") do |block|
          # processed all the blocks in this file
          break if array_count == processed && array_count > 0

          if block =~ /^pilercr/
            array_count = $1.to_i if block =~ /(\d+) putative/
            next
          end

          yield process_buffer(block.split(/\n/))
          processed += 1
        end
      end # end of File#each

      # Turn the lines of one array block into a MgNu::Pilercr object.
      # Parsing stops at the first line containing "SUMMARY".
      #
      # @param buffer [Array<String>] lines of one block
      # @return [MgNu::Pilercr]
      def process_buffer(buffer)
        pilercr = MgNu::Pilercr.new
        buffer.each do |line|
          # everything from the summary on is not part of this array
          # (break instead of clearing the array while iterating over it)
          break if line =~ /SUMMARY/

          next if line =~ /^\s*$/
          next if line =~ /^\d+$/
          next if line =~ /^\s+Pos/
          next if line =~ /^Array\s*$/
          next if line =~ /^=+/

          if line =~ />(.+)/
            temp = $1.split(/\s+/)
            if temp.length > 1
              pilercr.header_name = temp.shift
              pilercr.header = temp.join(" ")
            else
              pilercr.header_name = temp[0]
              pilercr.header = temp[0]
            end
            next
          end

          # whitespace split that ignores leading blanks, so a row without
          # leading whitespace no longer loses its first column
          temp = line.split
          case temp.length
          when 4 # final line with repeat sequence
            pilercr.total_repeats = temp[0].to_i
            pilercr.repeat_length = temp[1].to_i
            pilercr.total_spacers = temp[2].to_i
            pilercr.repeat_sequence = temp[3]
          when 6 # line with unknown spacer length
            pilercr.repeats << MgNu::Pilercr::Repeat.new(:position => temp[0].to_i,
                                                         :length => temp[1].to_i,
                                                         :identity => temp[2].to_f,
                                                         :spacer_length => temp[3].to_i,
                                                         :match_line => temp[4],
                                                         :spacer => temp[5])
          when 7 # normal repeat line
            pilercr.repeats << MgNu::Pilercr::Repeat.new(:position => temp[0].to_i,
                                                         :length => temp[1].to_i,
                                                         :identity => temp[2].to_f,
                                                         :spacer_length => temp[3].to_i,
                                                         :left_flank => temp[4],
                                                         :match_line => temp[5],
                                                         :spacer => temp[6])
          else
            $stderr.puts "WARN: Unknown line format"
            $stderr.puts line
          end
        end # end buffer.each do |line|
        pilercr
      end # end process_buffer method

    end # end of MgNu::Parser::Pilercr class
  end # end of MgNu::Parser module
end # end of MgNu module
|
@@ -0,0 +1,170 @@
|
|
1
|
+
# this is a hack of MgNu::Parser::Genbank to deal specifically with
|
2
|
+
# prodigal's limited GFF support
|
3
|
+
require 'mgnu/genbank/feature'
|
4
|
+
require 'mgnu/genbank/location'
|
5
|
+
require 'mgnu/genbank/qualifier'
|
6
|
+
|
7
|
+
module MgNu
  module Parser
    # Parser for prodigal's Genbank-like output. Iterating yields one
    # MgNu::Parser::Prodigal::Sequence per record; records are delimited by
    # lines starting with "//".
    class Prodigal

      attr_reader :file
      attr_accessor :name, :length, :definition, :features

      include MgNu::Loggable
      include MgNu::Parser
      include Enumerable

      # Full DEFINITION line as emitted by prodigal; captures seqnum, seqlen,
      # seqhdr, gc_cont and transl_table (anything after that is ignored).
      DEFINITION_LINE = /^DEFINITION\s+seqnum=(\d+);seqlen=(\d+);seqhdr="(.+)\s*";gc_cont=([0-9\.]+);transl_table=(\d+).*$/

      # create a new prodigal parser
      #
      # @param filename [String] path to the prodigal output file
      # @param debug [Boolean] stored in @debug
      # @raise [RuntimeError] when no filename is given, or the file is
      #   missing or unreadable
      def initialize(filename = nil,debug=false)
        @debug = debug
        unless filename
          error("MgNu::Parser::Prodigal.new(): need a filename")
          raise "no filename given!"
        end

        unless File.exist?(filename) && File.readable?(filename)
          error("MgNu::Parser::Prodigal.new(): problems with filename")
          raise "File doesn't exist or is not readable!"
        end
        @file = File.open(filename)
      end

      # Yields the parsed Sequence for every record in the file.
      # Returns an Enumerator when called without a block.
      def each
        return enum_for(:each) unless block_given?

        buffer = parse_until(@file, /^\/\//, false)
        while buffer.length > 0
          # drop the record separator if it is still at the front
          buffer.shift if buffer[0] =~ /^\/\//
          yield parse(buffer)
          buffer = parse_until(@file, /^\/\//, false)
        end
      end

      # Parse the lines of one record.
      #
      # @param buffer [Array<String>] lines of one record, DEFINITION first
      # @return [MgNu::Parser::Prodigal::Sequence, nil] nil for a nil or
      #   empty buffer
      # A record that does not start with a well-formed DEFINITION line is
      # reported on stderr and the process exits with status 1.
      def parse(buffer)
        return if buffer.nil? || buffer.empty?

        first = buffer[0]
        unless first =~ /^DEFINITION\s+(.+)$/
          $stderr.puts "ERROR: buffer didn't begin with DEFINITION"
          $stderr.puts first
          exit(1)
        end
        pseq = MgNu::Parser::Prodigal::Sequence.new(:definition => $1)

        fields = DEFINITION_LINE.match(first)
        unless fields
          $stderr.puts "ERROR: unknown format for DEFINITION line"
          $stderr.puts first
          exit(1)
        end

        buffer.shift
        pseq.seqnum = fields[1].to_i
        pseq.length = fields[2].to_i
        pseq.seqhdr = fields[3]
        pseq.gc_cont = fields[4].to_f
        pseq.transl_table = fields[5].to_i
        # the name is the first whitespace-delimited word of the header
        pseq.name = pseq.seqhdr.split(/\s+/)[0]
        pseq.parse_features(buffer)
        pseq
      end # end of def parse

      # yielded from MgNu::Parser::Prodigal
      class Sequence
        attr_accessor :name, :length, :definition, :features
        attr_accessor :seqnum, :seqhdr, :gc_cont, :transl_table

        # @param options [Hash] optional initial values for :name, :length,
        #   :definition, :seqnum, :seqhdr, :gc_cont and :transl_table;
        #   anything not given defaults to ""
        def initialize(options = {})
          @name = options.fetch(:name, "")
          @length = options.fetch(:length, "")
          @definition = options.fetch(:definition, "")
          @seqnum = options.fetch(:seqnum, "")
          @seqhdr = options.fetch(:seqhdr, "")
          @gc_cont = options.fetch(:gc_cont, "")
          @transl_table = options.fetch(:transl_table, "")
          @features = []
        end


        # Split the remaining record lines into feature chunks and append
        # the result of MgNu::Genbank::Feature.parse for each to @features.
        #
        # @param buffer [Array<String>] record lines after the DEFINITION
        def parse_features(buffer)
          buffer.shift if buffer[0] =~ /^FEATURES/
          all_features = split_at_features(buffer.join("\n"))

          all_features.each do |feature_str|
            @features << MgNu::Genbank::Feature.parse(feature_str)
          end
        end # end parse_features

        # Cut +str+ in front of every line that starts with exactly five
        # whitespace characters followed by a non-blank one.
        #
        # @return [Array<String>]
        def split_at_features(str)
          sep = "\001" # marker byte, inserted only to split on
          str.gsub(/\n(\s{5}\S)/, "\n#{sep}\\1").split(sep)
        end

        # Render the record back into prodigal's text format, ending in "//".
        def to_s
          str = "DEFINITION seqnum=#{@seqnum};seqlen=#{@length};seqhdr=\"#{@seqhdr}\";gc_cont=#{@gc_cont};transl_table=#{@transl_table}\n"
          str << "FEATURES Location/Qualifiers\n"
          @features.each do |f|
            str << "#{f.to_s}\n"
          end
          str << '//'
          str
        end

      end # end of MgNu::Parser::Prodigal::Sequence class
    end # end of MgNu::Parser::Prodigal class
  end # end of MgNu::Parser module
end # end of MgNu module
|
116
|
+
|
117
|
+
__END__
|
118
|
+
|
119
|
+
DEFINITION seqnum=1;seqlen=252779;seqhdr="cn_combo_scaffold_29 length_252779 read_count_231853";gc_cont=66.10;transl_table=11;uses_sd=1
|
120
|
+
FEATURES Location/Qualifiers
|
121
|
+
CDS complement(<2..85)
|
122
|
+
/note=";gc_cont=0.619;tscore=4.54;"
|
123
|
+
CDS 529..1245
|
124
|
+
/note=";gc_cont=0.646;tscore=4.54;"
|
125
|
+
CDS 1322..1747
|
126
|
+
/note=";gc_cont=0.688;tscore=4.54;"
|
127
|
+
|
128
|
+
|
129
|
+
def to_s
|
130
|
+
str = ""
|
131
|
+
str += ">Feature #{@name}\n"
|
132
|
+
@features.each do |f|
|
133
|
+
locstr = ""
|
134
|
+
if f.location.complement
|
135
|
+
if f.location.stop_continues
|
136
|
+
locstr += "<#{f.location.stop}\t"
|
137
|
+
else
|
138
|
+
locstr += "#{f.location.stop}\t"
|
139
|
+
end
|
140
|
+
if f.location.start_continues
|
141
|
+
locstr += ">#{f.location.start}\t"
|
142
|
+
else
|
143
|
+
locstr += "#{f.location.start}\t"
|
144
|
+
end
|
145
|
+
else
|
146
|
+
if f.location.start_continues
|
147
|
+
locstr += "<#{f.location.start}\t"
|
148
|
+
else
|
149
|
+
locstr += "#{f.location.start}\t"
|
150
|
+
end
|
151
|
+
if f.location.stop_continues
|
152
|
+
locstr += ">#{f.location.stop}\t"
|
153
|
+
else
|
154
|
+
locstr += "#{f.location.stop}\t"
|
155
|
+
end
|
156
|
+
end
|
157
|
+
str += "#{locstr}gene\n"
|
158
|
+
str += "\t\t\tgene\tgene#{count}\n"
|
159
|
+
f.qualifiers.sort.each do |qualifier,q|
|
160
|
+
str += "\t\t\t#{qualifier}\t#{q.value}\n"
|
161
|
+
end
|
162
|
+
str += "#{locstr}CDS\n"
|
163
|
+
f.qualifiers.sort.each do |qualifier,q|
|
164
|
+
str += "\t\t\t#{qualifier}\t#{q.value}\n"
|
165
|
+
end
|
166
|
+
str += "\t\t\tproduct\tgene_#{@number}p\n"
|
167
|
+
str += "\t\t\ttransl_table\t#{@transl_table}\n"
|
168
|
+
end # end of features.each
|
169
|
+
return str
|
170
|
+
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
require 'mgnu/parser/sam/header'
|
2
|
+
require 'mgnu/parser/sam/alignment'
|
3
|
+
|
4
|
+
module MgNu
  module Parser
    # Parser for SAM alignment files. Iterating yields one
    # MgNu::Parser::Sam::Pair for every two alignments that share a basename
    # and both report matched_and_paired?.
    class Sam
      include Enumerable
      include MgNu::Loggable
      include MgNu::Parser

      # Names given, in order, to the tab-separated columns of an alignment
      # line.
      FIELDS = [:name, :flag, :hit, :position, :quality, :cigar, :mate_ref,
                :mate_pos, :distance, :sequence, :query_qual, :other]

      attr_reader :file, :header #, :alignments

      # create a new SAM file parser
      #
      # @param filename [String] path to the SAM file
      # @raise [RuntimeError] when the file is missing or unreadable
      #   (previously @file was silently left nil and #each failed later)
      # Without a filename an error is logged and the process exits with
      # status 1.
      def initialize(filename = nil)
        @header = nil
        # references seen in @SQ header lines, keyed by name; the egrep
        # pre-scan of the alignments that used to fill this is disabled
        @references = {}

        unless filename
          error "MgNu::Parser::Sam.new(): need a SAM file"
          exit(1)
        end

        unless File.exist?(filename) && File.readable?(filename)
          error "MgNu::Parser::Sam.new(): problems with filename"
          raise "File doesn't exist or is not readable!"
        end
        @file = File.open(filename)
      end

      # override enumerables
      # MgNu::Parser::Sam will emit a pair-object for every completed read
      # pair. Iteration happens with file-reading. Header lines ("@...") are
      # collected and turned into @header before the first alignment is
      # handled. Returns an Enumerator when called without a block.
      def each
        return enum_for(:each) unless block_given?

        header_buffer = []
        # short-term buffer hash: basename => { :first/:second => alignment }
        alignment_buffer = {}

        @file.each do |line|
          next if line =~ /^\s*$/
          line.chomp!

          if line =~ /^@/
            header_buffer << line
            next
          end

          unless header_buffer.empty?
            @header = process_header(header_buffer)
            header_buffer.clear
          end

          # pair each field name with its column (no splat: Hash[*pairs]
          # would treat the pairs themselves as alternating keys and values)
          alignment_attrs = Hash[FIELDS.zip(line.split("\t"))]
          # TODO last field needs to be globbed into array
          alignment = MgNu::Parser::Sam::Alignment.new(alignment_attrs)
          next unless alignment.matched_and_paired?

          key = alignment.first_read? ? :first : :second
          basename = alignment.basename
          read_pair = alignment_buffer[basename]
          if read_pair.nil?
            # first mate seen: park it until the other one shows up
            alignment_buffer[basename] = { key => alignment }
          else
            read_pair[key] = alignment
            # NOTE(review): the original passed an undefined `name` here;
            # the shared basename is presumably what was meant -- confirm
            # against Pair#initialize.
            yield MgNu::Parser::Sam::Pair.new(basename, read_pair[:first], read_pair[:second])
            alignment_buffer.delete(basename)
          end
        end

        # a file made of header lines only still gets its header parsed
        @header = process_header(header_buffer) unless header_buffer.empty?
      end

      # Build a Header from the collected "@" lines. @HD lines supply vn and
      # so; @SQ lines create or update entries in @references.
      #
      # @param buffer [Array<String>] chomped header lines
      # @return [MgNu::Parser::Sam::Header]
      def process_header(buffer)
        hdr = MgNu::Parser::Sam::Header.new
        @references ||= {}
        buffer.each do |line|
          case line
          when /^@HD/
            # \S+ keeps each value to its own field and also matches the
            # last field of the (already chomped) line
            hdr.vn = $1 if line =~ /VN:(\S+)/
            hdr.so = $1 if line =~ /SO:(\S+)/
          when /^@SQ/
            ref = nil
            if line =~ /SN:(\S+)/
              ref_name = $1
              ref = @references[ref_name]
              unless ref
                $stderr.puts "WARNING: reference from header not found in alignments"
                # create a ref
                # NOTE(review): Sam::Reference is not defined in the files
                # required by this one -- confirm it is loaded elsewhere.
                ref = MgNu::Parser::Sam::Reference.new(:name => ref_name)
                @references[ref_name] = ref
              end
            end

            ref.ln = $1.to_i if ref && line =~ /LN:(\d+)/
          end
        end
        hdr
      end # end process_header_line
    end # end of MgNu::Parser::Sam class
  end # end of MgNu::Parser module
end # end of MgNu module
|
115
|
+
__END__
|