bacterial-annotator 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/ba_blat +60 -0
- data/bin/ba_prodigal +45 -0
- data/bin/bacterial-annotator +131 -0
- data/lib/bacterial-annotator.rb +406 -0
- data/lib/bacterial-annotator/fasta-manip.rb +166 -0
- data/lib/bacterial-annotator/genbank-manip.rb +208 -0
- data/lib/bacterial-annotator/remote-ncbi.rb +200 -0
- data/lib/bacterial-annotator/synteny-manip.rb +188 -0
- metadata +96 -0
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# author: maxime déraspe
|
|
3
|
+
# email: maxime@deraspe.net
|
|
4
|
+
# review:
|
|
5
|
+
# date: 15-02-24
|
|
6
|
+
# version: 0.0.1
|
|
7
|
+
# licence:
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Wraps the FASTA file to annotate: runs Prodigal on it and splits both the
# input FASTA and Prodigal's multi-record GenBank output into one file per
# contig.
class FastaManip

  attr_reader :fasta_flat, :fasta_file, :prodigal_files

  # Initialize fasta holder.
  #
  # fasta_file - path given to Bio::FlatFile.auto
  # meta       - when truthy, run_prodigal adds "-p meta" to the Prodigal call
  #
  # Aborts the process when the file is not detected as Bio::FastaFormat.
  def initialize(fasta_file, meta)
    @fasta_file = fasta_file
    @fasta_flat = Bio::FlatFile.auto(@fasta_file)
    @meta = meta
    @prodigal_files = nil
    @single_fasta = nil

    if @fasta_flat.dbclass != Bio::FastaFormat
      abort "Aborting : The input sequence is not a fasta file !"
    end
  end


  # Run prodigal on the genome to annotate.
  #
  # root   - directory that contains the "prodigal.linux" executable
  # outdir - output directory (created when missing)
  #
  # RETURN : the @prodigal_files hash (output paths, contig names/lengths and
  #          protein ids grouped by contig)
  def run_prodigal(root, outdir)
    Dir.mkdir("#{outdir}") unless Dir.exist?("#{outdir}")

    # Argument-vector form of Kernel#system: no shell is involved, so paths
    # containing spaces or shell metacharacters are passed through verbatim.
    cmd = ["#{root}/prodigal.linux"]
    cmd.push("-p", "meta") if @meta
    cmd.push("-i", "#{@fasta_file}",
             "-a", "#{outdir}/Proteins.fa",
             "-d", "#{outdir}/Genes.fa",
             "-o", "#{outdir}/Genbanks.gbk",
             "-q")
    system(*cmd)

    @prodigal_files = {multiGBK: "#{outdir}/Genbanks.gbk",
                       contigs: [],
                       contigs_length: [],
                       genes: "#{outdir}/Genes.fa",
                       proteins: "#{outdir}/Proteins.fa",
                       prot_ids_by_contig: {},
                       fasta_path: "#{outdir}/single-fasta/",
                       gbk_path: "#{outdir}/single-genbank/"}
    split_fasta outdir
    split_genbank outdir, "#{outdir}/Genbanks.gbk"
    extract_cds_names
    @prodigal_files
  end


  # Split the multi fasta file into one fasta file per entry, written to
  # "#{outdir}/single-fasta/<name>.fasta" where <name> is the first
  # whitespace-delimited word of the entry definition.
  #
  # Also records each contig name and length in @prodigal_files and keeps the
  # entries in @single_fasta (keyed by name) for split_genbank.
  # Expects @prodigal_files to already hold the :contigs and :contigs_length
  # arrays (run_prodigal sets them up).
  def split_fasta(outdir)
    @single_fasta = {}
    fasta_dir = "#{outdir}/single-fasta"
    Dir.mkdir(fasta_dir) unless Dir.exist?(fasta_dir)

    @fasta_flat.each_entry do |seq|
      file_name = seq.definition.chomp.split(" ")[0]
      @prodigal_files[:contigs] << "#{file_name}"
      @prodigal_files[:contigs_length] << seq.seq.length
      File.open("#{fasta_dir}/#{file_name}.fasta", "w") do |fwrite|
        fwrite.write(seq)
      end
      @single_fasta[file_name] = seq
    end
  end


  # Split Multi Genbanks file into "#{outdir}/single-genbank/<contig>.gbk".
  #
  # A record starts at a "DEFINITION" line (the contig name is read from its
  # seqhdr field) and ends at a "//" line. Each output file gets a generated
  # LOCUS line first and the contig sequence (ORIGIN section) last.
  # Requires split_fasta to have filled @single_fasta beforehand.
  #
  # Lines found outside any record are skipped, and the output handle is
  # always closed, even when a record is not terminated or an error occurs.
  def split_genbank(outdir, multigbk)
    gbk_dir = "#{outdir}/single-genbank"
    Dir.mkdir(gbk_dir) unless Dir.exist?(gbk_dir)

    out = nil
    outseq = nil
    begin
      File.foreach(multigbk) do |l|
        if l[0..9] == "DEFINITION"
          out.close if out # previous record had no "//" terminator
          file_name = l.chomp.split(";")[2].gsub("seqhdr","").delete("\"").delete("=").split(" ")[0]
          outseq, seq_length = print_sequence_for_gbk @single_fasta[file_name]
          spacer = " " * (20-seq_length.to_s.length)
          # dd-Mon-yyyy, e.g. 24-Feb-2015 (same layout as before, without
          # depending on the 'date' library being loaded)
          stamp = Time.now.strftime("%d-%b-%Y")
          locus = "LOCUS #{file_name}#{spacer}#{seq_length} bp DNA linear BCT #{stamp}\n"
          out = File.open("#{gbk_dir}/#{file_name}.gbk", "w")
          out.write(locus)
          out.write(l)
        elsif out.nil?
          next # nothing is open: line sits before the first or after a closed record
        elsif l[0..1] == "//"
          out.write(outseq)
          out.close
          out = nil
        else
          out.write(l)
        end
      end
    ensure
      out.close if out
    end
  end


  # Utility function to print the sequence to the end of a gbk file.
  #
  # seq - object whose #seq returns the nucleotide string
  #
  # RETURN : [text, length] where text is "ORIGIN\n", then one line per 60
  #          bases (1-based start position, bases in groups of 10), then a
  #          closing "//" without trailing newline; length is the number of
  #          bases. An empty sequence still yields one (empty) numbered line.
  def print_sequence_for_gbk(seq)
    sequence = seq.seq.downcase
    total = sequence.length

    offsets = total.zero? ? [0] : (0...total).step(60)
    # Lines are collected and joined once instead of growing a string with
    # += for every line, which re-copies the whole text each time.
    lines = offsets.map do |offset|
      chunk = sequence[offset, 60]
      ("%9s " % (offset + 1)) + chunk.scan(/.{1,10}/).join(" ") + "\n"
    end

    outseq = "ORIGIN\n" + lines.join + "//"
    return outseq, total
  end


  # Extract protein names from Prodigal's protein fasta and store them in
  # @prodigal_files[:prot_ids_by_contig], grouped by contig (the id minus its
  # last "_<n>" part) and sorted by the numeric value of that last part.
  def extract_cds_names
    prot_ids = {}
    flatfile = Bio::FlatFile.auto(@prodigal_files[:proteins])
    flatfile.each_entry do |entry|
      prot_id = entry.definition.split(" ")[0]
      contig = prot_id.split("_")[0..-2].join("_")
      (prot_ids[contig] ||= []) << prot_id
    end

    prot_ids.each_value do |prot_array|
      prot_array.sort! { |a,b| a.split("_")[-1].to_i <=> b.split("_")[-1].to_i }
    end

    @prodigal_files[:prot_ids_by_contig] = prot_ids
  end


  private :extract_cds_names # :split_fasta, :split_genbank

end
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# author: maxime déraspe
|
|
3
|
+
# email: maxime@deraspe.net
|
|
4
|
+
# review:
|
|
5
|
+
# date: 15-02-24
|
|
6
|
+
# version: 0.0.1
|
|
7
|
+
# licence:
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Wraps one GenBank record: extracts its CDS/protein information, writes the
# proteins to a fasta file and adds annotations to Prodigal-produced records.
class GenbankManip

  attr_accessor :gbk, :coding_seq, :cds_file

  # Initialize the genbank file.
  #
  # gbk_file - path to a GenBank file; when no such file exists the value is
  #            passed to fetch_ncbi_genome as an identifier and the record is
  #            stored as "#{outdir}/#{gbk_file}.gbk"
  # outdir   - directory receiving a fetched record
  #
  # Aborts the process when the file is not detected as Bio::GenBank.
  # Only the first entry of the file is loaded.
  def initialize(gbk_file, outdir)
    @gbk_file = gbk_file
    # File.exist? : the File.exists? alias no longer exists in Ruby >= 3.2
    unless File.exist?(@gbk_file)
      fetch_ncbi_genome(@gbk_file, outdir)
      @gbk_file = "#{outdir}/#{gbk_file}.gbk"
    end

    flat_gbk = Bio::FlatFile.auto(@gbk_file)

    # Check if gbk is valid
    if flat_gbk.dbclass != Bio::GenBank
      abort "Aborting : The input #{@gbk_file} is not a valid genbank file !"
    end

    @gbk = flat_gbk.next_entry
    @bioseq = @gbk.to_biosequence
  end


  # Prepare CDS/proteins (computed once, then cached in @coding_seq).
  #
  # RETURN : hash keyed by protein_id ("" when the CDS has none) with
  #          :location, :locustag, :gene, :product and :bioseq (the
  #          translation wrapped by Bio::Sequence.auto). CDS carrying a
  #          "pseudo" qualifier are skipped.
  def get_cds
    return @coding_seq unless @coding_seq.nil?

    @coding_seq = {}

    # Iterate over each CDS
    @gbk.each_cds do |ft|
      ftH = ft.to_hash
      next if ftH.has_key? "pseudo"

      # Qualifier values are read as arrays; only the first value is kept.
      protId = ftH["protein_id"].nil? ? "" : ftH["protein_id"][0]
      locustag = ftH["locus_tag"].nil? ? nil : ftH["locus_tag"][0]
      pep = (ftH["translation"] || [""])[0]

      @coding_seq[protId] = {location: ft.locations,
                             locustag: locustag,
                             gene: (ftH["gene"] || [])[0],
                             product: (ftH["product"] || [])[0],
                             bioseq: Bio::Sequence.auto(pep) }
    end

    @coding_seq
  end


  # Print CDS to files
  # Writes every protein of get_cds as fasta (60 columns) to
  # "#{outdir}/<accession>.pep".
  # RETURN : cds_file path
  def write_cds_to_file(outdir)
    cds_file = "#{@gbk.accession}.pep"
    get_cds if @coding_seq.nil?

    File.open("#{outdir}/#{cds_file}", "w") do |fwrite|
      @coding_seq.each do |prot_id, cds|
        fwrite.write(cds[:bioseq].output_fasta("#{prot_id}",60))
      end
    end

    @cds_file = "#{outdir}/" + cds_file
  end


  # Add annotation to a genbank file produced by prodigal and write the
  # result to "#{outdir}/<contig>.gbk".
  #
  # annotations - hash keyed by "<contig>_<n>" (n = 1-based rank of the CDS
  #               in the record); values may carry :gene, :product,
  #               :locustag and :note
  # outdir      - output directory
  # mode        - 0 : existing qualifiers of each CDS are dropped first;
  #               anything else : new qualifiers are appended to them
  #
  # The contig name is read from the seqhdr field of the record definition.
  def add_annotation(annotations, outdir, mode)
    contig = @gbk.definition.split(";")[2].
      gsub("seqhdr","").
      delete("\"").
      delete("=").
      split(" ")[0]

    i = 0
    @gbk.features.each do |cds|
      next if cds.feature != "CDS"

      ftArray = mode == 0 ? [] : cds.qualifiers

      i += 1
      prot_id = contig+"_"+i.to_s
      hit = annotations.has_key?(prot_id) ? annotations[prot_id] : nil

      unless hit.nil?
        locus = hit[:locustag]
        additions = [['gene', hit[:gene]],
                     ['product', hit[:product]],
                     ['note', locus.nil? ? nil : "correspond to #{locus} locus"],
                     ['note', hit[:note]]]
        additions.each do |name, value|
          ftArray.push(Bio::Feature::Qualifier.new(name, value)) unless value.nil?
        end
      end

      cds.qualifiers = ftArray
    end

    File.open("#{outdir}/#{contig}.gbk", "w") do |f|
      f.write(@gbk.to_biosequence.output(:genbank))
    end

    # Bioruby doesn't support gff at this point
    # File.open("#{outdir}/#{contig}.gff", "w") do |f|
    #   f.write(@gbk.to_biosequence.output(:gff))
    # end
  end


  ###################
  # Private Methods #
  ###################

  # Fct: Get dna sequence
  # Takes the subsequence of seq covered by the first location of cds and
  # reverse-complements it when that location is on strand -1.
  def get_DNA(cds, seq)
    first_loc = cds.locations[0]
    start_pos = first_loc.from.to_i
    end_pos = first_loc.to.to_i
    fasta = Bio::Sequence::NA.new(seq.subseq(start_pos, end_pos))
    fasta.reverse_complement! if first_loc.strand == -1
    Bio::Sequence.auto(fasta)
  end


  # Fetch genbank genome from NCBI (efetch on the nucleotide db) and save it
  # as "#{outdir}/#{refgenome_id}.gbk".
  def fetch_ncbi_genome(refgenome_id, outdir)
    Bio::NCBI.default_email = 'default@default.com'
    ncbi = Bio::NCBI::REST.new
    genbankstring = ncbi.efetch(refgenome_id, {"db"=>'nucleotide', "rettype"=>'gb'})
    File.open("#{outdir}/#{refgenome_id}.gbk", "w") do |f|
      f.write(genbankstring)
    end
  end

  private :fetch_ncbi_genome, :get_DNA


end # end of Class
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# author: maxime déraspe
|
|
3
|
+
# email: maxime@deraspe.net
|
|
4
|
+
# review:
|
|
5
|
+
# date: 15-02-24
|
|
6
|
+
# version: 0.0.1
|
|
7
|
+
# licence:
|
|
8
|
+
|
|
9
|
+
require 'mechanize'
|
|
10
|
+
require 'open-uri'
|
|
11
|
+
require 'bio'
|
|
12
|
+
|
|
13
|
+
# Submits a protein fasta file to the NCBI BLAST web form, polls for the XML
# result, saves it, and extracts the best hit of each query.
class RemoteNCBI

  attr_reader :aln_hits, :db, :xmloutput

  # NCBI BLAST entry point (served over HTTPS).
  BLAST_CGI = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'.freeze

  # initialize stuff for a remote ncbi run
  #
  # db        - "swissprot", "refseq_protein" or "nr"; anything else is
  #             stored as "bad database"
  # seq_file  - path of the fasta file to submit
  # outfile   - path where the XML result is written
  # pidentity - minimal percent identity kept by extract_blast_results
  #
  # The search is submitted and polled from here, so this call blocks until
  # the result is retrieved or the run is given up.
  def initialize(db, seq_file, outfile, pidentity)
    if ! ["swissprot", "refseq_protein", "nr"].include? db
      @db = "bad database"
    else
      @db = db
    end

    url = BLAST_CGI +
          '?PROGRAM=blastp&BLAST_PROGRAMS=blastp'\
          '&PAGE_TYPE=BlastSearch&SHOW_DEFAULTS=on'\
          '&LINK_LOC=blasthome'

    @seq_file = seq_file
    @outfile = outfile
    @resultURI = submit_blast url
    @pidentity = pidentity

    if @resultURI != ""
      @xmloutput = ""
      @valid = validate_output
    else
      @valid = false
    end
  end # end of method


  # submit blast to ncbi
  #
  # Fills the 'searchForm' form of the page at ncbiURL and reads the request
  # id from the table cell following the "Request ID" cell. Every failed
  # attempt (exception or no request id found) counts toward the limit and is
  # followed by a 3 s pause.
  #
  # RETURN : URI of the XML result, or "" when no request id was obtained
  def submit_blast(ncbiURL)
    f = @seq_file.split("/")[-1]
    seq_fasta = File.read(@seq_file)

    a = Mechanize.new { |agent|
      agent.user_agent_alias = 'Linux Firefox'
      agent.ignore_bad_chunking = true
    }

    requestID = ""
    try = 1

    while requestID == "" && try < 12
      begin
        a.get(ncbiURL) do |page|
          search = page.form_with(:name => 'searchForm') { |form|
            form.textareas[0].value = seq_fasta
            form.field_with(:name => 'DATABASE').value = @db
            form.field_with(:name => 'MAX_NUM_SEQ').value = 40
          }.submit
          requestID = request_id_from(search)
        end
      rescue StandardError
        requestID = "" # best effort: any failure is simply retried below
      end

      break unless requestID == ""

      try += 1
      puts "#{try} POST try for #{f}"
      sleep 3
    end

    # Lets the caller (see initialize) detect a failed submission.
    return "" if requestID == ""

    uri_parsed = "#{BLAST_CGI}?CMD=Get&RID=#{requestID}"
    puts "NCBI Blast for #{f}: #{uri_parsed}"

    return URI.parse("#{BLAST_CGI}?RESULTS_FILE=on&RID=#{requestID}&FORMAT_TYPE=XML&FORMAT_OBJECT=Alignment&CMD=Get")
  end # end of method


  # validate the xml blast results
  #
  # Polls @resultURI until the first line of the body contains an XML
  # declaration; the body is then written to @outfile and kept in
  # @xmloutput. While a "Status=WAITING" line is present the method sleeps
  # (see poll_interval) and asks again; there is no upper bound on the
  # number of polls.
  #
  # RETURN : true when the XML was saved, false for any other answer
  def validate_output
    loop do
      response = Net::HTTP.get_response(@resultURI)
      body = response.body.split("\n")

      # "\?" : the question mark is literal, not a quantifier on "<"
      if body[0] =~ /<\?xml version=/
        xml = body.join("\n")
        File.open("#{@outfile}", "w") do |f|
          f.write(xml)
        end
        @xmloutput = xml
        return true
      end

      waiting = body.any? do |l|
        l =~ /Status=/ && l.strip.gsub("Status=", "") == "WAITING"
      end
      return false unless waiting

      # Only pause when another poll follows.
      sleep(poll_interval)
    end
  end # end of method


  # extract blast results from the saved XML file
  #
  # Fills @aln_hits with, for each query (first word of its definition), the
  # first hit whose identity / target length * 100 reaches @pidentity.
  # @aln_hits is nil when no valid result was retrieved.
  def extract_blast_results
    if !@valid
      @aln_hits = nil
      return
    end

    flat = Bio::FlatFile.auto("#{@outfile}")
    @aln_hits = {}

    flat.each_entry do |report|
      report.iterations.each do |query_it|
        prot_id = query_it.query_def.split(" ")[0]
        next if @aln_hits.has_key? prot_id

        query_it.hits.each do |hit|
          p_identity = hit.identity.to_f/hit.target_len.to_f*100
          next unless p_identity >= @pidentity

          definition_clean = hit.definition.to_s.split(">")[0].to_s
          @aln_hits[prot_id] = {
            pId: p_identity,
            length: hit.target_len.to_i,
            evalue: hit.evalue,
            score: hit.bit_score.to_f,
            hits: [{gi: hit.hit_id.to_s.split("|")[1],
                    product: clean_product(definition_clean),
                    org: organism_from(definition_clean)}]
          }
          break # only the first qualifying hit is kept
        end
      end
    end
  end # end of method


  # Request id found in the submission answer: text (spaces removed) of the
  # table cell right after the one reading "Request ID", "" when absent.
  def request_id_from(result_page)
    cells = result_page.parser.css('td').to_a
    label_at = cells.index { |td| td.text == "Request ID" }
    return "" if label_at.nil? || cells[label_at + 1].nil?
    cells[label_at + 1].text.gsub(" ","")
  end

  # Seconds between two polls: 30 for nr / refseq_protein, 10 otherwise.
  def poll_interval
    case @db
    when 'nr', 'refseq_protein'
      30
    else
      10
    end
  end

  # cleaning product definition
  # RETURN : the cleaned product name, nil when nothing is left
  def clean_product(definition)
    product = definition.
      gsub("MULTISPECIES: ","").
      gsub(/ \[.*\]/,"").
      gsub("RecName: Full=","")
    ["; AltName", "; Flags:", " ; Short="].each do |marker|
      product = product.split(marker)[0].to_s
    end
    product.empty? ? nil : product
  end

  # Text found between square brackets in the definition, "" when none.
  def organism_from(definition)
    definition[/\[.*\]/].to_s.delete("[]")
  end

  private :request_id_from, :poll_interval, :clean_product, :organism_from


end # end of class
|