bacterial-annotator 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/ba_prodigal +1 -1
- data/bin/bacterial-annotator +13 -14
- data/lib/bacterial-annotator/{genbank-manip.rb → sequence-annotation.rb} +128 -16
- data/lib/bacterial-annotator/{fasta-manip.rb → sequence-fasta.rb} +32 -23
- data/lib/bacterial-annotator/{synteny-manip.rb → sequence-synteny.rb} +128 -8
- data/lib/bacterial-annotator.rb +211 -140
- data/lib/bacterial-comparator.rb +1 -0
- metadata +5 -6
- data/lib/bacterial-annotator/remote-ncbi.rb +0 -201
@@ -1,201 +0,0 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
# author: maxime déraspe
|
3
|
-
# email: maximilien1er@gmail.com
|
4
|
-
# review:
|
5
|
-
# date: 15-02-24
|
6
|
-
# version: 0.0.1
|
7
|
-
# licence:
|
8
|
-
|
9
|
-
require 'mechanize'
|
10
|
-
require 'open-uri'
|
11
|
-
require 'bio'
|
12
|
-
|
13
|
-
# Submits a protein FASTA file to the NCBI web BLAST (blastp), polls for the
# XML report, writes it to disk and extracts the best hit per query protein.
#
# Readers:
#   aln_hits  - Hash of query protein id => best-hit details, filled by
#               extract_blast_results (nil when the run is not valid)
#   db        - database actually used for the search
#   xmloutput - XML report text once it has been retrieved ("" before that)
class RemoteNCBI

  # Databases accepted for the search; anything else falls back to "nr".
  VALID_DBS = ["swissprot", "refseq_protein", "nr"].freeze

  BLAST_CGI = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'

  # Search form page used to submit the query.
  SUBMIT_URL = "#{BLAST_CGI}?PROGRAM=blastp&BLAST_PROGRAMS=blastp" \
               '&PAGE_TYPE=BlastSearch&SHOW_DEFAULTS=on' \
               '&LINK_LOC=blasthome'

  # Seconds to wait between two polls of the result page, per database.
  POLL_SECONDS = {
    'nr' => 30,
    'refseq_protein' => 30,
    'swissprot' => 10
  }.freeze

  # Maximum number of submission attempts before giving up.
  MAX_SUBMIT_TRIES = 11

  attr_reader :aln_hits, :db, :xmloutput

  # initialize stuff for a remote ncbi run
  #
  # db        - "swissprot", "refseq_protein" or "nr" (other values => "nr")
  # seq_file  - path of the FASTA file to submit
  # outfile   - path where the XML report is written
  # pidentity - minimal percent identity for a hit to be kept
  #
  # Submits the search and polls for the result right away (network I/O).
  def initialize(db, seq_file, outfile, pidentity)

    @db = VALID_DBS.include?(db) ? db : "nr"   # bad database => nr
    @seq_file = seq_file
    @outfile = outfile
    @pidentity = pidentity
    @xmloutput = ""

    @resultURI = submit_blast SUBMIT_URL

    # submit_blast returns "" when no Request ID could be obtained
    @valid = @resultURI != "" ? validate_output : false

  end # end of method


  # submit blast to ncbi
  #
  # ncbiURL - URL of the BLAST search form
  #
  # Returns the URI of the XML result for the obtained Request ID, or ""
  # when no Request ID could be obtained after MAX_SUBMIT_TRIES attempts.
  def submit_blast(ncbiURL)

    f = File.basename(@seq_file)
    seq_fasta = File.read(@seq_file)

    agent = Mechanize.new { |a|
      a.user_agent_alias = 'Linux Firefox'
      a.ignore_bad_chunking = true
    }

    request_id = ""
    try = 1

    loop do
      begin
        request_id = request_blast_id(agent, ncbiURL, seq_fasta)
      rescue StandardError => e
        warn "POST for #{f} failed: #{e.message}"
      end

      # Every attempt counts, whether it raised or simply returned a page
      # without a Request ID; otherwise the loop could spin forever.
      break if !request_id.empty? || try >= MAX_SUBMIT_TRIES

      try += 1
      puts "#{try} POST try for #{f}"
      sleep 3
    end

    if request_id.empty?
      puts "NCBI Blast for #{f}: no Request ID obtained after #{try} tries"
      return ""
    end

    uri_parsed = "#{BLAST_CGI}?CMD=Get&RID=#{request_id}"
    puts "NCBI Blast for #{f}: #{uri_parsed}"

    URI.parse("#{BLAST_CGI}?RESULTS_FILE=on&RID=#{request_id}&FORMAT_TYPE=XML&FORMAT_OBJECT=Alignment&CMD=Get")

  end # end of method


  # validate the xml blast results
  #
  # Polls @resultURI until the response starts with an XML declaration or the
  # reported status is something else than WAITING. On success the report is
  # stored in @xmloutput and written to @outfile.
  #
  # NOTE(review): polling is unbounded while NCBI keeps answering WAITING.
  #
  # Returns true when the XML report was retrieved, false otherwise.
  def validate_output

    loop do

      response = Net::HTTP.get_response(@resultURI)
      body = response.body.to_s.split("\n")

      # "?" must be escaped, otherwise "<" is merely optional
      if body.first.to_s =~ /<\?xml version=/
        @xmloutput = body.join("\n")
        File.open(@outfile.to_s, "w") do |f|
          f.write(@xmloutput)
        end
        return true
      end

      return false unless job_waiting?(body)

      # only wait when another poll is actually going to happen
      sleep POLL_SECONDS.fetch(@db, 30)

    end

  end # end of method

  # extract blast results from the xml report written to @outfile
  #
  # Fills @aln_hits with, for each query protein, the first hit whose percent
  # identity (identity / target length * 100) is >= @pidentity.
  # Sets @aln_hits to nil when the run is not valid.
  def extract_blast_results

    unless @valid
      @aln_hits = nil
      return
    end

    @aln_hits = {}

    # block form so the report file gets closed once parsed
    Bio::FlatFile.auto(@outfile.to_s) do |flat|

      flat.each_entry do |report|

        report.iterations.each do |query_it|
          prot_id = query_it.query_def.split(" ")[0]
          next if @aln_hits.has_key? prot_id

          query_it.hits.each do |hit|
            p_identity = hit.identity.to_f / hit.target_len.to_f * 100
            next unless p_identity >= @pidentity

            # keep only the first entry of a multi-entry definition line
            definition_clean = hit.definition.to_s.split(">")[0].to_s

            @aln_hits[prot_id] = {
              pId: p_identity.round(2),
              length: hit.target_len.to_i,
              evalue: hit.evalue,
              score: hit.bit_score.to_f,
              hits: [{
                gi: hit.hit_id.to_s.split("|")[1],
                accession: hit.accession.to_s,
                product: clean_product(definition_clean),
                org: organism_of(definition_clean)
              }]
            }
            break   # first acceptable hit wins for this query
          end
        end

      end

    end

  end # end of method


  private

  # Posts the sequences through the search form and returns the Request ID
  # found in the answer page ("" when the page holds none).
  def request_blast_id(agent, ncbiURL, seq_fasta)

    page = agent.get(ncbiURL)

    search = page.form_with(:name => 'searchForm') { |form|
      form.textareas[0].value = seq_fasta
      form.field_with(:name => 'DATABASE').value = @db
      form.field_with(:name => 'MAX_NUM_SEQ').value = 40
    }.submit

    # the id is the text of the cell following the "Request ID" label cell
    cells = search.parser.css('td').to_a
    label_idx = cells.index { |td| td.text == "Request ID" }
    return "" if label_idx.nil? || cells[label_idx + 1].nil?

    cells[label_idx + 1].text.gsub(" ", "")

  end # end of method

  # True when one of the response lines reports Status=WAITING.
  def job_waiting?(lines)
    lines.any? do |l|
      l =~ /Status=/ && l.strip.gsub("Status=", "") == "WAITING"
    end
  end # end of method

  # cleaning product definition: drops the MULTISPECIES / RecName prefixes,
  # the bracketed part and the AltName / Flags / Short suffixes.
  def clean_product(definition)
    definition.
      gsub("MULTISPECIES: ", "").
      gsub(/ \[.*\]/, "").
      gsub("RecName: Full=", "").
      split("; AltName")[0].to_s.
      split("; Flags:")[0].to_s.
      split(" ; Short=")[0].to_s.strip
  end # end of method

  # Bracketed part of the definition without the brackets, "" when absent.
  def organism_of(definition)
    definition[/\[.*\]/].to_s.delete("[]")
  end # end of method

end # end of class
|