bacterial-annotator 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,201 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- # author: maxime déraspe
3
- # email: maximilien1er@gmail.com
4
- # review:
5
- # date: 15-02-24
6
- # version: 0.0.1
7
- # licence:
8
-
9
- require 'mechanize'
10
- require 'open-uri'
11
- require 'bio'
12
-
13
# Submits a protein FASTA file to the NCBI web BLAST form (blastp), polls the
# result URL until an XML report is returned, writes the report to disk and
# extracts, for each query protein, the first hit whose percent identity
# reaches a threshold.
class RemoteNCBI

  attr_reader :aln_hits, :db, :xmloutput

  # Databases accepted by the constructor; anything else falls back to DEFAULT_DB.
  ALLOWED_DBS = ["swissprot", "refseq_protein", "nr"].freeze
  DEFAULT_DB = "nr"

  # Upper bound on form submissions before giving up.
  MAX_SUBMIT_ATTEMPTS = 11
  # Seconds to wait between two form submissions.
  SUBMIT_RETRY_DELAY = 3

  # Seconds to wait between two polls of the result URL, per database.
  POLL_INTERVALS = { "nr" => 30, "refseq_protein" => 30, "swissprot" => 10 }.freeze
  DEFAULT_POLL_INTERVAL = 30

  # First line of a finished XML report. The "?" must be escaped: unescaped it
  # makes "<" optional and any line containing "xml version=" would match.
  XML_DECLARATION = /<\?xml version=/

  # Submit the search and wait for its result.
  #
  # db        - database name; replaced by "nr" when not in ALLOWED_DBS
  # seq_file  - path of the FASTA file to submit
  # outfile   - path where the XML report is written
  # pidentity - minimum percent identity used by #extract_blast_results
  def initialize(db, seq_file, outfile, pidentity)

    @db = ALLOWED_DBS.include?(db) ? db : DEFAULT_DB

    url = 'https://blast.ncbi.nlm.nih.gov/Blast.cgi'\
          '?PROGRAM=blastp&BLAST_PROGRAMS=blastp'\
          '&PAGE_TYPE=BlastSearch&SHOW_DEFAULTS=on'\
          '&LINK_LOC=blasthome'

    @seq_file = seq_file
    @outfile = outfile
    @pidentity = pidentity
    @xmloutput = ""
    @resultURI = submit_blast url

    # submit_blast returns "" when no request ID could be obtained
    @valid = @resultURI.to_s.empty? ? false : validate_output

  end # end of method


  # Submit the sequences through the NCBI search form.
  #
  # ncbiURL - URL of the page holding the form named "searchForm"
  #
  # Returns the URI of the XML result for the request, or "" when no request
  # ID was obtained after MAX_SUBMIT_ATTEMPTS submissions.
  def submit_blast ncbiURL

    f = File.basename(@seq_file)
    sequences = File.read(@seq_file)

    agent = Mechanize.new { |a|
      a.user_agent_alias = 'Linux Firefox'
      a.ignore_bad_chunking = true
    }

    request_id = ""

    1.upto(MAX_SUBMIT_ATTEMPTS) do |attempt|
      request_id = begin
                     request_id_from(post_search(agent, ncbiURL, sequences))
                   rescue StandardError
                     ""
                   end
      break unless request_id.empty?
      # every failed attempt counts (exception or missing request ID), so the
      # loop always terminates; no pause after the last attempt
      next if attempt == MAX_SUBMIT_ATTEMPTS
      puts "#{attempt + 1} POST try for #{f}"
      sleep SUBMIT_RETRY_DELAY
    end

    if request_id.empty?
      warn "NCBI Blast submission failed for #{f}"
      return ""
    end

    uri_parsed = "https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Get&RID=#{request_id}"
    puts "NCBI Blast for #{f}: #{uri_parsed}"

    URI.parse("https://blast.ncbi.nlm.nih.gov/Blast.cgi?RESULTS_FILE=on&RID=#{request_id}&FORMAT_TYPE=XML&FORMAT_OBJECT=Alignment&CMD=Get")

  end # end of method


  # Poll the result URI until it returns an XML report or stops reporting
  # Status=WAITING. NOTE: polling is unbounded while the status is WAITING.
  #
  # On success the report is kept in @xmloutput and written to @outfile.
  #
  # Returns true when a report was obtained, false otherwise.
  def validate_output

    xml = nil
    waiting = true

    while waiting
      response = Net::HTTP.get_response(@resultURI)
      lines = response.body.to_s.split("\n")

      if lines.first.to_s =~ XML_DECLARATION
        xml = lines.join("\n")
        break
      end

      waiting = blast_waiting?(lines)
      # only pause when another poll will follow
      sleep POLL_INTERVALS.fetch(@db, DEFAULT_POLL_INTERVAL) if waiting
    end

    return false if xml.nil?

    @xmloutput = xml
    File.open("#{@outfile}", "w") do |f|
      f.write(xml)
    end
    true

  end # end of method

  # Fill @aln_hits from the XML report in @outfile: for each query protein
  # (first word of the query definition) keep the first hit whose percent
  # identity (identity / target length * 100) is >= @pidentity.
  #
  # @aln_hits is nil when the remote run did not produce a valid report.
  #
  # Returns @aln_hits.
  def extract_blast_results

    if !@valid
      @aln_hits = nil
      return
    end

    @aln_hits = {}

    # block form so the report file is closed when parsing is done
    Bio::FlatFile.auto("#{@outfile}") do |flat|
      flat.each_entry do |report|
        report.iterations.each do |query_it|
          prot_id = query_it.query_def.split(" ")[0]
          next if @aln_hits.has_key? prot_id

          query_it.hits.each do |hit|
            p_identity = hit.identity.to_f / hit.target_len.to_f * 100
            next unless p_identity >= @pidentity
            @aln_hits[prot_id] = hit_record(hit, p_identity)
            break
          end
        end
      end
    end

    @aln_hits

  end # end of method


  private

  # Fill in and submit the search form; returns the page that follows.
  def post_search(agent, url, sequences)
    form = agent.get(url).form_with(:name => 'searchForm')
    form.textareas[0].value = sequences
    form.field_with(:name => 'DATABASE').value = @db
    form.field_with(:name => 'MAX_NUM_SEQ').value = 40
    form.submit
  end

  # Text (spaces removed) of the table cell following the "Request ID" cell,
  # or "" when the page has no such cell.
  def request_id_from(page)
    cells = page.parser.css('td').map { |td| td.text }
    label_at = cells.index("Request ID")
    return "" if label_at.nil?
    cells[label_at + 1].to_s.gsub(" ", "")
  end

  # True when one of the lines reports Status=WAITING.
  def blast_waiting?(lines)
    lines.any? do |l|
      l =~ /Status=/ && l.strip.gsub("Status=", "") == "WAITING"
    end
  end

  # Build the @aln_hits entry for a hit.
  def hit_record(hit, p_identity)
    # only the part before the first ">" is used
    definition = hit.definition.to_s.split(">")[0].to_s
    {
      pId: p_identity.round(2),
      length: hit.target_len.to_i,
      evalue: hit.evalue,
      score: hit.bit_score.to_f,
      hits: [{
        gi: hit.hit_id.to_s.split("|")[1],
        accession: hit.accession.to_s,
        product: clean_product(definition),
        org: organism_of(definition)
      }]
    }
  end

  # Strip the prefixes, the bracketed part and the AltName/Flags/Short
  # suffixes from a hit definition.
  def clean_product(definition)
    product = definition.
      gsub("MULTISPECIES: ", "").
      gsub(/ \[.*\]/, "").
      gsub("RecName: Full=", "")
    # to_s: splitting an empty string yields no element at all
    ["; AltName", "; Flags:", /\s*; Short=/].each do |marker|
      product = product.split(marker)[0].to_s
    end
    product.strip
  end

  # Bracketed part of a hit definition without its brackets, "" when absent.
  def organism_of(definition)
    bracketed = definition[/\[.*\]/]
    return "" if bracketed.nil?
    bracketed.gsub("[", "").gsub("]", "")
  end

end # end of class