protk 1.2.6.pre1 → 1.2.6.pre2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +22 -27
- data/bin/blastxml_to_table.rb +50 -3
- data/bin/make_decoy.rb +30 -2
- data/bin/mascot_search.rb +46 -27
- data/bin/msgfplus_search.rb +7 -4
- data/bin/peptide_prophet.rb +9 -0
- data/bin/protxml_to_gff.rb +122 -66
- data/bin/protxml_to_table.rb +26 -3
- data/bin/tandem_search.rb +1 -1
- data/lib/protk/constants.rb +19 -19
- data/lib/protk/data/default_config.yml +0 -7
- data/lib/protk/search_tool.rb +7 -0
- metadata +118 -90
- data/bin/mascot2xml.rb +0 -87
- data/ext/protk/simplealign/simplealign.c +0 -17
- data/lib/protk/data/pepxml_mascot_template.xml +0 -29
- data/lib/protk/data/predefined_db.trembl_annotation.yaml +0 -20
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: c2cb8ece1038575e27d4d2bb8d5eff3ff2367a33
|
4
|
+
data.tar.gz: db9cd97c0b87186e7a84c53ed43e132b25d1cd15
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: df19012cdb42d14136a4f4682a3a45b2423b4b216bcdd8f959e0eaab5b7bcaa0546b5550d3a69accc1da4e158966150bb9f0e21d52c0ed2bf2c0397ca333195c
|
7
|
+
data.tar.gz: a08a7bd13e3027b21bdd24cf13491e6041b05a19f3f066468a96357b543332e40b4827356de2d4b81e1d7a67daa44a73b283b1948f7824f33c6cf738df46383d
|
data/README.md
CHANGED
@@ -8,43 +8,38 @@ Protk is a wrapper for various proteomics tools. It aims to present a consistent
|
|
8
8
|
|
9
9
|
***
|
10
10
|
|
11
|
+
## Table of Contents
|
11
12
|
|
13
|
+
* [Protk](#what-is-it?)
|
14
|
+
* [Installation](#installation)
|
15
|
+
* [Configuration](#configuration)
|
12
16
|
|
13
|
-
|
17
|
+
|
18
|
+
|
19
|
+
## Installation
|
14
20
|
|
15
|
-
|
16
|
-
|
17
|
-
```
|
18
|
-
|
21
|
+
The easiest installation method is to use rubygems. You might need to install the libxml2 package on your system first (e.g. libxml-dev on Ubuntu).
|
22
|
+
|
23
|
+
``` shell
|
24
|
+
gem install protk
|
19
25
|
```
|
20
26
|
|
21
|
-
|
27
|
+
## Configuration
|
22
28
|
|
23
|
-
|
29
|
+
By default protk will install tools and databases into `.protk` in your home directory. If this is not desirable you can change the protk root default by setting the environment variable `PROTK_INSTALL_DIR`. You can also avoid using a `.protk` directory altogether (see below)
|
24
30
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
protk_setup.rb package_manager
|
30
|
-
protk_setup.rb system_packages
|
31
|
-
protk_setup.rb all
|
32
|
-
```
|
33
|
-
On Linux
|
34
|
-
|
35
|
-
```sh
|
36
|
-
rvm install 1.9.3
|
37
|
-
rvm use 1.9.3
|
38
|
-
gem install protk
|
39
|
-
sudo ~/.rvm/bin/rvm 1.9.3 do protk_setup.rb system_packages
|
40
|
-
protk_setup all
|
31
|
+
Protk includes a setup tool to install various third party proteomics tools such as the TPP, OMSSA, MS-GF+, Proteowizard. If this tool is used it installs everything under `.protk/tools`. To perform such an installation use;
|
32
|
+
|
33
|
+
```shell
|
34
|
+
protk_setup.rb tpp omssa blast msgfplus pwiz
|
41
35
|
```
|
42
36
|
|
43
|
-
|
37
|
+
Alternatively, these tools may already be present on your system, or you may prefer to install them yourself. In that case simply ensure that all executables are included in your `$PATH`. Those executables will be used as a fallback if nothing is available under the `.protk` installation directory.
|
38
|
+
|
39
|
+
|
40
|
+
Instead of using `protk_setup.rb all`, it might be preferable to only install some of the protk tool dependencies. 'all' is just an alias for the following full target list, any of which can be omitted with the consequence that tools depending on that component will not be available.
|
41
|
+
|
44
42
|
|
45
|
-
```sh
|
46
|
-
protk_setup.rb tpp omssa blast msgfplus pwiz openms galaxyenv
|
47
|
-
```
|
48
43
|
|
49
44
|
## Sequence databases
|
50
45
|
|
data/bin/blastxml_to_table.rb
CHANGED
@@ -10,7 +10,6 @@ require 'bio'
|
|
10
10
|
require 'protk/fastadb'
|
11
11
|
require 'bio-blastxmlparser'
|
12
12
|
|
13
|
-
|
14
13
|
tool=Tool.new([:explicit_output])
|
15
14
|
tool.option_parser.banner = "Dump BLAST xml to tabular format.\n\nUsage: blastxml_to_table.rb blast.xml"
|
16
15
|
|
@@ -19,6 +18,16 @@ tool.option_parser.on( '-d filename','--database filename', 'Database used for B
|
|
19
18
|
tool.options.database=file
|
20
19
|
end
|
21
20
|
|
21
|
+
tool.options.gene2go=nil
|
22
|
+
tool.option_parser.on('--gene2go pathtogene2go','Path to gene2go database. If provided GO terms will be looked up') do |gene2go|
|
23
|
+
tool.options.gene2go=gene2go
|
24
|
+
end
|
25
|
+
|
26
|
+
tool.options.gitogeneid=nil
|
27
|
+
tool.option_parser.on('--gitogeneid gitogeneid.db','Path to GDBM formatted gi to geneid mapping database. If provided gene ids will be looked up') do |gitogeneid|
|
28
|
+
tool.options.gitogeneid=gitogeneid
|
29
|
+
end
|
30
|
+
|
22
31
|
exit unless tool.check_options
|
23
32
|
|
24
33
|
#require 'debugger';debugger
|
@@ -36,11 +45,43 @@ if tool.database
|
|
36
45
|
$fastadb=FastaDB.new(tool.database)
|
37
46
|
end
|
38
47
|
|
48
|
+
$gitogeneid = nil
|
49
|
+
if (tool.gitogeneid!=nil) && (File.exist? tool.gitogeneid)
|
50
|
+
require 'gdbm'
|
51
|
+
$gitogeneid = GDBM.new(tool.gitogeneid,flags=GDBM::READER)
|
52
|
+
end
|
53
|
+
|
54
|
+
|
55
|
+
$gene2go = nil
|
56
|
+
if (tool.gene2go!=nil) && (File.exist? tool.gene2go)
|
57
|
+
require 'gdbm'
|
58
|
+
$gene2go = GDBM.new(tool.gene2go,flags=GDBM::READER)
|
59
|
+
end
|
60
|
+
|
61
|
+
def gi_from_hit_id(hit_id)
|
62
|
+
gi_scan=hit_id.scan(/gi\|(\d+)/)
|
63
|
+
gi_scan.join("")
|
64
|
+
end
|
65
|
+
|
39
66
|
def generate_line(hsp,hit,query,hit_seq=nil)
|
40
|
-
|
67
|
+
|
68
|
+
line="#{query.query_id}\t#{query.query_def}\t#{hit.hit_id}\t#{hit.hit_num}\t#{hit.hit_def}\t#{hit.accession}\t#{hsp.hsp_num}\t#{hsp.bit_score}\t#{hsp.evalue}\t#{hsp.qseq}\t#{hsp.hseq}"
|
41
69
|
if hit_seq
|
42
70
|
line << "\t#{hit_seq}"
|
43
71
|
end
|
72
|
+
geneid=""
|
73
|
+
goterm=""
|
74
|
+
if $gitogeneid
|
75
|
+
geneid=$gitogeneid[gi_from_hit_id(hit.hit_id)]
|
76
|
+
goterm=$gene2go[geneid] if geneid!=nil && $gene2go
|
77
|
+
end
|
78
|
+
|
79
|
+
|
80
|
+
# throw "No geneid" if geneid==nil
|
81
|
+
line << "\t#{geneid}\t#{goterm}"
|
82
|
+
# require 'debugger';debugger
|
83
|
+
# puts gi_from_hit_id(hit.hit_id)
|
84
|
+
# puts $gene2go[gi_from_hit_id(hit.hit_id)]
|
44
85
|
line<<"\n"
|
45
86
|
line
|
46
87
|
end
|
@@ -61,12 +102,18 @@ blast.each do |query|
|
|
61
102
|
# if hit
|
62
103
|
hit_seq=fetch_hit_seq(hit)
|
63
104
|
hit.hsps.each do |hsp|
|
64
|
-
|
105
|
+
out_line=generate_line(hsp,hit,query,hit_seq)
|
106
|
+
|
107
|
+
out_file.write out_line
|
65
108
|
end
|
66
109
|
# end
|
67
110
|
end
|
68
111
|
end
|
69
112
|
|
113
|
+
|
114
|
+
$gitogeneid.close if $gitogeneid!=nil
|
115
|
+
$gene2go.close if $gene2go!=nil
|
116
|
+
|
70
117
|
#require 'debugger';debugger
|
71
118
|
|
72
119
|
#puts "Hi"
|
data/bin/make_decoy.rb
CHANGED
@@ -32,6 +32,16 @@ tool.option_parser.on('-P str','--prefix-string str','String to prepend to seque
|
|
32
32
|
tool.options.prefix_string=str
|
33
33
|
end
|
34
34
|
|
35
|
+
tool.options.reverse_only=false
|
36
|
+
tool.option_parser.on('--reverse-only','Just reverse sequences. Dont try to randomize') do
|
37
|
+
tool.options.reverse_only=true
|
38
|
+
end
|
39
|
+
|
40
|
+
tool.options.id_regex=".*?\\|(.*?)[ \\|]"
|
41
|
+
tool.option_parser.on('--id-regex regex','Regex for finding IDs. If reverse-only is used then this will be used to find ids and prepend with the decoy string. Default .*?\\|(.*?)[ \\|]') do regex
|
42
|
+
tool.options.id_regex=regex
|
43
|
+
end
|
44
|
+
|
35
45
|
tool.options.append=false
|
36
46
|
tool.option_parser.on('-A','--append','Append input sequences to the generated database') do
|
37
47
|
tool.options.append=true
|
@@ -64,7 +74,23 @@ genv=Constants.new()
|
|
64
74
|
|
65
75
|
decoys_tmp_file = Pathname.new(Tempfile.new("random").path).basename.to_s;
|
66
76
|
|
67
|
-
|
77
|
+
if (tool.reverse_only)
|
78
|
+
decoys_out = File.open(decoys_tmp_file,'w+')
|
79
|
+
Bio::FastaFormat.open(input_file).each do |seq|
|
80
|
+
id=nil
|
81
|
+
begin
|
82
|
+
id=seq.definition.scan(/#{id_regex}/)[0][0]
|
83
|
+
revdef=seq.definition.sub(id,"#{tool.prefix_string}#{id}")
|
84
|
+
decoys_out.write ">#{revdef}\n#{seq.aaseq}\n"
|
85
|
+
rescue
|
86
|
+
puts "Unable to parse id for #{seq.definition}. Skipping" if (id==nil)
|
87
|
+
end
|
88
|
+
end
|
89
|
+
decoys_out.close
|
90
|
+
else
|
91
|
+
Randomize.make_decoys input_file, db_length, decoys_tmp_file, tool.prefix_string
|
92
|
+
end
|
93
|
+
|
68
94
|
cmd = "cat #{input_file} #{decoys_tmp_file} >> #{output_file}; rm #{decoys_tmp_file}" if ( tool.append )
|
69
95
|
|
70
96
|
# Randomize.make_decoys raw_db_filename, db_length, decoys_filename, decoy_prefix
|
@@ -76,4 +102,6 @@ p cmd
|
|
76
102
|
job_params= {:jobid => tool.jobid_from_filename(input_file) }
|
77
103
|
job_params[:queue]="lowmem"
|
78
104
|
job_params[:vmem]="900mb"
|
79
|
-
tool.run(cmd,genv,job_params)
|
105
|
+
tool.run(cmd,genv,job_params)
|
106
|
+
|
107
|
+
|
data/bin/mascot_search.rb
CHANGED
@@ -22,7 +22,7 @@ def login(mascot_cgi,username,password)
|
|
22
22
|
authdict[:savecookie]="1"
|
23
23
|
|
24
24
|
p "Logging in to #{mascot_cgi}/login.pl"
|
25
|
-
|
25
|
+
|
26
26
|
response = RestClient.post "#{mascot_cgi}/login.pl", authdict
|
27
27
|
|
28
28
|
cookie = response.cookies
|
@@ -40,6 +40,8 @@ def download_datfile(mascot_cgi,results_date,results_file,explicit_output,openur
|
|
40
40
|
output_path="#{results_file}"
|
41
41
|
end
|
42
42
|
|
43
|
+
puts "Writing output to #{output_path}"
|
44
|
+
|
43
45
|
require 'open-uri'
|
44
46
|
open("#{output_path}", 'wb') do |file|
|
45
47
|
file << open("#{get_url}","Cookie"=>openurlcookie).read
|
@@ -144,6 +146,12 @@ search_tool.option_parser.on( '--export format', 'Save results in a specified fo
|
|
144
146
|
search_tool.options.export_format=format
|
145
147
|
end
|
146
148
|
|
149
|
+
search_tool.options.download_only=nil
|
150
|
+
search_tool.option_parser.on( '--download-only path', 'Specify a path to an existing results file for download eg(20131113/F227185.dat)' ) do |path|
|
151
|
+
search_tool.options.download_only=path
|
152
|
+
end
|
153
|
+
|
154
|
+
|
147
155
|
search_tool.options.timeout=200
|
148
156
|
search_tool.option_parser.on( '--timeout seconds', 'Timeout for sending data file to mascot in seconds' ) do |seconds|
|
149
157
|
search_tool.options.timeout=seconds.to_i
|
@@ -151,8 +159,9 @@ end
|
|
151
159
|
|
152
160
|
exit unless search_tool.check_options
|
153
161
|
|
154
|
-
if ( ARGV[0].nil? )
|
162
|
+
if ( ARGV[0].nil? && search_tool.download_only.nil?)
|
155
163
|
puts "You must supply an input file"
|
164
|
+
puts search_tool.download_only
|
156
165
|
puts search_tool.option_parser
|
157
166
|
exit
|
158
167
|
end
|
@@ -167,7 +176,6 @@ unless ( mascot_cgi =~ /^http[s]?:\/\//)
|
|
167
176
|
end
|
168
177
|
|
169
178
|
RestClient.proxy=search_tool.httpproxy if search_tool.httpproxy
|
170
|
-
$genv.log("Var mods #{search_tool.var_mods} and fixed #{search_tool.fix_mods}",:info)
|
171
179
|
|
172
180
|
cookie=""
|
173
181
|
openurlcookie=""
|
@@ -178,36 +186,47 @@ if ( search_tool.use_security)
|
|
178
186
|
openurlcookie = "MASCOT_SESSION=#{cookie['MASCOT_SESSION']}; MASCOT_USERID=#{cookie['MASCOT_USERID']}; MASCOT_USERNAME=#{cookie['MASCOT_USERNAME']}"
|
179
187
|
end
|
180
188
|
|
181
|
-
|
182
|
-
|
189
|
+
if ( !search_tool.download_only.nil?)
|
190
|
+
parts=search_tool.download_only.split("/")
|
191
|
+
throw "Must provide a path of the format date/filename" unless parts.length==2
|
192
|
+
results_date=parts[0]
|
193
|
+
results_file=parts[1]
|
194
|
+
download_datfile mascot_cgi, results_date, results_file,search_tool.explicit_output,openurlcookie
|
195
|
+
else
|
196
|
+
#$genv.log("Var mods #{search_tool.var_mods} and fixed #{search_tool.fix_mods}",:info)
|
183
197
|
|
184
|
-
|
185
|
-
|
198
|
+
postdict = search_params_dictionary search_tool, ARGV[0]
|
199
|
+
$genv.log("Sending #{postdict}",:info)
|
186
200
|
|
187
|
-
|
201
|
+
#site = RestClient::Resource.new(mascot_cgi, timeout=300)
|
202
|
+
#search_response=site['/nph-mascot.exe?1'].post , postdict, {:cookies=>cookie}
|
188
203
|
|
204
|
+
search_response=RestClient::Request.execute(:method => :post, :url => "#{mascot_cgi}/nph-mascot.exe?1", :payload => postdict,:headers=>{:cookies=>cookie},:timeout => search_tool.options.timeout, :open_timeout => 10)
|
189
205
|
|
190
|
-
#search_response=RestClient.post "#{mascot_cgi}/nph-mascot.exe?1", postdict, {:cookies=>cookie}
|
191
206
|
|
192
|
-
|
207
|
+
#search_response=RestClient.post "#{mascot_cgi}/nph-mascot.exe?1", postdict, {:cookies=>cookie}
|
193
208
|
|
194
|
-
|
195
|
-
error_result= /Sorry, your search could not be performed(.*)/.match(search_response)
|
196
|
-
if ( error_result != nil )
|
197
|
-
puts error_result[0]
|
198
|
-
$genv.log("Mascot search failed with response #{search_response}",:warn)
|
199
|
-
throw "Mascot search failed with response #{search_response}"
|
200
|
-
elsif (search_tool.export_format=="mascotdat")
|
201
|
-
# Search for the location of the mascot data file in the response
|
202
|
-
results=/master_results_?2?\.pl\?file=\.*\/data\/(.*)\/(.+\.dat)/.match(search_response)
|
203
|
-
results_date=results[1]
|
204
|
-
results_file=results[2]
|
209
|
+
$genv.log("Mascot search response was #{search_response}",:info)
|
205
210
|
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
#
|
211
|
+
# Look for an error if there is one
|
212
|
+
error_result= /Sorry, your search could not be performed(.*)/.match(search_response)
|
213
|
+
if ( error_result != nil )
|
214
|
+
puts error_result[0]
|
215
|
+
$genv.log("Mascot search failed with response #{search_response}",:warn)
|
216
|
+
throw "Mascot search failed with response #{search_response}"
|
217
|
+
elsif (search_tool.export_format=="mascotdat")
|
218
|
+
# Search for the location of the mascot data file in the response
|
219
|
+
results=/master_results_?2?\.pl\?file=\.*\/data\/(.*)\/(.+\.dat)/.match(search_response)
|
220
|
+
results_date=results[1]
|
221
|
+
results_file=results[2]
|
222
|
+
|
223
|
+
download_datfile mascot_cgi, results_date, results_file,search_tool.explicit_output,openurlcookie
|
224
|
+
else
|
225
|
+
results=/master_results_?2?\.pl\?file=(\.*\/data\/.*\/.+\.dat)/.match(search_response)
|
226
|
+
results_file = results[1]
|
227
|
+
export_results mascot_cgi,cookie,results_file,search_tool.export_format, openurlcookie
|
228
|
+
# export_results mascot_cgi,cookie,results_file,search_tool.export_format
|
229
|
+
end
|
212
230
|
end
|
213
231
|
|
232
|
+
|
data/bin/msgfplus_search.rb
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
#
|
6
6
|
# Runs an MS/MS search using the MSGFPlus search engine
|
7
7
|
#
|
8
|
+
|
8
9
|
$VERBOSE=nil
|
9
10
|
require 'protk/constants'
|
10
11
|
require 'protk/command_runner'
|
@@ -18,7 +19,7 @@ input_stager = nil
|
|
18
19
|
# Setup specific command-line options for this tool. Other options are inherited from SearchTool
|
19
20
|
#
|
20
21
|
search_tool=SearchTool.new([:background,:database,:explicit_output,:over_write,:enzyme,
|
21
|
-
:modifications,:instrument,:mass_tolerance_units,:mass_tolerance,:
|
22
|
+
:modifications,:instrument,:mass_tolerance_units,:mass_tolerance,:cleavage_semi])
|
22
23
|
|
23
24
|
search_tool.jobid_prefix="p"
|
24
25
|
search_tool.option_parser.banner = "Run an MSGFPlus msms search on a set of msms spectrum input files.\n\nUsage: msgfplus_search.rb [options] file1.mzML file2.mzML ..."
|
@@ -164,10 +165,10 @@ ARGV.each do |filename|
|
|
164
165
|
# The basic command
|
165
166
|
#
|
166
167
|
cmd= "#{make_msgfdb_cmd} java -Xmx#{search_tool.java_mem} -jar #{msgf_bin} -d #{current_db} -s #{input_path} -o #{mzid_output_path} "
|
167
|
-
|
168
|
+
|
169
|
+
#Semi tryptic peptides
|
168
170
|
#
|
169
|
-
|
170
|
-
cmd << " -ntt #{search_tool.missed_cleavages}"
|
171
|
+
cmd << " -ntt 1" if ( search_tool.cleavage_semi )
|
171
172
|
|
172
173
|
# Precursor tolerance
|
173
174
|
#
|
@@ -235,6 +236,8 @@ ARGV.each do |filename|
|
|
235
236
|
cmd << "; cp #{mzid_output_path} #{output_path}"
|
236
237
|
else
|
237
238
|
#if search_tool.explicit_output
|
239
|
+
cmd << ";ruby -pi.bak -e \"gsub('post=\\\"?','post=\\\"X')\" #{mzid_output_path}"
|
240
|
+
cmd << ";ruby -pi.bak -e \"gsub('pre=\\\"?','pre=\\\"X')\" #{mzid_output_path}"
|
238
241
|
cmd << "; #{genv.idconvert} #{mzid_output_path} --pepXML -o #{Pathname.new(mzid_output_path).dirname}"
|
239
242
|
#Then copy the pepxml to the final output path
|
240
243
|
cmd << "; mv #{mzid_output_path.chomp('.mzid')}.pepXML #{output_path}"
|
data/bin/peptide_prophet.rb
CHANGED
@@ -92,6 +92,11 @@ prophet_tool.option_parser.on( '--no-decoy', 'Don\'t use decoy sequences to pin
|
|
92
92
|
prophet_tool.options.no_decoys = true
|
93
93
|
end
|
94
94
|
|
95
|
+
prophet_tool.options.experiment_label=nil
|
96
|
+
prophet_tool.option_parser.on('--experiment-label label','used to commonly label all spectra belonging to one experiment (required by iProphet)') do |label|
|
97
|
+
prophet_tool.options.experiment_label = label
|
98
|
+
end
|
99
|
+
|
95
100
|
prophet_tool.options.override_database=nil
|
96
101
|
prophet_tool.option_parser.on( '--override-database database', 'Manually specify database') do |database|
|
97
102
|
prophet_tool.options.override_database = database
|
@@ -212,6 +217,10 @@ def generate_command(genv,prophet_tool,inputs,output,database,engine)
|
|
212
217
|
cmd << " -I2 -T3 -I4 -I5 -I6 -I7 "
|
213
218
|
end
|
214
219
|
|
220
|
+
if prophet_tool.experiment_label!=nil
|
221
|
+
cmd << " -E#{prophet_tool.experiment_label} "
|
222
|
+
end
|
223
|
+
|
215
224
|
unless prophet_tool.no_decoys
|
216
225
|
|
217
226
|
if engine=="omssa" || engine=="phenyx"
|
data/bin/protxml_to_gff.rb
CHANGED
@@ -29,6 +29,16 @@ tool.option_parser.on( '-d filename','--database filename', 'Database used for m
|
|
29
29
|
tool.options.database=file
|
30
30
|
end
|
31
31
|
|
32
|
+
tool.options.protein_find=nil
|
33
|
+
tool.option_parser.on( '-f term','--find term', 'Restrict output to proteins whose name matches the specified string' ) do |term|
|
34
|
+
tool.options.protein_find=term
|
35
|
+
end
|
36
|
+
|
37
|
+
tool.options.nterm_minlen=7
|
38
|
+
tool.option_parser.on( '-n len','--nterm-min-len len', 'Only include inferred N-terminal sequences if longer than len' ) do |len|
|
39
|
+
tool.options.nterm_minlen=len
|
40
|
+
end
|
41
|
+
|
32
42
|
tool.options.genome=nil
|
33
43
|
tool.option_parser.on( '-g filename','--genome filename', 'Nucleotide sequences for scaffolds (Fasta Format)' ) do |file|
|
34
44
|
tool.options.genome=file
|
@@ -39,11 +49,26 @@ tool.option_parser.on('--skip-index','Don\'t index database (Index should alread
|
|
39
49
|
tool.options.skip_fasta_indexing=true
|
40
50
|
end
|
41
51
|
|
52
|
+
tool.options.stack_charge_states=false
|
53
|
+
tool.option_parser.on('--stack-charge-states','Different peptide charge states get separate gff entries') do
|
54
|
+
tool.options.stack_charge_states=true
|
55
|
+
end
|
56
|
+
|
57
|
+
tool.options.collapse_redundant_proteins=false
|
58
|
+
tool.option_parser.on('--collapse-redundant-proteins','Proteins that cover genomic regions already covered will be skipped') do
|
59
|
+
tool.options.collapse_redundant_proteins=true
|
60
|
+
end
|
61
|
+
|
42
62
|
tool.options.peptide_probability_threshold=0.95
|
43
63
|
tool.option_parser.on('--threshold prob','Peptide Probability Threshold (Default 0.95)') do |thresh|
|
44
64
|
tool.options.peptide_probability_threshold=thresh.to_f
|
45
65
|
end
|
46
66
|
|
67
|
+
tool.options.protein_probability_threshold=0.99
|
68
|
+
tool.option_parser.on('--prot-threshold prob','Protein Probability Threshold (Default 0.99)') do |thresh|
|
69
|
+
tool.options.protein_probability_threshold=thresh.to_f
|
70
|
+
end
|
71
|
+
|
47
72
|
exit unless tool.check_options [:protxml,:database]
|
48
73
|
|
49
74
|
gff_out_file="peptides.gff"
|
@@ -94,7 +119,7 @@ def protein_names(protein_node)
|
|
94
119
|
end
|
95
120
|
|
96
121
|
def peptide_nodes(protein_node)
|
97
|
-
protein_node.find('protxml:peptide','protxml:http://regis-web.systemsbiology.net/protXML')
|
122
|
+
return protein_node.find('protxml:peptide','protxml:http://regis-web.systemsbiology.net/protXML')
|
98
123
|
end
|
99
124
|
|
100
125
|
|
@@ -210,41 +235,11 @@ def peptide_is_in_sixframe(pep_seq,gene_seq)
|
|
210
235
|
return false
|
211
236
|
end
|
212
237
|
|
213
|
-
# gene_seq should already have been reverse_complemented if on reverse strand
|
214
|
-
def get_peptide_coordinates_by_alignment(prot_seq,pep_seq,protein_info,gene_seq)
|
215
|
-
if ( peptide_is_in_sixframe(pep_seq,gene_seq))
|
216
|
-
return nil
|
217
|
-
else
|
218
|
-
puts "Warning. Actually found a gap #{protein_info.fasta_id}"
|
219
|
-
aln=GappedAligner.new().align(pep_seq,gene_seq)
|
220
|
-
unless aln.gaps.length==1
|
221
|
-
puts "More than one intron.#{aln}"
|
222
|
-
require 'debugger';debugger
|
223
|
-
end
|
224
|
-
pep_coords = []
|
225
|
-
frags = aln.fragments
|
226
|
-
frags.reverse! if protein_info.strand=='-'
|
227
|
-
|
228
|
-
frags.each { |frag|
|
229
|
-
if protein_info.strand=='+'
|
230
|
-
frag_genomic_start = protein_info.start + frag[0]
|
231
|
-
frag_genomic_end = protein_info.start + frag[1]
|
232
|
-
else
|
233
|
-
frag_genomic_start = protein_info.end - frag[1]
|
234
|
-
frag_genomic_end = protein_info.end - frag[0]
|
235
|
-
end
|
236
|
-
pep_coords << frag_genomic_start
|
237
|
-
pep_coords << frag_genomic_end
|
238
|
-
}
|
239
|
-
|
240
|
-
return [pep_coords]
|
241
|
-
end
|
242
|
-
end
|
243
|
-
|
244
238
|
def fragment_coords_from_protein_coords(pepstart,pepend,gene_start,gene_end,coding_sequences)
|
245
239
|
|
246
240
|
sorted_cds = coding_sequences.sort { |a, b| a[0] <=> b[0] }
|
247
241
|
|
242
|
+
|
248
243
|
# Assume positive strand
|
249
244
|
pi_start=pepstart*3+gene_start-1
|
250
245
|
pi_end=pepend*3+gene_start-1
|
@@ -271,6 +266,13 @@ def fragment_coords_from_protein_coords(pepstart,pepend,gene_start,gene_end,codi
|
|
271
266
|
end
|
272
267
|
else
|
273
268
|
if finding_start
|
269
|
+
|
270
|
+
if ( pi_end <= cds_end) #Whole peptide contained in a single exon
|
271
|
+
fragments << [p_i+1,pi_end]
|
272
|
+
break;
|
273
|
+
end
|
274
|
+
|
275
|
+
|
274
276
|
fragments << [p_i+1,(cds_end)]
|
275
277
|
next_coords = sorted_cds[i+1]
|
276
278
|
intron_offset = ((next_coords[0]-cds_end)-1)
|
@@ -290,9 +292,10 @@ end
|
|
290
292
|
|
291
293
|
# gene_seq should already have been reverse_complemented if on reverse strand
|
292
294
|
def get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
|
293
|
-
if ( peptide_is_in_sixframe(pep_seq,gene_seq))
|
294
|
-
|
295
|
-
|
295
|
+
# if ( peptide_is_in_sixframe(pep_seq,gene_seq))
|
296
|
+
# Peptide is in 6-frame but on a predicted transcript
|
297
|
+
# return nil
|
298
|
+
# else
|
296
299
|
|
297
300
|
# puts "Found a gap #{protein_info.fasta_id}"
|
298
301
|
if protein_info.strand=='-'
|
@@ -315,7 +318,7 @@ def get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,g
|
|
315
318
|
pep_end_i = pep_start_i+pep_seq.length
|
316
319
|
|
317
320
|
return fragment_coords_from_protein_coords(pep_start_i,pep_end_i,protein_info.start,protein_info.end,protein_info.coding_sequences)
|
318
|
-
end
|
321
|
+
# end
|
319
322
|
end
|
320
323
|
|
321
324
|
def get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)
|
@@ -421,7 +424,7 @@ def get_start_codon_coords_for_peptide(peptide_genomic_start,peptide_genomic_end
|
|
421
424
|
end
|
422
425
|
return nil unless is_tryptic
|
423
426
|
|
424
|
-
start_codon_coord = (strand=='+') ? peptide_genomic_start : peptide_genomic_end-
|
427
|
+
start_codon_coord = (strand=='+') ? peptide_genomic_start : peptide_genomic_end-2
|
425
428
|
# require 'debugger';debugger
|
426
429
|
return [start_codon_coord,start_codon_coord+2]
|
427
430
|
else
|
@@ -442,32 +445,38 @@ def get_cterm_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,pepti
|
|
442
445
|
end
|
443
446
|
|
444
447
|
|
445
|
-
def
|
448
|
+
def get_nterm_peptide_for_peptide(peptide_seq,protein_seq)
|
446
449
|
pi=protein_seq.index(peptide_seq)
|
447
450
|
if ( pi>0 && (protein_seq[pi-1]!='K' && protein_seq[pi-1]!='R' && protein_seq[pi]!='M') )
|
451
|
+
# Since trypsin sometimes cleaves before P (ie breaking the rule)
|
452
|
+
# we don't check for it and assume those cases are real tryptic termini
|
448
453
|
reverse_leader_seq=protein_seq[0..pi].reverse
|
449
454
|
mi=reverse_leader_seq.index('M')
|
450
455
|
|
451
456
|
if ( mi==nil )
|
452
|
-
puts "No methionine found ahead of peptide sequence. Unable to determine
|
457
|
+
puts "No methionine found ahead of peptide sequence. Unable to determine n-term sequence"
|
453
458
|
return nil
|
454
459
|
end
|
455
460
|
|
456
461
|
mi=pi-mi
|
457
462
|
|
458
|
-
|
463
|
+
ntermseq=protein_seq[mi..(pi-1)]
|
464
|
+
|
465
|
+
# if ( ntermseq.length < minlen )
|
466
|
+
# return nil
|
467
|
+
# end
|
468
|
+
|
469
|
+
# $STDOUT.write protein_seq[mi..(pi+peptide_seq.length-1)]
|
470
|
+
# require 'debugger';debugger
|
471
|
+
full_seq_with_annotations = "#{ntermseq}(cleaved)#{protein_seq[(pi..(pi+peptide_seq.length-1))]}"
|
472
|
+
|
473
|
+
return full_seq_with_annotations
|
459
474
|
else
|
460
475
|
return nil
|
461
476
|
end
|
462
477
|
end
|
463
478
|
|
464
|
-
def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_info,prot_id,peptide_prob,peptide_count,genomedb=nil)
|
465
|
-
|
466
|
-
dna_sequence=nil
|
467
|
-
if !protein_info.is_sixframe
|
468
|
-
throw "A genome is required if predicted transcripts are to be mapped" unless genomedb!=nil
|
469
|
-
dna_sequence = get_dna_sequence(protein_info,genomedb)
|
470
|
-
end
|
479
|
+
def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_info,prot_id,peptide_prob,peptide_count,dna_sequence,genomedb=nil)
|
471
480
|
|
472
481
|
prot_seq = protein_seq
|
473
482
|
pep_seq = peptide_seq
|
@@ -517,24 +526,37 @@ def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_i
|
|
517
526
|
gff_records+=[start_codon_gff]
|
518
527
|
end
|
519
528
|
|
520
|
-
|
521
|
-
|
522
|
-
# require 'debugger';debugger
|
529
|
+
end
|
530
|
+
# puts gff_records
|
523
531
|
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
532
|
+
gff_records
|
533
|
+
end
|
534
|
+
|
535
|
+
def add_putative_nterm_to_gff(gff_records,peptide_seq,protein_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
|
536
|
+
pep_id = "#{prot_id}.p#{peptide_count.to_s}"
|
537
|
+
signal_peptide = get_nterm_peptide_for_peptide(peptide_seq,protein_seq)
|
538
|
+
if signal_peptide
|
539
|
+
$stdout.write "Nterm\t#{signal_peptide}\t#{protein_info.name}\t#{protein_seq}\n"
|
540
|
+
raw_signal_peptide=signal_peptide.sub(/\(cleaved\)/,"")
|
541
|
+
# Get raw signal_peptide sequence
|
542
|
+
|
543
|
+
signal_peptide_coords=get_peptide_coordinates(protein_seq,raw_signal_peptide,protein_info,dna_sequence)
|
544
|
+
if signal_peptide_coords
|
545
|
+
signal_peptide_coords.each do |spcoords|
|
546
|
+
signal_peptide_gff = generate_fragment_gffs_for_coords(spcoords,protein_info,pep_id,raw_signal_peptide,genomedb,"signalpeptide")
|
528
547
|
gff_records += signal_peptide_gff
|
529
|
-
end
|
530
548
|
end
|
531
549
|
end
|
532
|
-
|
533
|
-
|
534
550
|
end
|
535
|
-
|
551
|
+
end
|
536
552
|
|
537
|
-
|
553
|
+
def peptide_gff_is_duplicate(peptide_gff,peptides_covered_genome)
|
554
|
+
nameindex = peptide_gff.attributes.index {|obj| obj[0]=="Name" }
|
555
|
+
pep_seq = peptide_gff.attributes[nameindex][1]
|
556
|
+
existing = peptides_covered_genome[pep_seq]
|
557
|
+
return true if existing==peptide_gff.start
|
558
|
+
|
559
|
+
return false
|
538
560
|
end
|
539
561
|
|
540
562
|
proteins = parse_proteins(tool.protxml)
|
@@ -552,15 +574,23 @@ peptide_count = 0
|
|
552
574
|
protein_count = 0
|
553
575
|
total_peptides = 0
|
554
576
|
|
577
|
+
peptides_covered_genome={}
|
578
|
+
|
555
579
|
for prot in proteins
|
556
580
|
prot_prob = prot['probability']
|
557
|
-
if ( prot_prob.to_f < tool.
|
581
|
+
if ( prot_prob.to_f < tool.protein_probability_threshold )
|
558
582
|
next
|
559
583
|
end
|
560
584
|
|
561
585
|
# Gets identifiers of all proteins (including indistinguishable ones)
|
562
586
|
prot_names=protein_names(prot)
|
563
587
|
|
588
|
+
|
589
|
+
if tool.protein_find!=nil
|
590
|
+
prot_names=prot_names.keep_if { |pname| pname.include? tool.protein_find }
|
591
|
+
end
|
592
|
+
|
593
|
+
|
564
594
|
peptides=peptide_nodes(prot)
|
565
595
|
entries_covered=[]
|
566
596
|
for protein_name in prot_names
|
@@ -571,7 +601,7 @@ for prot in proteins
|
|
571
601
|
protein_fasta_entry = get_fasta_record(protein_name,fastadb)
|
572
602
|
protein_info = cds_info_from_fasta(protein_fasta_entry)
|
573
603
|
|
574
|
-
|
604
|
+
unless (tool.collapse_redundant_proteins && !is_new_genome_location(protein_info,entries_covered) )
|
575
605
|
|
576
606
|
protein_gff = generate_protein_gff(protein_name,protein_info,prot_prob,protein_count)
|
577
607
|
|
@@ -580,15 +610,41 @@ for prot in proteins
|
|
580
610
|
prot_seq = protein_fasta_entry.aaseq.to_s
|
581
611
|
throw "Not amino_acids" if prot_seq != protein_fasta_entry.seq.to_s
|
582
612
|
|
613
|
+
peptides_covered_protein=[]
|
583
614
|
peptide_count=1
|
584
615
|
for peptide in peptides
|
616
|
+
|
585
617
|
pprob = peptide['nsp_adjusted_probability'].to_f
|
586
|
-
|
587
|
-
|
588
|
-
|
618
|
+
# puts peptide
|
619
|
+
# puts pprob
|
620
|
+
pep_seq = peptide['peptide_sequence']
|
621
|
+
|
622
|
+
if ( pprob >= tool.peptide_probability_threshold && (!peptides_covered_protein.include?(pep_seq) || tool.stack_charge_states))
|
623
|
+
|
624
|
+
dna_sequence=nil
|
625
|
+
if !protein_info.is_sixframe
|
626
|
+
throw "A genome is required if predicted transcripts are to be mapped" unless genomedb!=nil
|
627
|
+
dna_sequence = get_dna_sequence(protein_info,genomedb)
|
628
|
+
end
|
629
|
+
|
630
|
+
|
631
|
+
peptide_gff = generate_gff_for_peptide_mapped_to_protein(prot_seq,pep_seq,protein_info,prot_id,pprob,peptide_count,dna_sequence,genomedb)
|
632
|
+
|
633
|
+
unless (peptide_gff.length==0 || peptide_gff_is_duplicate(peptide_gff[0],peptides_covered_genome))
|
634
|
+
|
635
|
+
add_putative_nterm_to_gff(peptide_gff,pep_seq,prot_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
|
636
|
+
|
637
|
+
gff_db.records += peptide_gff
|
638
|
+
|
639
|
+
peptides_covered_protein << pep_seq unless tool.stack_charge_states
|
640
|
+
peptides_covered_genome[pep_seq] = peptide_gff[0].start
|
589
641
|
|
590
|
-
|
591
|
-
|
642
|
+
total_peptides += 1
|
643
|
+
peptide_count+=1
|
644
|
+
else
|
645
|
+
puts "Duplicate peptide #{peptide_gff[0]}"
|
646
|
+
end
|
647
|
+
# puts gff_db.records.last
|
592
648
|
end
|
593
649
|
end
|
594
650
|
else
|