protk 1.2.6.pre1 → 1.2.6.pre2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c2cb8ece1038575e27d4d2bb8d5eff3ff2367a33
4
+ data.tar.gz: db9cd97c0b87186e7a84c53ed43e132b25d1cd15
5
+ SHA512:
6
+ metadata.gz: df19012cdb42d14136a4f4682a3a45b2423b4b216bcdd8f959e0eaab5b7bcaa0546b5550d3a69accc1da4e158966150bb9f0e21d52c0ed2bf2c0397ca333195c
7
+ data.tar.gz: a08a7bd13e3027b21bdd24cf13491e6041b05a19f3f066468a96357b543332e40b4827356de2d4b81e1d7a67daa44a73b283b1948f7824f33c6cf738df46383d
data/README.md CHANGED
@@ -8,43 +8,38 @@ Protk is a wrapper for various proteomics tools. It aims to present a consistent
8
8
 
9
9
  ***
10
10
 
11
+ ## Table of Contents
11
12
 
13
+ * [Protk](#what-is-it?)
14
+ * [Installation](#installation)
15
+ * [Configuration](#configuration)
12
16
 
13
- ## Basic Installation
17
+
18
+
19
+ ## Installation
14
20
 
15
- Protk depends on ruby 1.9. The recommended way to install ruby and manage ruby gems is with rvm. Install rvm using this command.
16
-
17
- ```sh
18
- curl -L https://get.rvm.io | bash -s stable
21
+ The easiest installation method is to use rubygems. You might need to install the libxml2 package on your system first (e.g. libxml-dev on Ubuntu).
22
+
23
+ ``` shell
24
+ gem install protk
19
25
  ```
20
26
 
21
- Next install ruby and protk's dependencies
27
+ ## Configuration
22
28
 
23
- On OSX
29
+ By default protk will install tools and databases into `.protk` in your home directory. If this is not desirable you can change the protk root default by setting the environment variable `PROTK_INSTALL_DIR`. You can also avoid using a `.protk` directory altogether (see below).
24
30
 
25
- ```sh
26
- rvm install 1.9.3 --with-gcc=clang
27
- rvm use 1.9.3
28
- gem install protk
29
- protk_setup.rb package_manager
30
- protk_setup.rb system_packages
31
- protk_setup.rb all
32
- ```
33
- On Linux
34
-
35
- ```sh
36
- rvm install 1.9.3
37
- rvm use 1.9.3
38
- gem install protk
39
- sudo ~/.rvm/bin/rvm 1.9.3 do protk_setup.rb system_packages
40
- protk_setup all
31
+ Protk includes a setup tool to install various third party proteomics tools such as the TPP, OMSSA, MS-GF+, Proteowizard. If this tool is used it installs everything under `.protk/tools`. To perform such an installation use:
32
+
33
+ ```shell
34
+ protk_setup.rb tpp omssa blast msgfplus pwiz
41
35
  ```
42
36
 
43
- Instead off using protk_setup.rb all it might be preferable to only install some of the protk tool dependencies. 'all' is just an alias for the following full target list, any of which can be omitted with the consequence that tools depending on that component will not be available.
37
+ Alternatively, these tools may already be present on your system, or you may prefer to install them yourself. In that case simply ensure that all executables are included in your `$PATH`. Those executables will be used as a fallback if nothing is available under the `.protk` installation directory.
38
+
39
+
40
+ Instead of using `protk_setup.rb all` it might be preferable to only install some of the protk tool dependencies. 'all' is just an alias for the following full target list; any of these can be omitted, with the consequence that tools depending on that component will not be available.
41
+
44
42
 
45
- ```sh
46
- protk_setup.rb tpp omssa blast msgfplus pwiz openms galaxyenv
47
- ```
48
43
 
49
44
  ## Sequence databases
50
45
 
@@ -10,7 +10,6 @@ require 'bio'
10
10
  require 'protk/fastadb'
11
11
  require 'bio-blastxmlparser'
12
12
 
13
-
14
13
  tool=Tool.new([:explicit_output])
15
14
  tool.option_parser.banner = "Dump BLAST xml to tabular format.\n\nUsage: blastxml_to_table.rb blast.xml"
16
15
 
@@ -19,6 +18,16 @@ tool.option_parser.on( '-d filename','--database filename', 'Database used for B
19
18
  tool.options.database=file
20
19
  end
21
20
 
21
+ tool.options.gene2go=nil
22
+ tool.option_parser.on('--gene2go pathtogene2go','Path to gene2go database. If provided GO terms will be looked up') do |gene2go|
23
+ tool.options.gene2go=gene2go
24
+ end
25
+
26
+ tool.options.gitogeneid=nil
27
+ tool.option_parser.on('--gitogeneid gitogeneid.db','Path to GDBM formatted gi to geneid mapping database. If provided gene ids will be looked up') do |gitogeneid|
28
+ tool.options.gitogeneid=gitogeneid
29
+ end
30
+
22
31
  exit unless tool.check_options
23
32
 
24
33
  #require 'debugger';debugger
@@ -36,11 +45,43 @@ if tool.database
36
45
  $fastadb=FastaDB.new(tool.database)
37
46
  end
38
47
 
48
+ $gitogeneid = nil
49
+ if (tool.gitogeneid!=nil) && (File.exist? tool.gitogeneid)
50
+ require 'gdbm'
51
+ $gitogeneid = GDBM.new(tool.gitogeneid,flags=GDBM::READER)
52
+ end
53
+
54
+
55
+ $gene2go = nil
56
+ if (tool.gene2go!=nil) && (File.exist? tool.gene2go)
57
+ require 'gdbm'
58
+ $gene2go = GDBM.new(tool.gene2go,flags=GDBM::READER)
59
+ end
60
+
61
+ def gi_from_hit_id(hit_id)
62
+ gi_scan=hit_id.scan(/gi\|(\d+)/)
63
+ gi_scan.join("")
64
+ end
65
+
39
66
  def generate_line(hsp,hit,query,hit_seq=nil)
40
- line="#{query.query_id}\t#{hit.hit_id}\t#{hit.hit_num}\t#{hit.hit_def}\t#{hsp.hsp_num}\t#{hsp.bit_score}\t#{hsp.evalue}\t#{hsp.qseq}\t#{hsp.hseq}"
67
+
68
+ line="#{query.query_id}\t#{query.query_def}\t#{hit.hit_id}\t#{hit.hit_num}\t#{hit.hit_def}\t#{hit.accession}\t#{hsp.hsp_num}\t#{hsp.bit_score}\t#{hsp.evalue}\t#{hsp.qseq}\t#{hsp.hseq}"
41
69
  if hit_seq
42
70
  line << "\t#{hit_seq}"
43
71
  end
72
+ geneid=""
73
+ goterm=""
74
+ if $gitogeneid
75
+ geneid=$gitogeneid[gi_from_hit_id(hit.hit_id)]
76
+ goterm=$gene2go[geneid] if geneid!=nil && $gene2go
77
+ end
78
+
79
+
80
+ # throw "No geneid" if geneid==nil
81
+ line << "\t#{geneid}\t#{goterm}"
82
+ # require 'debugger';debugger
83
+ # puts gi_from_hit_id(hit.hit_id)
84
+ # puts $gene2go[gi_from_hit_id(hit.hit_id)]
44
85
  line<<"\n"
45
86
  line
46
87
  end
@@ -61,12 +102,18 @@ blast.each do |query|
61
102
  # if hit
62
103
  hit_seq=fetch_hit_seq(hit)
63
104
  hit.hsps.each do |hsp|
64
- out_file.write generate_line(hsp,hit,query,hit_seq)
105
+ out_line=generate_line(hsp,hit,query,hit_seq)
106
+
107
+ out_file.write out_line
65
108
  end
66
109
  # end
67
110
  end
68
111
  end
69
112
 
113
+
114
+ $gitogeneid.close if $gitogeneid!=nil
115
+ $gene2go.close if $gene2go!=nil
116
+
70
117
  #require 'debugger';debugger
71
118
 
72
119
  #puts "Hi"
data/bin/make_decoy.rb CHANGED
@@ -32,6 +32,16 @@ tool.option_parser.on('-P str','--prefix-string str','String to prepend to seque
32
32
  tool.options.prefix_string=str
33
33
  end
34
34
 
35
+ tool.options.reverse_only=false
36
+ tool.option_parser.on('--reverse-only','Just reverse sequences. Dont try to randomize') do
37
+ tool.options.reverse_only=true
38
+ end
39
+
40
+ tool.options.id_regex=".*?\\|(.*?)[ \\|]"
41
+ tool.option_parser.on('--id-regex regex','Regex for finding IDs. If reverse-only is used then this will be used to find ids and prepend with the decoy string. Default .*?\\|(.*?)[ \\|]') do regex
42
+ tool.options.id_regex=regex
43
+ end
44
+
35
45
  tool.options.append=false
36
46
  tool.option_parser.on('-A','--append','Append input sequences to the generated database') do
37
47
  tool.options.append=true
@@ -64,7 +74,23 @@ genv=Constants.new()
64
74
 
65
75
  decoys_tmp_file = Pathname.new(Tempfile.new("random").path).basename.to_s;
66
76
 
67
- Randomize.make_decoys input_file, db_length, decoys_tmp_file, tool.prefix_string
77
+ if (tool.reverse_only)
78
+ decoys_out = File.open(decoys_tmp_file,'w+')
79
+ Bio::FastaFormat.open(input_file).each do |seq|
80
+ id=nil
81
+ begin
82
+ id=seq.definition.scan(/#{id_regex}/)[0][0]
83
+ revdef=seq.definition.sub(id,"#{tool.prefix_string}#{id}")
84
+ decoys_out.write ">#{revdef}\n#{seq.aaseq}\n"
85
+ rescue
86
+ puts "Unable to parse id for #{seq.definition}. Skipping" if (id==nil)
87
+ end
88
+ end
89
+ decoys_out.close
90
+ else
91
+ Randomize.make_decoys input_file, db_length, decoys_tmp_file, tool.prefix_string
92
+ end
93
+
68
94
  cmd = "cat #{input_file} #{decoys_tmp_file} >> #{output_file}; rm #{decoys_tmp_file}" if ( tool.append )
69
95
 
70
96
  # Randomize.make_decoys raw_db_filename, db_length, decoys_filename, decoy_prefix
@@ -76,4 +102,6 @@ p cmd
76
102
  job_params= {:jobid => tool.jobid_from_filename(input_file) }
77
103
  job_params[:queue]="lowmem"
78
104
  job_params[:vmem]="900mb"
79
- tool.run(cmd,genv,job_params)
105
+ tool.run(cmd,genv,job_params)
106
+
107
+
data/bin/mascot_search.rb CHANGED
@@ -22,7 +22,7 @@ def login(mascot_cgi,username,password)
22
22
  authdict[:savecookie]="1"
23
23
 
24
24
  p "Logging in to #{mascot_cgi}/login.pl"
25
- p authdict
25
+
26
26
  response = RestClient.post "#{mascot_cgi}/login.pl", authdict
27
27
 
28
28
  cookie = response.cookies
@@ -40,6 +40,8 @@ def download_datfile(mascot_cgi,results_date,results_file,explicit_output,openur
40
40
  output_path="#{results_file}"
41
41
  end
42
42
 
43
+ puts "Writing output to #{output_path}"
44
+
43
45
  require 'open-uri'
44
46
  open("#{output_path}", 'wb') do |file|
45
47
  file << open("#{get_url}","Cookie"=>openurlcookie).read
@@ -144,6 +146,12 @@ search_tool.option_parser.on( '--export format', 'Save results in a specified fo
144
146
  search_tool.options.export_format=format
145
147
  end
146
148
 
149
+ search_tool.options.download_only=nil
150
+ search_tool.option_parser.on( '--download-only path', 'Specify a path to an existing results file for download eg(20131113/F227185.dat)' ) do |path|
151
+ search_tool.options.download_only=path
152
+ end
153
+
154
+
147
155
  search_tool.options.timeout=200
148
156
  search_tool.option_parser.on( '--timeout seconds', 'Timeout for sending data file to mascot in seconds' ) do |seconds|
149
157
  search_tool.options.timeout=seconds.to_i
@@ -151,8 +159,9 @@ end
151
159
 
152
160
  exit unless search_tool.check_options
153
161
 
154
- if ( ARGV[0].nil? )
162
+ if ( ARGV[0].nil? && search_tool.download_only.nil?)
155
163
  puts "You must supply an input file"
164
+ puts search_tool.download_only
156
165
  puts search_tool.option_parser
157
166
  exit
158
167
  end
@@ -167,7 +176,6 @@ unless ( mascot_cgi =~ /^http[s]?:\/\//)
167
176
  end
168
177
 
169
178
  RestClient.proxy=search_tool.httpproxy if search_tool.httpproxy
170
- $genv.log("Var mods #{search_tool.var_mods} and fixed #{search_tool.fix_mods}",:info)
171
179
 
172
180
  cookie=""
173
181
  openurlcookie=""
@@ -178,36 +186,47 @@ if ( search_tool.use_security)
178
186
  openurlcookie = "MASCOT_SESSION=#{cookie['MASCOT_SESSION']}; MASCOT_USERID=#{cookie['MASCOT_USERID']}; MASCOT_USERNAME=#{cookie['MASCOT_USERNAME']}"
179
187
  end
180
188
 
181
- postdict = search_params_dictionary search_tool, ARGV[0]
182
- $genv.log("Sending #{postdict}",:info)
189
+ if ( !search_tool.download_only.nil?)
190
+ parts=search_tool.download_only.split("/")
191
+ throw "Must provide a path of the format date/filename" unless parts.length==2
192
+ results_date=parts[0]
193
+ results_file=parts[1]
194
+ download_datfile mascot_cgi, results_date, results_file,search_tool.explicit_output,openurlcookie
195
+ else
196
+ #$genv.log("Var mods #{search_tool.var_mods} and fixed #{search_tool.fix_mods}",:info)
183
197
 
184
- #site = RestClient::Resource.new(mascot_cgi, timeout=300)
185
- #search_response=site['/nph-mascot.exe?1'].post , postdict, {:cookies=>cookie}
198
+ postdict = search_params_dictionary search_tool, ARGV[0]
199
+ $genv.log("Sending #{postdict}",:info)
186
200
 
187
- search_response=RestClient::Request.execute(:method => :post, :url => "#{mascot_cgi}/nph-mascot.exe?1", :payload => postdict,:headers=>{:cookies=>cookie},:timeout => search_tool.options.timeout, :open_timeout => 10)
201
+ #site = RestClient::Resource.new(mascot_cgi, timeout=300)
202
+ #search_response=site['/nph-mascot.exe?1'].post , postdict, {:cookies=>cookie}
188
203
 
204
+ search_response=RestClient::Request.execute(:method => :post, :url => "#{mascot_cgi}/nph-mascot.exe?1", :payload => postdict,:headers=>{:cookies=>cookie},:timeout => search_tool.options.timeout, :open_timeout => 10)
189
205
 
190
- #search_response=RestClient.post "#{mascot_cgi}/nph-mascot.exe?1", postdict, {:cookies=>cookie}
191
206
 
192
- $genv.log("Mascot search response was #{search_response}",:info)
207
+ #search_response=RestClient.post "#{mascot_cgi}/nph-mascot.exe?1", postdict, {:cookies=>cookie}
193
208
 
194
- # Look for an error if there is one
195
- error_result= /Sorry, your search could not be performed(.*)/.match(search_response)
196
- if ( error_result != nil )
197
- puts error_result[0]
198
- $genv.log("Mascot search failed with response #{search_response}",:warn)
199
- throw "Mascot search failed with response #{search_response}"
200
- elsif (search_tool.export_format=="mascotdat")
201
- # Search for the location of the mascot data file in the response
202
- results=/master_results_?2?\.pl\?file=\.*\/data\/(.*)\/(.+\.dat)/.match(search_response)
203
- results_date=results[1]
204
- results_file=results[2]
209
+ $genv.log("Mascot search response was #{search_response}",:info)
205
210
 
206
- download_datfile mascot_cgi, results_date, results_file,search_tool.explicit_output,openurlcookie
207
- else
208
- results=/master_results_?2?\.pl\?file=(\.*\/data\/.*\/.+\.dat)/.match(search_response)
209
- results_file = results[1]
210
- export_results mascot_cgi,cookie,results_file,search_tool.export_format, openurlcookie
211
- # export_results mascot_cgi,cookie,results_file,search_tool.export_format
211
+ # Look for an error if there is one
212
+ error_result= /Sorry, your search could not be performed(.*)/.match(search_response)
213
+ if ( error_result != nil )
214
+ puts error_result[0]
215
+ $genv.log("Mascot search failed with response #{search_response}",:warn)
216
+ throw "Mascot search failed with response #{search_response}"
217
+ elsif (search_tool.export_format=="mascotdat")
218
+ # Search for the location of the mascot data file in the response
219
+ results=/master_results_?2?\.pl\?file=\.*\/data\/(.*)\/(.+\.dat)/.match(search_response)
220
+ results_date=results[1]
221
+ results_file=results[2]
222
+
223
+ download_datfile mascot_cgi, results_date, results_file,search_tool.explicit_output,openurlcookie
224
+ else
225
+ results=/master_results_?2?\.pl\?file=(\.*\/data\/.*\/.+\.dat)/.match(search_response)
226
+ results_file = results[1]
227
+ export_results mascot_cgi,cookie,results_file,search_tool.export_format, openurlcookie
228
+ # export_results mascot_cgi,cookie,results_file,search_tool.export_format
229
+ end
212
230
  end
213
231
 
232
+
@@ -5,6 +5,7 @@
5
5
  #
6
6
  # Runs an MS/MS search using the MSGFPlus search engine
7
7
  #
8
+
8
9
  $VERBOSE=nil
9
10
  require 'protk/constants'
10
11
  require 'protk/command_runner'
@@ -18,7 +19,7 @@ input_stager = nil
18
19
  # Setup specific command-line options for this tool. Other options are inherited from SearchTool
19
20
  #
20
21
  search_tool=SearchTool.new([:background,:database,:explicit_output,:over_write,:enzyme,
21
- :modifications,:instrument,:mass_tolerance_units,:mass_tolerance,:missed_cleavages])
22
+ :modifications,:instrument,:mass_tolerance_units,:mass_tolerance,:cleavage_semi])
22
23
 
23
24
  search_tool.jobid_prefix="p"
24
25
  search_tool.option_parser.banner = "Run an MSGFPlus msms search on a set of msms spectrum input files.\n\nUsage: msgfplus_search.rb [options] file1.mzML file2.mzML ..."
@@ -164,10 +165,10 @@ ARGV.each do |filename|
164
165
  # The basic command
165
166
  #
166
167
  cmd= "#{make_msgfdb_cmd} java -Xmx#{search_tool.java_mem} -jar #{msgf_bin} -d #{current_db} -s #{input_path} -o #{mzid_output_path} "
167
- #Missed cleavages
168
+
169
+ #Semi tryptic peptides
168
170
  #
169
- throw "Maximum value for missed cleavages is 2" if ( search_tool.missed_cleavages.to_i > 2)
170
- cmd << " -ntt #{search_tool.missed_cleavages}"
171
+ cmd << " -ntt 1" if ( search_tool.cleavage_semi )
171
172
 
172
173
  # Precursor tolerance
173
174
  #
@@ -235,6 +236,8 @@ ARGV.each do |filename|
235
236
  cmd << "; cp #{mzid_output_path} #{output_path}"
236
237
  else
237
238
  #if search_tool.explicit_output
239
+ cmd << ";ruby -pi.bak -e \"gsub('post=\\\"?','post=\\\"X')\" #{mzid_output_path}"
240
+ cmd << ";ruby -pi.bak -e \"gsub('pre=\\\"?','pre=\\\"X')\" #{mzid_output_path}"
238
241
  cmd << "; #{genv.idconvert} #{mzid_output_path} --pepXML -o #{Pathname.new(mzid_output_path).dirname}"
239
242
  #Then copy the pepxml to the final output path
240
243
  cmd << "; mv #{mzid_output_path.chomp('.mzid')}.pepXML #{output_path}"
@@ -92,6 +92,11 @@ prophet_tool.option_parser.on( '--no-decoy', 'Don\'t use decoy sequences to pin
92
92
  prophet_tool.options.no_decoys = true
93
93
  end
94
94
 
95
+ prophet_tool.options.experiment_label=nil
96
+ prophet_tool.option_parser.on('--experiment-label label','used to commonly label all spectra belonging to one experiment (required by iProphet)') do |label|
97
+ prophet_tool.options.experiment_label = label
98
+ end
99
+
95
100
  prophet_tool.options.override_database=nil
96
101
  prophet_tool.option_parser.on( '--override-database database', 'Manually specify database') do |database|
97
102
  prophet_tool.options.override_database = database
@@ -212,6 +217,10 @@ def generate_command(genv,prophet_tool,inputs,output,database,engine)
212
217
  cmd << " -I2 -T3 -I4 -I5 -I6 -I7 "
213
218
  end
214
219
 
220
+ if prophet_tool.experiment_label!=nil
221
+ cmd << " -E#{prophet_tool.experiment_label} "
222
+ end
223
+
215
224
  unless prophet_tool.no_decoys
216
225
 
217
226
  if engine=="omssa" || engine=="phenyx"
@@ -29,6 +29,16 @@ tool.option_parser.on( '-d filename','--database filename', 'Database used for m
29
29
  tool.options.database=file
30
30
  end
31
31
 
32
+ tool.options.protein_find=nil
33
+ tool.option_parser.on( '-f term','--find term', 'Restrict output to proteins whose name matches the specified string' ) do |term|
34
+ tool.options.protein_find=term
35
+ end
36
+
37
+ tool.options.nterm_minlen=7
38
+ tool.option_parser.on( '-n len','--nterm-min-len len', 'Only include inferred N-terminal sequences if longer than len' ) do |len|
39
+ tool.options.nterm_minlen=len
40
+ end
41
+
32
42
  tool.options.genome=nil
33
43
  tool.option_parser.on( '-g filename','--genome filename', 'Nucleotide sequences for scaffolds (Fasta Format)' ) do |file|
34
44
  tool.options.genome=file
@@ -39,11 +49,26 @@ tool.option_parser.on('--skip-index','Don\'t index database (Index should alread
39
49
  tool.options.skip_fasta_indexing=true
40
50
  end
41
51
 
52
+ tool.options.stack_charge_states=false
53
+ tool.option_parser.on('--stack-charge-states','Different peptide charge states get separate gff entries') do
54
+ tool.options.stack_charge_states=true
55
+ end
56
+
57
+ tool.options.collapse_redundant_proteins=false
58
+ tool.option_parser.on('--collapse-redundant-proteins','Proteins that cover genomic regions already covered will be skipped') do
59
+ tool.options.collapse_redundant_proteins=true
60
+ end
61
+
42
62
  tool.options.peptide_probability_threshold=0.95
43
63
  tool.option_parser.on('--threshold prob','Peptide Probability Threshold (Default 0.95)') do |thresh|
44
64
  tool.options.peptide_probability_threshold=thresh.to_f
45
65
  end
46
66
 
67
+ tool.options.protein_probability_threshold=0.99
68
+ tool.option_parser.on('--prot-threshold prob','Protein Probability Threshold (Default 0.99)') do |thresh|
69
+ tool.options.protein_probability_threshold=thresh.to_f
70
+ end
71
+
47
72
  exit unless tool.check_options [:protxml,:database]
48
73
 
49
74
  gff_out_file="peptides.gff"
@@ -94,7 +119,7 @@ def protein_names(protein_node)
94
119
  end
95
120
 
96
121
  def peptide_nodes(protein_node)
97
- protein_node.find('protxml:peptide','protxml:http://regis-web.systemsbiology.net/protXML')
122
+ return protein_node.find('protxml:peptide','protxml:http://regis-web.systemsbiology.net/protXML')
98
123
  end
99
124
 
100
125
 
@@ -210,41 +235,11 @@ def peptide_is_in_sixframe(pep_seq,gene_seq)
210
235
  return false
211
236
  end
212
237
 
213
- # gene_seq should already have been reverse_complemented if on reverse strand
214
- def get_peptide_coordinates_by_alignment(prot_seq,pep_seq,protein_info,gene_seq)
215
- if ( peptide_is_in_sixframe(pep_seq,gene_seq))
216
- return nil
217
- else
218
- puts "Warning. Actually found a gap #{protein_info.fasta_id}"
219
- aln=GappedAligner.new().align(pep_seq,gene_seq)
220
- unless aln.gaps.length==1
221
- puts "More than one intron.#{aln}"
222
- require 'debugger';debugger
223
- end
224
- pep_coords = []
225
- frags = aln.fragments
226
- frags.reverse! if protein_info.strand=='-'
227
-
228
- frags.each { |frag|
229
- if protein_info.strand=='+'
230
- frag_genomic_start = protein_info.start + frag[0]
231
- frag_genomic_end = protein_info.start + frag[1]
232
- else
233
- frag_genomic_start = protein_info.end - frag[1]
234
- frag_genomic_end = protein_info.end - frag[0]
235
- end
236
- pep_coords << frag_genomic_start
237
- pep_coords << frag_genomic_end
238
- }
239
-
240
- return [pep_coords]
241
- end
242
- end
243
-
244
238
  def fragment_coords_from_protein_coords(pepstart,pepend,gene_start,gene_end,coding_sequences)
245
239
 
246
240
  sorted_cds = coding_sequences.sort { |a, b| a[0] <=> b[0] }
247
241
 
242
+
248
243
  # Assume positive strand
249
244
  pi_start=pepstart*3+gene_start-1
250
245
  pi_end=pepend*3+gene_start-1
@@ -271,6 +266,13 @@ def fragment_coords_from_protein_coords(pepstart,pepend,gene_start,gene_end,codi
271
266
  end
272
267
  else
273
268
  if finding_start
269
+
270
+ if ( pi_end <= cds_end) #Whole peptide contained in a single exon
271
+ fragments << [p_i+1,pi_end]
272
+ break;
273
+ end
274
+
275
+
274
276
  fragments << [p_i+1,(cds_end)]
275
277
  next_coords = sorted_cds[i+1]
276
278
  intron_offset = ((next_coords[0]-cds_end)-1)
@@ -290,9 +292,10 @@ end
290
292
 
291
293
  # gene_seq should already have been reverse_complemented if on reverse strand
292
294
  def get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
293
- if ( peptide_is_in_sixframe(pep_seq,gene_seq))
294
- return nil
295
- else
295
+ # if ( peptide_is_in_sixframe(pep_seq,gene_seq))
296
+ # Peptide is in 6-frame but on a predicted transcript
297
+ # return nil
298
+ # else
296
299
 
297
300
  # puts "Found a gap #{protein_info.fasta_id}"
298
301
  if protein_info.strand=='-'
@@ -315,7 +318,7 @@ def get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,g
315
318
  pep_end_i = pep_start_i+pep_seq.length
316
319
 
317
320
  return fragment_coords_from_protein_coords(pep_start_i,pep_end_i,protein_info.start,protein_info.end,protein_info.coding_sequences)
318
- end
321
+ # end
319
322
  end
320
323
 
321
324
  def get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)
@@ -421,7 +424,7 @@ def get_start_codon_coords_for_peptide(peptide_genomic_start,peptide_genomic_end
421
424
  end
422
425
  return nil unless is_tryptic
423
426
 
424
- start_codon_coord = (strand=='+') ? peptide_genomic_start : peptide_genomic_end-1
427
+ start_codon_coord = (strand=='+') ? peptide_genomic_start : peptide_genomic_end-2
425
428
  # require 'debugger';debugger
426
429
  return [start_codon_coord,start_codon_coord+2]
427
430
  else
@@ -442,32 +445,38 @@ def get_cterm_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,pepti
442
445
  end
443
446
 
444
447
 
445
- def get_signal_peptide_for_peptide(peptide_seq,protein_seq)
448
+ def get_nterm_peptide_for_peptide(peptide_seq,protein_seq)
446
449
  pi=protein_seq.index(peptide_seq)
447
450
  if ( pi>0 && (protein_seq[pi-1]!='K' && protein_seq[pi-1]!='R' && protein_seq[pi]!='M') )
451
+ # Since trypsin sometimes cleaves before P (ie breaking the rule)
452
+ # we don't check for it and assume those cases are real tryptic termini
448
453
  reverse_leader_seq=protein_seq[0..pi].reverse
449
454
  mi=reverse_leader_seq.index('M')
450
455
 
451
456
  if ( mi==nil )
452
- puts "No methionine found ahead of peptide sequence. Unable to determine signal peptide sequence"
457
+ puts "No methionine found ahead of peptide sequence. Unable to determine n-term sequence"
453
458
  return nil
454
459
  end
455
460
 
456
461
  mi=pi-mi
457
462
 
458
- return protein_seq[mi..(pi-1)]
463
+ ntermseq=protein_seq[mi..(pi-1)]
464
+
465
+ # if ( ntermseq.length < minlen )
466
+ # return nil
467
+ # end
468
+
469
+ # $STDOUT.write protein_seq[mi..(pi+peptide_seq.length-1)]
470
+ # require 'debugger';debugger
471
+ full_seq_with_annotations = "#{ntermseq}(cleaved)#{protein_seq[(pi..(pi+peptide_seq.length-1))]}"
472
+
473
+ return full_seq_with_annotations
459
474
  else
460
475
  return nil
461
476
  end
462
477
  end
463
478
 
464
- def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_info,prot_id,peptide_prob,peptide_count,genomedb=nil)
465
-
466
- dna_sequence=nil
467
- if !protein_info.is_sixframe
468
- throw "A genome is required if predicted transcripts are to be mapped" unless genomedb!=nil
469
- dna_sequence = get_dna_sequence(protein_info,genomedb)
470
- end
479
+ def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_info,prot_id,peptide_prob,peptide_count,dna_sequence,genomedb=nil)
471
480
 
472
481
  prot_seq = protein_seq
473
482
  pep_seq = peptide_seq
@@ -517,24 +526,37 @@ def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_i
517
526
  gff_records+=[start_codon_gff]
518
527
  end
519
528
 
520
- signal_peptide = get_signal_peptide_for_peptide(peptide_seq,protein_seq)
521
- if signal_peptide
522
- # require 'debugger';debugger
529
+ end
530
+ # puts gff_records
523
531
 
524
- signal_peptide_coords=get_peptide_coordinates(prot_seq,signal_peptide,protein_info,dna_sequence)
525
- if signal_peptide_coords
526
- signal_peptide_coords.each do |spcoords|
527
- signal_peptide_gff = generate_fragment_gffs_for_coords(spcoords,protein_info,pep_id,signal_peptide,genomedb,"signalpeptide")
532
+ gff_records
533
+ end
534
+
535
+ def add_putative_nterm_to_gff(gff_records,peptide_seq,protein_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
536
+ pep_id = "#{prot_id}.p#{peptide_count.to_s}"
537
+ signal_peptide = get_nterm_peptide_for_peptide(peptide_seq,protein_seq)
538
+ if signal_peptide
539
+ $stdout.write "Nterm\t#{signal_peptide}\t#{protein_info.name}\t#{protein_seq}\n"
540
+ raw_signal_peptide=signal_peptide.sub(/\(cleaved\)/,"")
541
+ # Get raw signal_peptide sequence
542
+
543
+ signal_peptide_coords=get_peptide_coordinates(protein_seq,raw_signal_peptide,protein_info,dna_sequence)
544
+ if signal_peptide_coords
545
+ signal_peptide_coords.each do |spcoords|
546
+ signal_peptide_gff = generate_fragment_gffs_for_coords(spcoords,protein_info,pep_id,raw_signal_peptide,genomedb,"signalpeptide")
528
547
  gff_records += signal_peptide_gff
529
- end
530
548
  end
531
549
  end
532
-
533
-
534
550
  end
535
- puts gff_records
551
+ end
536
552
 
537
- gff_records
553
+ def peptide_gff_is_duplicate(peptide_gff,peptides_covered_genome)
554
+ nameindex = peptide_gff.attributes.index {|obj| obj[0]=="Name" }
555
+ pep_seq = peptide_gff.attributes[nameindex][1]
556
+ existing = peptides_covered_genome[pep_seq]
557
+ return true if existing==peptide_gff.start
558
+
559
+ return false
538
560
  end
539
561
 
540
562
  proteins = parse_proteins(tool.protxml)
@@ -552,15 +574,23 @@ peptide_count = 0
552
574
  protein_count = 0
553
575
  total_peptides = 0
554
576
 
577
+ peptides_covered_genome={}
578
+
555
579
  for prot in proteins
556
580
  prot_prob = prot['probability']
557
- if ( prot_prob.to_f < tool.peptide_probability_threshold )
581
+ if ( prot_prob.to_f < tool.protein_probability_threshold )
558
582
  next
559
583
  end
560
584
 
561
585
  # Gets identifiers of all proteins (includeing indistinguishable ones)
562
586
  prot_names=protein_names(prot)
563
587
 
588
+
589
+ if tool.protein_find!=nil
590
+ prot_names=prot_names.keep_if { |pname| pname.include? tool.protein_find }
591
+ end
592
+
593
+
564
594
  peptides=peptide_nodes(prot)
565
595
  entries_covered=[]
566
596
  for protein_name in prot_names
@@ -571,7 +601,7 @@ for prot in proteins
571
601
  protein_fasta_entry = get_fasta_record(protein_name,fastadb)
572
602
  protein_info = cds_info_from_fasta(protein_fasta_entry)
573
603
 
574
- if is_new_genome_location(protein_info,entries_covered)
604
+ unless (tool.collapse_redundant_proteins && !is_new_genome_location(protein_info,entries_covered) )
575
605
 
576
606
  protein_gff = generate_protein_gff(protein_name,protein_info,prot_prob,protein_count)
577
607
 
@@ -580,15 +610,41 @@ for prot in proteins
580
610
  prot_seq = protein_fasta_entry.aaseq.to_s
581
611
  throw "Not amino_acids" if prot_seq != protein_fasta_entry.seq.to_s
582
612
 
613
+ peptides_covered_protein=[]
583
614
  peptide_count=1
584
615
  for peptide in peptides
616
+
585
617
  pprob = peptide['nsp_adjusted_probability'].to_f
586
- if ( pprob >= tool.peptide_probability_threshold )
587
- total_peptides += 1
588
- pep_seq = peptide['peptide_sequence']
618
+ # puts peptide
619
+ # puts pprob
620
+ pep_seq = peptide['peptide_sequence']
621
+
622
+ if ( pprob >= tool.peptide_probability_threshold && (!peptides_covered_protein.include?(pep_seq) || tool.stack_charge_states))
623
+
624
+ dna_sequence=nil
625
+ if !protein_info.is_sixframe
626
+ throw "A genome is required if predicted transcripts are to be mapped" unless genomedb!=nil
627
+ dna_sequence = get_dna_sequence(protein_info,genomedb)
628
+ end
629
+
630
+
631
+ peptide_gff = generate_gff_for_peptide_mapped_to_protein(prot_seq,pep_seq,protein_info,prot_id,pprob,peptide_count,dna_sequence,genomedb)
632
+
633
+ unless (peptide_gff.length==0 || peptide_gff_is_duplicate(peptide_gff[0],peptides_covered_genome))
634
+
635
+ add_putative_nterm_to_gff(peptide_gff,pep_seq,prot_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
636
+
637
+ gff_db.records += peptide_gff
638
+
639
+ peptides_covered_protein << pep_seq unless tool.stack_charge_states
640
+ peptides_covered_genome[pep_seq] = peptide_gff[0].start
589
641
 
590
- gff_db.records += generate_gff_for_peptide_mapped_to_protein(prot_seq,pep_seq,protein_info,prot_id,pprob,peptide_count,genomedb)
591
- peptide_count+=1
642
+ total_peptides += 1
643
+ peptide_count+=1
644
+ else
645
+ puts "Duplicate peptide #{peptide_gff[0]}"
646
+ end
647
+ # puts gff_db.records.last
592
648
  end
593
649
  end
594
650
  else