protk 1.3.1.pre3 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44)
  1. checksums.yaml +4 -4
  2. data/README.md +21 -19
  3. data/bin/add_retention_times.rb +1 -1
  4. data/bin/interprophet.rb +16 -5
  5. data/bin/make_decoy.rb +1 -1
  6. data/bin/manage_db.rb +1 -1
  7. data/bin/mascot_search.rb +2 -2
  8. data/bin/mascot_to_pepxml.rb +1 -1
  9. data/bin/msgfplus_search.rb +26 -9
  10. data/bin/omssa_search.rb +1 -1
  11. data/bin/peptide_prophet.rb +57 -20
  12. data/bin/pepxml_to_table.rb +15 -2
  13. data/bin/protein_prophet.rb +41 -1
  14. data/bin/protk_setup.rb +2 -2
  15. data/bin/protxml_to_gff.rb +50 -42
  16. data/bin/protxml_to_psql.rb +1 -1
  17. data/bin/protxml_to_table.rb +16 -3
  18. data/bin/repair_run_summary.rb +1 -1
  19. data/bin/sixframe.rb +2 -2
  20. data/bin/swissprot_to_table.rb +1 -1
  21. data/bin/tandem_search.rb +1 -1
  22. data/bin/tandem_to_pepxml.rb +1 -1
  23. data/lib/protk/constants.rb +2 -1
  24. data/lib/protk/convert_util.rb +1 -1
  25. data/lib/protk/data/tandem-style.css +349 -0
  26. data/lib/protk/data/tandem-style.xsl +264 -0
  27. data/lib/protk/data/tandem_gpm_defaults.xml +3 -3
  28. data/lib/protk/data/tandem_isb_kscore_defaults.xml +2 -0
  29. data/lib/protk/data/tandem_isb_native_defaults.xml +3 -0
  30. data/lib/protk/data/tandem_params.xml +0 -8
  31. data/lib/protk/fastadb.rb +1 -1
  32. data/lib/protk/galaxy_stager.rb +14 -3
  33. data/lib/protk/galaxy_util.rb +39 -31
  34. data/lib/protk/gffdb.rb +6 -1
  35. data/lib/protk/manage_db_rakefile.rake +1 -1
  36. data/lib/protk/manage_db_tool.rb +1 -1
  37. data/lib/protk/pepxml.rb +159 -7
  38. data/lib/protk/plasmodb.rb +1 -1
  39. data/lib/protk/prophet_tool.rb +20 -52
  40. data/lib/protk/setup_rakefile.rake +18 -11
  41. data/lib/protk/tandem_search_tool.rb +20 -7
  42. data/lib/protk/tool.rb +1 -1
  43. data/lib/protk/uniprot_mapper.rb +1 -1
  44. metadata +10 -14
@@ -13,7 +13,7 @@ dbname=ARGV[0]
13
13
 
14
14
  # Load database spec file
15
15
  #
16
- $genv=Constants.new()
16
+ $genv=Constants.instance()
17
17
  dbdir="#{$genv.protein_database_root}/#{dbname}"
18
18
 
19
19
  dbspec_file="#{dbdir}/.protkdb.yaml"
@@ -12,7 +12,7 @@ require 'protk/tool'
12
12
  class ManageDBTool < Tool
13
13
 
14
14
  def add dbspec, dbname
15
- genv=Constants.new()
15
+ genv=Constants.instance()
16
16
  dbdir="#{genv.protein_database_root}/#{dbname}"
17
17
  %x[mkdir -p #{dbdir}]
18
18
 
@@ -1,22 +1,174 @@
1
1
  require 'rubygems'
2
- require 'rexml/document'
3
- require 'rexml/xpath'
2
+ require 'libxml'
3
+
4
+ include LibXML
5
+
6
+ # require 'rexml/document'
7
+ # require 'rexml/xpath'
4
8
 
5
9
  class PepXML
10
+
11
+ attr_accessor :file_name
12
+
6
13
  def initialize(file_name)
7
- @doc=REXML::Document.new(File.new(file_name))
14
+ @file_name=file_name
15
+
16
+ XML::Error.set_handler(&XML::Error::QUIET_HANDLER)
17
+ pepxml_parser=XML::Parser.file("#{file_name}")
18
+
19
+ @pepxml_ns_prefix="xmlns:"
20
+ @pepxml_ns="xmlns:http://regis-web.systemsbiology.net/pepXML"
21
+ @pepxml_doc=pepxml_parser.parse
22
+ if not @pepxml_doc.root.namespaces.default
23
+ @pepxml_ns_prefix=""
24
+ @pepxml_ns=nil
25
+ end
26
+ end
27
+
28
+
29
+
30
+ # Obtain the database name from the given input file
31
+ #
32
+ def extract_db()
33
+ reader = XML::Reader.file(self.file_name)
34
+ throw "Failed to open xml file #{file_name}" unless reader!=nil
35
+
36
+ while(reader.read)
37
+ # For pep.xml files
38
+ #
39
+ if ( reader.name == "search_database" )
40
+ dbnode=reader.expand
41
+ dbvalue=dbnode['local_path']
42
+ reader.close
43
+ return dbvalue
44
+ end
45
+
46
+ # For prot.xml files
47
+ #
48
+ if ( reader.name == "protein_summary_header" )
49
+ dbnode=reader.expand
50
+ dbvalue=dbnode['reference_database']
51
+ reader.close
52
+ return dbvalue
53
+ end
54
+
55
+
56
+
57
+ end
58
+
59
+ end
60
+
61
+
62
+
63
+ # Obtain the search engine name from the input file
64
+ # The name of the engine is returned in lowercase and should contain no spaces
65
+ # Names of common engines are searched for and extracted in simplified form if possible
66
+ #
67
+ def extract_engine()
68
+ reader = XML::Reader.file(self.file_name)
69
+ throw "Failed to open xml file #{file_name}" unless reader!=nil
70
+
71
+ while(reader.read)
72
+ if ( reader.name == "search_summary" )
73
+ dbnode=reader.expand
74
+ dbvalue=dbnode['search_engine']
75
+ reader.close
76
+ engine_name=dbvalue.gsub(/ /,"_")
77
+ engine_name=engine_name.gsub(/\(/,"")
78
+ engine_name=engine_name.gsub(/\)/,"")
79
+ engine_name=engine_name.gsub(/\!/,"")
80
+ return engine_name.downcase
81
+ end
82
+ end
83
+ end
84
+
85
+
86
+ def extract_enzyme()
87
+ reader = XML::Reader.file(self.file_name)
88
+ throw "Failed to open xml file #{file_name}" unless reader!=nil
89
+
90
+ while(reader.read)
91
+ if ( reader.name == "sample_enzyme" )
92
+ dbnode=reader.expand
93
+ dbvalue=dbnode['name']
94
+ reader.close
95
+ return dbvalue.downcase
96
+ end
97
+ end
98
+ end
99
+
100
+
101
+
102
+ def type_from_base_name(basename)
103
+ # A common error is for tools to include the extension in the base_name attribute.
104
+ # We exploit this to guess the type
105
+ ext_guess=""
106
+ case basename
107
+ when /.mgf$/
108
+ ext_guess="mgf"
109
+ when /.mzML$/
110
+ ext_guess="mzML"
111
+ when /.mzXML$/
112
+ ext_guess="mzXML"
113
+ else
114
+ ext_guess=""
115
+ end
116
+ ext_guess
117
+ end
118
+
119
+ def type_from_summary_attributes(atts)
120
+ if is_valid_type(atts["raw_data_type"])
121
+ return atts["raw_data_type"]
122
+ end
123
+
124
+ if is_valid_type(atts["raw_data"])
125
+ return atts["raw_data"]
126
+ end
127
+ return ""
128
+ end
129
+
130
+ def is_valid_type(type)
131
+ case type
132
+ when /^mgf$/i
133
+ return true
134
+ when /^mzML$/i
135
+ return true
136
+ when /^mzXML$/i
137
+ return true
138
+ else
139
+ return false
140
+ end
8
141
  end
9
142
 
10
- def find_runs()
143
+
144
+ # TODO: Make this faster and more memory efficient by using XML::Reader as in the functions above
145
+ #
146
+ def find_runs()
147
+
148
+
149
+ run_summaries = @pepxml_doc.find("//#{@pepxml_ns_prefix}msms_run_summary", @pepxml_ns)
150
+
11
151
  runs = {}
12
- REXML::XPath.each(@doc,"//msms_run_summary") do |summary|
152
+ run_summaries.each do |summary|
13
153
  base_name = summary.attributes["base_name"]
14
154
  if not runs.has_key?(base_name)
15
- runs[base_name] = {:base_name => summary.attributes["base_name"],
16
- :type => summary.attributes["raw_data"]}
155
+ bn = summary.attributes["base_name"]
156
+
157
+ runs[base_name] = {:base_name => summary.attributes["base_name"]}
158
+
159
+ if is_valid_type(type_from_summary_attributes(summary.attributes))
160
+ runs[base_name][:type] = type_from_summary_attributes(summary.attributes)
161
+ elsif is_valid_type(type_from_base_name(bn))
162
+ runs[base_name][:type] = type_from_base_name(bn)
163
+ else
164
+ runs[base_name][:type] = "mzML" # Same guess as peptide prophet makes
165
+ end
166
+
17
167
  end
18
168
  end
19
169
  runs
20
170
  end
171
+
172
+
21
173
 
22
174
  end
@@ -11,7 +11,7 @@ class PlasmoDB
11
11
  if ( env!=nil)
12
12
  @genv=env
13
13
  else
14
- @genv=Constants.new
14
+ @genv=Constants.instance
15
15
  end
16
16
 
17
17
  database_file="#{@genv.protein_database_root}/#{@genv.plasmodb_annotation_database}/raw.txt"
@@ -21,63 +21,31 @@ class ProphetTool < SearchTool
21
21
 
22
22
  super(option_support)
23
23
 
24
- end
25
-
26
-
27
-
28
- # Obtain the database name from the given input file
29
- #
30
- def extract_db(file_name)
31
- reader = XML::Reader.file(file_name)
32
- throw "Failed to open xml file #{file_name}" unless reader!=nil
33
-
34
- while(reader.read)
35
- # For pep.xml files
36
- #
37
- if ( reader.name == "search_database" )
38
- dbnode=reader.expand
39
- dbvalue=dbnode['local_path']
40
- reader.close
41
- return dbvalue
42
- end
43
-
44
- # For prot.xml files
45
- #
46
- if ( reader.name == "protein_summary_header" )
47
- dbnode=reader.expand
48
- dbvalue=dbnode['reference_database']
49
- reader.close
50
- return dbvalue
51
- end
52
-
53
-
54
-
24
+ if ( option_support.include? :probability_threshold )
25
+ add_value_option(:probability_threshold,0.05,['--p-thresh val', 'Probability threshold below which PSMs are discarded'])
55
26
  end
56
27
 
57
28
  end
58
-
59
-
60
-
61
- # Obtain the search engine name from the input file
62
- # The name of the engine is returned in lowercase and should contain no spaces
63
- # Names of common engines are searched for and extracted in simplified form if possible
29
+
30
+ # TODO: Deal with multiple enzyme combos
64
31
  #
65
- def extract_engine(file_name)
66
- reader = XML::Reader.file(file_name)
67
- throw "Failed to open xml file #{file_name}" unless reader!=nil
32
+ def self.xinteract_code_for_enzyme(enzyme_name)
33
+
34
+ codes = {
35
+ 'trypsin' => 'T',
36
+ 'stricttrypsin' => 'S',
37
+ 'chymotrypsin' => 'C',
38
+ 'ralphtrypsin' => 'R',
39
+ 'aspn' => 'A',
40
+ 'gluc' => 'G',
41
+ 'glucbicarb' => 'B',
42
+ 'cnbr' => 'M',
43
+ 'elastase' => 'E',
44
+ 'lysn' => 'L',
45
+ 'nonspecific' => 'N'
46
+ }
47
+ codes[enzyme_name]
68
48
 
69
- while(reader.read)
70
- if ( reader.name == "search_summary" )
71
- dbnode=reader.expand
72
- dbvalue=dbnode['search_engine']
73
- reader.close
74
- engine_name=dbvalue.gsub(/ /,"_")
75
- engine_name=engine_name.gsub(/\(/,"")
76
- engine_name=engine_name.gsub(/\)/,"")
77
- engine_name=engine_name.gsub(/\!/,"")
78
- return engine_name.downcase
79
- end
80
- end
81
49
  end
82
50
 
83
51
  end
@@ -1,8 +1,9 @@
1
1
 
2
2
  require 'protk/constants.rb'
3
+ require 'rake/clean'
3
4
  require 'rbconfig'
4
5
 
5
- env=Constants.new
6
+ env=Constants.instance
6
7
 
7
8
  @build_dir = "#{env.protk_dir}/tmp/build"
8
9
  @download_dir = "#{env.protk_dir}/tmp/download"
@@ -10,6 +11,8 @@ env=Constants.new
10
11
  directory @build_dir
11
12
  directory @download_dir
12
13
 
14
+ CLEAN.include @build_dir, @download_dir
15
+
13
16
  def package_manager_name
14
17
  package_managers = ["brew","yum","apt-get"]
15
18
 
@@ -30,7 +33,7 @@ def clean_build_dir
30
33
  end
31
34
 
32
35
  def download_buildfile url, file
33
- sh %{cd #{@download_dir}; wget #{url}}
36
+ sh %{cd #{@download_dir}; wget -O #{file} #{url}}
34
37
  end
35
38
 
36
39
  def download_task url, packagefile
@@ -133,10 +136,10 @@ task :perl_locallib => [perl_locallib_installed_file]
133
136
  #
134
137
  # TPP
135
138
  #
136
- tpp_version="4.6.3"
139
+ tpp_version="4.8.0"
137
140
  tpp_packagefile="TPP-#{tpp_version}.tgz"
138
141
  tpp_installed_file = "#{env.tpp_root}/bin/xinteract"
139
- tpp_url = "https://dl.dropbox.com/u/226794/TPP-4.6.3.tgz"
142
+ tpp_url = "http://sourceforge.net/projects/sashimi/files/Trans-Proteomic%20Pipeline%20%28TPP%29/TPP%20v4.8%20%28philae%29%20rev%200/TPP_4.8.0-src.tgz/download"
140
143
 
141
144
  tpp_download_file = download_task tpp_url, tpp_packagefile
142
145
 
@@ -229,10 +232,11 @@ def blast_platform
229
232
  'x64-linux'
230
233
  end
231
234
 
232
- blast_version="2.2.27+"
235
+ blast_version="2.2.30+"
233
236
  blast_packagefile="ncbi-blast-#{blast_version}-#{blast_platform}.tar.gz"
234
237
  blast_url="ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/#{blast_version.chomp('+')}/#{blast_packagefile}"
235
238
  blast_installed_file="#{env.blast_root}/bin/makeblastdb"
239
+ blast_required_bin=["makeblastdb", "blastdbcmd"]
236
240
 
237
241
  download_task blast_url, blast_packagefile
238
242
 
@@ -241,8 +245,10 @@ file blast_installed_file => [@build_dir,"#{@download_dir}/#{blast_packagefile}"
241
245
  sh %{cp #{@download_dir}/#{blast_packagefile} #{@build_dir}}
242
246
  sh %{cd #{@build_dir}; gunzip #{blast_packagefile}}
243
247
  sh %{cd #{@build_dir}; tar -xvf #{blast_packagefile.chomp('.gz')}}
244
- sh %{mkdir -p #{env.blast_root}}
245
- sh %{cd #{@build_dir}; cp -r ncbi-blast-#{blast_version}/* #{env.blast_root}/}
248
+ sh %{mkdir -p #{env.blast_root}/bin}
249
+ blast_required_bin.each do |binary|
250
+ sh %{cd #{@build_dir}; cp -r ncbi-blast-#{blast_version}/bin/#{binary} #{env.blast_root}/bin/}
251
+ end
246
252
  end
247
253
 
248
254
  task :blast => blast_installed_file
@@ -251,7 +257,7 @@ task :blast => blast_installed_file
251
257
  #
252
258
  # MSGFPlus
253
259
  #
254
- msgfplus_version="20140210"
260
+ msgfplus_version="20140630"
255
261
  msgfplus_packagefile="MSGFPlus.#{msgfplus_version}.zip"
256
262
  msgfplus_url="http://proteomics.ucsd.edu/Software/MSGFPlus/MSGFPlus.#{msgfplus_version}.zip"
257
263
  msgfplus_installed_file="#{env.msgfplus_root}/MSGFPlus.jar"
@@ -274,7 +280,7 @@ def pwiz_platform
274
280
  if RbConfig::CONFIG['host_os'] =~ /darwin/
275
281
  return 'darwin-x86-xgcc40'
276
282
  end
277
- 'linux-x86_64-gcc42'
283
+ 'linux-x86_64-gcc48'
278
284
  end
279
285
 
280
286
  def platform_bunzip
@@ -284,7 +290,7 @@ def platform_bunzip
284
290
  'bunzip2'
285
291
  end
286
292
 
287
- pwiz_version="3_0_4388"
293
+ pwiz_version="3_0_6790"
288
294
  pwiz_folder_name="pwiz-bin-#{pwiz_platform}-release-#{pwiz_version}"
289
295
  pwiz_packagefile="#{pwiz_folder_name}.tar.bz2"
290
296
  pwiz_url="https://dl.dropbox.com/u/226794/#{pwiz_packagefile}"
@@ -297,7 +303,8 @@ file pwiz_installed_file => [@build_dir,"#{@download_dir}/#{pwiz_packagefile}"]
297
303
  sh %{cd #{@build_dir}; #{platform_bunzip} -f #{pwiz_packagefile}}
298
304
  sh %{cd #{@build_dir}; tar -xvf #{pwiz_packagefile.chomp('.bz2')}}
299
305
  sh %{mkdir -p #{env.pwiz_root}}
300
- sh %{cd #{@build_dir}; cp ./#{pwiz_folder_name}/* #{env.pwiz_root}/}
306
+ sh %{cd #{@build_dir}; cp ./msconvert #{env.pwiz_root}/}
307
+ sh %{cd #{@build_dir}; cp ./idconvert #{env.pwiz_root}/}
301
308
  end
302
309
 
303
310
  task :pwiz => pwiz_installed_file
@@ -48,7 +48,8 @@ class TandemSearchTool < SearchTool
48
48
  :fragment_tolu => "spectrum, fragment monoisotopic mass error units",
49
49
  :acetyl_nterm => "protein, quick acetyl",
50
50
  :output_spectra => "output, spectra",
51
- :threads => "spectrum, threads"
51
+ :threads => "spectrum, threads",
52
+ :enzyme => "protein, cleavage site"
52
53
  }
53
54
 
54
55
  @xtandem_keys_for_precursor_tol = {
@@ -61,7 +62,7 @@ class TandemSearchTool < SearchTool
61
62
 
62
63
  @option_parser.banner = "Run an X!Tandem msms search on a set of mzML input files.\n\nUsage: tandem_search.rb [options] file1.mzML file2.mzML ..."
63
64
  @options.output_suffix="_tandem"
64
-
65
+ @options.enzyme="[RK]|{P}"
65
66
  add_value_option(:tandem_params,"isb_native",['-T', '--tandem-params tandem', 'Either the full path to an xml file containing a complete set of default parameters, or one of the following (isb_native,isb_kscore,gpm). Default is isb_native'])
66
67
  add_boolean_option(:keep_params_files,false,['-K', '--keep-params-files', 'Keep X!Tandem parameter files'])
67
68
  add_boolean_option(:output_spectra,false,['--output-spectra', 'Include spectra in the output file'])
@@ -71,12 +72,24 @@ class TandemSearchTool < SearchTool
71
72
  private
72
73
  # Galaxy changes things like @ to __at__ we need to change it back
73
74
  #
74
- def decode_modification_string(mstring)
75
+ def decode_galaxy_string(mstring)
75
76
  mstring.gsub!("__at__","@")
76
77
  mstring.gsub!("__oc__","{")
77
78
  mstring.gsub!("__cc__","}")
78
79
  mstring.gsub!("__ob__","[")
79
80
  mstring.gsub!("__cb__","]")
81
+ mstring.gsub!("__gt__",">")
82
+ mstring.gsub!("__lt__","<")
83
+ mstring.gsub!("__sq__","'")
84
+ mstring.gsub!("__dq__","\"")
85
+ mstring.gsub!("__cn__","\n")
86
+ mstring.gsub!("__cr__","\r")
87
+ mstring.gsub!("__tc__","\t")
88
+ mstring.gsub!("__pd__","#")
89
+
90
+ # For characters not allowed at all by galaxy
91
+ mstring.gsub!("__pc__","|")
92
+
80
93
  mstring
81
94
  end
82
95
 
@@ -157,7 +170,6 @@ class TandemSearchTool < SearchTool
157
170
  set_option(std_params,"protein, taxon",db_info.name)
158
171
 
159
172
 
160
-
161
173
  # set_option(std_params, "protein, cleavage semi", self.cleavage_semi ? "yes" : "no")
162
174
 
163
175
  # Simple options (unique with a 1:1 mapping to parameters from this tool)
@@ -168,7 +180,7 @@ class TandemSearchTool < SearchTool
168
180
  if opt_val.is_a?(TrueClass) || opt_val.is_a?(FalseClass)
169
181
  opt_val = opt_val ? "yes" : "no"
170
182
  end
171
- append_option(std_params,xtandem_key,opt_val.to_s)
183
+ append_option(std_params,xtandem_key,decode_galaxy_string(opt_val.to_s))
172
184
  end
173
185
  end
174
186
 
@@ -182,6 +194,7 @@ class TandemSearchTool < SearchTool
182
194
  end
183
195
  end
184
196
 
197
+
185
198
  # Per residue Fixed and Variable Modifications
186
199
  #
187
200
  # These can be added using a variety of methods in xtandem
@@ -195,7 +208,7 @@ class TandemSearchTool < SearchTool
195
208
  #
196
209
 
197
210
  var_mods = self.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }
198
- var_mods=var_mods.collect {|mod| decode_modification_string(mod) }
211
+ var_mods=var_mods.collect {|mod| decode_galaxy_string(mod) }
199
212
 
200
213
  # var_mods allows motif's as well as standard mods. These should be in a separate array
201
214
  var_motifs = [].replace(var_mods)
@@ -203,7 +216,7 @@ class TandemSearchTool < SearchTool
203
216
  var_motifs.keep_if {|mod| mod.xtandem_modification_motif? }
204
217
 
205
218
  fix_mods = self.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }
206
- fix_mods=fix_mods.collect {|mod| decode_modification_string(mod)}
219
+ fix_mods=fix_mods.collect {|mod| decode_galaxy_string(mod)}
207
220
 
208
221
  # We also support the --glyco and --methionineo shortcuts.
209
222
  # Add these here. No check is made for duplication