protk 1.3.1.pre3 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +4 -4
  2. data/README.md +21 -19
  3. data/bin/add_retention_times.rb +1 -1
  4. data/bin/interprophet.rb +16 -5
  5. data/bin/make_decoy.rb +1 -1
  6. data/bin/manage_db.rb +1 -1
  7. data/bin/mascot_search.rb +2 -2
  8. data/bin/mascot_to_pepxml.rb +1 -1
  9. data/bin/msgfplus_search.rb +26 -9
  10. data/bin/omssa_search.rb +1 -1
  11. data/bin/peptide_prophet.rb +57 -20
  12. data/bin/pepxml_to_table.rb +15 -2
  13. data/bin/protein_prophet.rb +41 -1
  14. data/bin/protk_setup.rb +2 -2
  15. data/bin/protxml_to_gff.rb +50 -42
  16. data/bin/protxml_to_psql.rb +1 -1
  17. data/bin/protxml_to_table.rb +16 -3
  18. data/bin/repair_run_summary.rb +1 -1
  19. data/bin/sixframe.rb +2 -2
  20. data/bin/swissprot_to_table.rb +1 -1
  21. data/bin/tandem_search.rb +1 -1
  22. data/bin/tandem_to_pepxml.rb +1 -1
  23. data/lib/protk/constants.rb +2 -1
  24. data/lib/protk/convert_util.rb +1 -1
  25. data/lib/protk/data/tandem-style.css +349 -0
  26. data/lib/protk/data/tandem-style.xsl +264 -0
  27. data/lib/protk/data/tandem_gpm_defaults.xml +3 -3
  28. data/lib/protk/data/tandem_isb_kscore_defaults.xml +2 -0
  29. data/lib/protk/data/tandem_isb_native_defaults.xml +3 -0
  30. data/lib/protk/data/tandem_params.xml +0 -8
  31. data/lib/protk/fastadb.rb +1 -1
  32. data/lib/protk/galaxy_stager.rb +14 -3
  33. data/lib/protk/galaxy_util.rb +39 -31
  34. data/lib/protk/gffdb.rb +6 -1
  35. data/lib/protk/manage_db_rakefile.rake +1 -1
  36. data/lib/protk/manage_db_tool.rb +1 -1
  37. data/lib/protk/pepxml.rb +159 -7
  38. data/lib/protk/plasmodb.rb +1 -1
  39. data/lib/protk/prophet_tool.rb +20 -52
  40. data/lib/protk/setup_rakefile.rake +18 -11
  41. data/lib/protk/tandem_search_tool.rb +20 -7
  42. data/lib/protk/tool.rb +1 -1
  43. data/lib/protk/uniprot_mapper.rb +1 -1
  44. metadata +10 -14
@@ -13,7 +13,7 @@ dbname=ARGV[0]
13
13
 
14
14
  # Load database spec file
15
15
  #
16
- $genv=Constants.new()
16
+ $genv=Constants.instance()
17
17
  dbdir="#{$genv.protein_database_root}/#{dbname}"
18
18
 
19
19
  dbspec_file="#{dbdir}/.protkdb.yaml"
@@ -12,7 +12,7 @@ require 'protk/tool'
12
12
  class ManageDBTool < Tool
13
13
 
14
14
  def add dbspec, dbname
15
- genv=Constants.new()
15
+ genv=Constants.instance()
16
16
  dbdir="#{genv.protein_database_root}/#{dbname}"
17
17
  %x[mkdir -p #{dbdir}]
18
18
 
@@ -1,22 +1,174 @@
1
1
  require 'rubygems'
2
- require 'rexml/document'
3
- require 'rexml/xpath'
2
+ require 'libxml'
3
+
4
+ include LibXML
5
+
6
+ # require 'rexml/document'
7
+ # require 'rexml/xpath'
4
8
 
5
9
  class PepXML
10
+
11
+ attr_accessor :file_name
12
+
6
13
  def initialize(file_name)
7
- @doc=REXML::Document.new(File.new(file_name))
14
+ @file_name=file_name
15
+
16
+ XML::Error.set_handler(&XML::Error::QUIET_HANDLER)
17
+ pepxml_parser=XML::Parser.file("#{file_name}")
18
+
19
+ @pepxml_ns_prefix="xmlns:"
20
+ @pepxml_ns="xmlns:http://regis-web.systemsbiology.net/pepXML"
21
+ @pepxml_doc=pepxml_parser.parse
22
+ if not @pepxml_doc.root.namespaces.default
23
+ @pepxml_ns_prefix=""
24
+ @pepxml_ns=nil
25
+ end
26
+ end
27
+
28
+
29
+
30
+ # Obtain the database name from the given input file
31
+ #
32
+ def extract_db()
33
+ reader = XML::Reader.file(self.file_name)
34
+ throw "Failed to open xml file #{file_name}" unless reader!=nil
35
+
36
+ while(reader.read)
37
+ # For pep.xml files
38
+ #
39
+ if ( reader.name == "search_database" )
40
+ dbnode=reader.expand
41
+ dbvalue=dbnode['local_path']
42
+ reader.close
43
+ return dbvalue
44
+ end
45
+
46
+ # For prot.xml files
47
+ #
48
+ if ( reader.name == "protein_summary_header" )
49
+ dbnode=reader.expand
50
+ dbvalue=dbnode['reference_database']
51
+ reader.close
52
+ return dbvalue
53
+ end
54
+
55
+
56
+
57
+ end
58
+
59
+ end
60
+
61
+
62
+
63
+ # Obtain the search engine name from the input file
64
+ # The name of the engine is returned in lowercase and should contain no spaces
65
+ # Names of common engines are searched for and extracted in simplified form if possible
66
+ #
67
+ def extract_engine()
68
+ reader = XML::Reader.file(self.file_name)
69
+ throw "Failed to open xml file #{file_name}" unless reader!=nil
70
+
71
+ while(reader.read)
72
+ if ( reader.name == "search_summary" )
73
+ dbnode=reader.expand
74
+ dbvalue=dbnode['search_engine']
75
+ reader.close
76
+ engine_name=dbvalue.gsub(/ /,"_")
77
+ engine_name=engine_name.gsub(/\(/,"")
78
+ engine_name=engine_name.gsub(/\)/,"")
79
+ engine_name=engine_name.gsub(/\!/,"")
80
+ return engine_name.downcase
81
+ end
82
+ end
83
+ end
84
+
85
+
86
+ def extract_enzyme()
87
+ reader = XML::Reader.file(self.file_name)
88
+ throw "Failed to open xml file #{file_name}" unless reader!=nil
89
+
90
+ while(reader.read)
91
+ if ( reader.name == "sample_enzyme" )
92
+ dbnode=reader.expand
93
+ dbvalue=dbnode['name']
94
+ reader.close
95
+ return dbvalue.downcase
96
+ end
97
+ end
98
+ end
99
+
100
+
101
+
102
+ def type_from_base_name(basename)
103
+ # A common error is for tools to include the extension in the base_name attribute.
104
+ # We exploit this to guess the type
105
+ ext_guess=""
106
+ case basename
107
+ when /.mgf$/
108
+ ext_guess="mgf"
109
+ when /.mzML$/
110
+ ext_guess="mzML"
111
+ when /.mzXML$/
112
+ ext_guess="mzXML"
113
+ else
114
+ ext_guess=""
115
+ end
116
+ ext_guess
117
+ end
118
+
119
+ def type_from_summary_attributes(atts)
120
+ if is_valid_type(atts["raw_data_type"])
121
+ return atts["raw_data_type"]
122
+ end
123
+
124
+ if is_valid_type(atts["raw_data"])
125
+ return atts["raw_data"]
126
+ end
127
+ return ""
128
+ end
129
+
130
+ def is_valid_type(type)
131
+ case type
132
+ when /^mgf$/i
133
+ return true
134
+ when /^mzML$/i
135
+ return true
136
+ when /^mzXML$/i
137
+ return true
138
+ else
139
+ return false
140
+ end
8
141
  end
9
142
 
10
- def find_runs()
143
+
144
+ # TODO: Make this faster and more memory efficient by using XML::Reader as in the functions above
145
+ #
146
+ def find_runs()
147
+
148
+
149
+ run_summaries = @pepxml_doc.find("//#{@pepxml_ns_prefix}msms_run_summary", @pepxml_ns)
150
+
11
151
  runs = {}
12
- REXML::XPath.each(@doc,"//msms_run_summary") do |summary|
152
+ run_summaries.each do |summary|
13
153
  base_name = summary.attributes["base_name"]
14
154
  if not runs.has_key?(base_name)
15
- runs[base_name] = {:base_name => summary.attributes["base_name"],
16
- :type => summary.attributes["raw_data"]}
155
+ bn = summary.attributes["base_name"]
156
+
157
+ runs[base_name] = {:base_name => summary.attributes["base_name"]}
158
+
159
+ if is_valid_type(type_from_summary_attributes(summary.attributes))
160
+ runs[base_name][:type] = type_from_summary_attributes(summary.attributes)
161
+ elsif is_valid_type(type_from_base_name(bn))
162
+ runs[base_name][:type] = type_from_base_name(bn)
163
+ else
164
+ runs[base_name][:type] = "mzML" # Same guess as peptide prophet makes
165
+ end
166
+
17
167
  end
18
168
  end
19
169
  runs
20
170
  end
171
+
172
+
21
173
 
22
174
  end
@@ -11,7 +11,7 @@ class PlasmoDB
11
11
  if ( env!=nil)
12
12
  @genv=env
13
13
  else
14
- @genv=Constants.new
14
+ @genv=Constants.instance
15
15
  end
16
16
 
17
17
  database_file="#{@genv.protein_database_root}/#{@genv.plasmodb_annotation_database}/raw.txt"
@@ -21,63 +21,31 @@ class ProphetTool < SearchTool
21
21
 
22
22
  super(option_support)
23
23
 
24
- end
25
-
26
-
27
-
28
- # Obtain the database name from the given input file
29
- #
30
- def extract_db(file_name)
31
- reader = XML::Reader.file(file_name)
32
- throw "Failed to open xml file #{file_name}" unless reader!=nil
33
-
34
- while(reader.read)
35
- # For pep.xml files
36
- #
37
- if ( reader.name == "search_database" )
38
- dbnode=reader.expand
39
- dbvalue=dbnode['local_path']
40
- reader.close
41
- return dbvalue
42
- end
43
-
44
- # For prot.xml files
45
- #
46
- if ( reader.name == "protein_summary_header" )
47
- dbnode=reader.expand
48
- dbvalue=dbnode['reference_database']
49
- reader.close
50
- return dbvalue
51
- end
52
-
53
-
54
-
24
+ if ( option_support.include? :probability_threshold )
25
+ add_value_option(:probability_threshold,0.05,['--p-thresh val', 'Probability threshold below which PSMs are discarded'])
55
26
  end
56
27
 
57
28
  end
58
-
59
-
60
-
61
- # Obtain the search engine name from the input file
62
- # The name of the engine is returned in lowercase and should contain no spaces
63
- # Names of common engines are searched for and extracted in simplified form if possible
29
+
30
+ # TODO: Deal with multiple enzyme combos
64
31
  #
65
- def extract_engine(file_name)
66
- reader = XML::Reader.file(file_name)
67
- throw "Failed to open xml file #{file_name}" unless reader!=nil
32
+ def self.xinteract_code_for_enzyme(enzyme_name)
33
+
34
+ codes = {
35
+ 'trypsin' => 'T',
36
+ 'stricttrypsin' => 'S',
37
+ 'chymotrypsin' => 'C',
38
+ 'ralphtrypsin' => 'R',
39
+ 'aspn' => 'A',
40
+ 'gluc' => 'G',
41
+ 'glucbicarb' => 'B',
42
+ 'cnbr' => 'M',
43
+ 'elastase' => 'E',
44
+ 'lysn' => 'L',
45
+ 'nonspecific' => 'N'
46
+ }
47
+ codes[enzyme_name]
68
48
 
69
- while(reader.read)
70
- if ( reader.name == "search_summary" )
71
- dbnode=reader.expand
72
- dbvalue=dbnode['search_engine']
73
- reader.close
74
- engine_name=dbvalue.gsub(/ /,"_")
75
- engine_name=engine_name.gsub(/\(/,"")
76
- engine_name=engine_name.gsub(/\)/,"")
77
- engine_name=engine_name.gsub(/\!/,"")
78
- return engine_name.downcase
79
- end
80
- end
81
49
  end
82
50
 
83
51
  end
@@ -1,8 +1,9 @@
1
1
 
2
2
  require 'protk/constants.rb'
3
+ require 'rake/clean'
3
4
  require 'rbconfig'
4
5
 
5
- env=Constants.new
6
+ env=Constants.instance
6
7
 
7
8
  @build_dir = "#{env.protk_dir}/tmp/build"
8
9
  @download_dir = "#{env.protk_dir}/tmp/download"
@@ -10,6 +11,8 @@ env=Constants.new
10
11
  directory @build_dir
11
12
  directory @download_dir
12
13
 
14
+ CLEAN.include @build_dir, @download_dir
15
+
13
16
  def package_manager_name
14
17
  package_managers = ["brew","yum","apt-get"]
15
18
 
@@ -30,7 +33,7 @@ def clean_build_dir
30
33
  end
31
34
 
32
35
  def download_buildfile url, file
33
- sh %{cd #{@download_dir}; wget #{url}}
36
+ sh %{cd #{@download_dir}; wget -O #{file} #{url}}
34
37
  end
35
38
 
36
39
  def download_task url, packagefile
@@ -133,10 +136,10 @@ task :perl_locallib => [perl_locallib_installed_file]
133
136
  #
134
137
  # TPP
135
138
  #
136
- tpp_version="4.6.3"
139
+ tpp_version="4.8.0"
137
140
  tpp_packagefile="TPP-#{tpp_version}.tgz"
138
141
  tpp_installed_file = "#{env.tpp_root}/bin/xinteract"
139
- tpp_url = "https://dl.dropbox.com/u/226794/TPP-4.6.3.tgz"
142
+ tpp_url = "http://sourceforge.net/projects/sashimi/files/Trans-Proteomic%20Pipeline%20%28TPP%29/TPP%20v4.8%20%28philae%29%20rev%200/TPP_4.8.0-src.tgz/download"
140
143
 
141
144
  tpp_download_file = download_task tpp_url, tpp_packagefile
142
145
 
@@ -229,10 +232,11 @@ def blast_platform
229
232
  'x64-linux'
230
233
  end
231
234
 
232
- blast_version="2.2.27+"
235
+ blast_version="2.2.30+"
233
236
  blast_packagefile="ncbi-blast-#{blast_version}-#{blast_platform}.tar.gz"
234
237
  blast_url="ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/#{blast_version.chomp('+')}/#{blast_packagefile}"
235
238
  blast_installed_file="#{env.blast_root}/bin/makeblastdb"
239
+ blast_required_bin=["makeblastdb", "blastdbcmd"]
236
240
 
237
241
  download_task blast_url, blast_packagefile
238
242
 
@@ -241,8 +245,10 @@ file blast_installed_file => [@build_dir,"#{@download_dir}/#{blast_packagefile}"
241
245
  sh %{cp #{@download_dir}/#{blast_packagefile} #{@build_dir}}
242
246
  sh %{cd #{@build_dir}; gunzip #{blast_packagefile}}
243
247
  sh %{cd #{@build_dir}; tar -xvf #{blast_packagefile.chomp('.gz')}}
244
- sh %{mkdir -p #{env.blast_root}}
245
- sh %{cd #{@build_dir}; cp -r ncbi-blast-#{blast_version}/* #{env.blast_root}/}
248
+ sh %{mkdir -p #{env.blast_root}/bin}
249
+ blast_required_bin.each do |binary|
250
+ sh %{cd #{@build_dir}; cp -r ncbi-blast-#{blast_version}/bin/#{binary} #{env.blast_root}/bin/}
251
+ end
246
252
  end
247
253
 
248
254
  task :blast => blast_installed_file
@@ -251,7 +257,7 @@ task :blast => blast_installed_file
251
257
  #
252
258
  # MSGFPlus
253
259
  #
254
- msgfplus_version="20140210"
260
+ msgfplus_version="20140630"
255
261
  msgfplus_packagefile="MSGFPlus.#{msgfplus_version}.zip"
256
262
  msgfplus_url="http://proteomics.ucsd.edu/Software/MSGFPlus/MSGFPlus.#{msgfplus_version}.zip"
257
263
  msgfplus_installed_file="#{env.msgfplus_root}/MSGFPlus.jar"
@@ -274,7 +280,7 @@ def pwiz_platform
274
280
  if RbConfig::CONFIG['host_os'] =~ /darwin/
275
281
  return 'darwin-x86-xgcc40'
276
282
  end
277
- 'linux-x86_64-gcc42'
283
+ 'linux-x86_64-gcc48'
278
284
  end
279
285
 
280
286
  def platform_bunzip
@@ -284,7 +290,7 @@ def platform_bunzip
284
290
  'bunzip2'
285
291
  end
286
292
 
287
- pwiz_version="3_0_4388"
293
+ pwiz_version="3_0_6790"
288
294
  pwiz_folder_name="pwiz-bin-#{pwiz_platform}-release-#{pwiz_version}"
289
295
  pwiz_packagefile="#{pwiz_folder_name}.tar.bz2"
290
296
  pwiz_url="https://dl.dropbox.com/u/226794/#{pwiz_packagefile}"
@@ -297,7 +303,8 @@ file pwiz_installed_file => [@build_dir,"#{@download_dir}/#{pwiz_packagefile}"]
297
303
  sh %{cd #{@build_dir}; #{platform_bunzip} -f #{pwiz_packagefile}}
298
304
  sh %{cd #{@build_dir}; tar -xvf #{pwiz_packagefile.chomp('.bz2')}}
299
305
  sh %{mkdir -p #{env.pwiz_root}}
300
- sh %{cd #{@build_dir}; cp ./#{pwiz_folder_name}/* #{env.pwiz_root}/}
306
+ sh %{cd #{@build_dir}; cp ./msconvert #{env.pwiz_root}/}
307
+ sh %{cd #{@build_dir}; cp ./idconvert #{env.pwiz_root}/}
301
308
  end
302
309
 
303
310
  task :pwiz => pwiz_installed_file
@@ -48,7 +48,8 @@ class TandemSearchTool < SearchTool
48
48
  :fragment_tolu => "spectrum, fragment monoisotopic mass error units",
49
49
  :acetyl_nterm => "protein, quick acetyl",
50
50
  :output_spectra => "output, spectra",
51
- :threads => "spectrum, threads"
51
+ :threads => "spectrum, threads",
52
+ :enzyme => "protein, cleavage site"
52
53
  }
53
54
 
54
55
  @xtandem_keys_for_precursor_tol = {
@@ -61,7 +62,7 @@ class TandemSearchTool < SearchTool
61
62
 
62
63
  @option_parser.banner = "Run an X!Tandem msms search on a set of mzML input files.\n\nUsage: tandem_search.rb [options] file1.mzML file2.mzML ..."
63
64
  @options.output_suffix="_tandem"
64
-
65
+ @options.enzyme="[RK]|{P}"
65
66
  add_value_option(:tandem_params,"isb_native",['-T', '--tandem-params tandem', 'Either the full path to an xml file containing a complete set of default parameters, or one of the following (isb_native,isb_kscore,gpm). Default is isb_native'])
66
67
  add_boolean_option(:keep_params_files,false,['-K', '--keep-params-files', 'Keep X!Tandem parameter files'])
67
68
  add_boolean_option(:output_spectra,false,['--output-spectra', 'Include spectra in the output file'])
@@ -71,12 +72,24 @@ class TandemSearchTool < SearchTool
71
72
  private
72
73
  # Galaxy changes things like @ to __at__ we need to change it back
73
74
  #
74
- def decode_modification_string(mstring)
75
+ def decode_galaxy_string(mstring)
75
76
  mstring.gsub!("__at__","@")
76
77
  mstring.gsub!("__oc__","{")
77
78
  mstring.gsub!("__cc__","}")
78
79
  mstring.gsub!("__ob__","[")
79
80
  mstring.gsub!("__cb__","]")
81
+ mstring.gsub!("__gt__",">")
82
+ mstring.gsub!("__lt__","<")
83
+ mstring.gsub!("__sq__","'")
84
+ mstring.gsub!("__dq__","\"")
85
+ mstring.gsub!("__cn__","\n")
86
+ mstring.gsub!("__cr__","\r")
87
+ mstring.gsub!("__tc__","\t")
88
+ mstring.gsub!("__pd__","#")
89
+
90
+ # For characters not allowed at all by galaxy
91
+ mstring.gsub!("__pc__","|")
92
+
80
93
  mstring
81
94
  end
82
95
 
@@ -157,7 +170,6 @@ class TandemSearchTool < SearchTool
157
170
  set_option(std_params,"protein, taxon",db_info.name)
158
171
 
159
172
 
160
-
161
173
  # set_option(std_params, "protein, cleavage semi", self.cleavage_semi ? "yes" : "no")
162
174
 
163
175
  # Simple options (unique with a 1:1 mapping to parameters from this tool)
@@ -168,7 +180,7 @@ class TandemSearchTool < SearchTool
168
180
  if opt_val.is_a?(TrueClass) || opt_val.is_a?(FalseClass)
169
181
  opt_val = opt_val ? "yes" : "no"
170
182
  end
171
- append_option(std_params,xtandem_key,opt_val.to_s)
183
+ append_option(std_params,xtandem_key,decode_galaxy_string(opt_val.to_s))
172
184
  end
173
185
  end
174
186
 
@@ -182,6 +194,7 @@ class TandemSearchTool < SearchTool
182
194
  end
183
195
  end
184
196
 
197
+
185
198
  # Per residue Fixed and Variable Modifications
186
199
  #
187
200
  # These can be added using a variety of methods in xtandem
@@ -195,7 +208,7 @@ class TandemSearchTool < SearchTool
195
208
  #
196
209
 
197
210
  var_mods = self.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }
198
- var_mods=var_mods.collect {|mod| decode_modification_string(mod) }
211
+ var_mods=var_mods.collect {|mod| decode_galaxy_string(mod) }
199
212
 
200
213
  # var_mods allows motif's as well as standard mods. These should be in a separate array
201
214
  var_motifs = [].replace(var_mods)
@@ -203,7 +216,7 @@ class TandemSearchTool < SearchTool
203
216
  var_motifs.keep_if {|mod| mod.xtandem_modification_motif? }
204
217
 
205
218
  fix_mods = self.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }
206
- fix_mods=fix_mods.collect {|mod| decode_modification_string(mod)}
219
+ fix_mods=fix_mods.collect {|mod| decode_galaxy_string(mod)}
207
220
 
208
221
  # We also support the --glyco and --methionineo shortcuts.
209
222
  # Add these here. No check is made for duplication