protk 1.1.0.pre

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. data/README.md +85 -0
  2. data/bin/annotate_ids.rb +59 -0
  3. data/bin/big_search.rb +41 -0
  4. data/bin/correct_omssa_retention_times.rb +27 -0
  5. data/bin/feature_finder.rb +76 -0
  6. data/bin/file_convert.rb +157 -0
  7. data/bin/generate_omssa_loc.rb +42 -0
  8. data/bin/interprophet.rb +91 -0
  9. data/bin/make_decoy.rb +64 -0
  10. data/bin/manage_db.rb +123 -0
  11. data/bin/mascot_search.rb +187 -0
  12. data/bin/mascot_to_pepxml.rb +44 -0
  13. data/bin/msgfplus_search.rb +191 -0
  14. data/bin/omssa_search.rb +205 -0
  15. data/bin/peptide_prophet.rb +245 -0
  16. data/bin/pepxml_to_table.rb +78 -0
  17. data/bin/protein_prophet.rb +140 -0
  18. data/bin/protk_setup.rb +31 -0
  19. data/bin/repair_run_summary.rb +113 -0
  20. data/bin/tandem_search.rb +292 -0
  21. data/bin/template_search.rb +144 -0
  22. data/bin/unimod_to_loc.rb +118 -0
  23. data/bin/xls_to_table.rb +46 -0
  24. data/ext/protk/extconf.rb +3 -0
  25. data/ext/protk/protk.c +235 -0
  26. data/lib/protk/big_search_rakefile.rake +16 -0
  27. data/lib/protk/big_search_tool.rb +23 -0
  28. data/lib/protk/bio_sptr_extensions.rb +210 -0
  29. data/lib/protk/biotools_excel_converter.rb +60 -0
  30. data/lib/protk/command_runner.rb +84 -0
  31. data/lib/protk/constants.rb +296 -0
  32. data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
  33. data/lib/protk/data/apt-get_packages.yaml +47 -0
  34. data/lib/protk/data/brew_packages.yaml +10 -0
  35. data/lib/protk/data/default_config.yml +20 -0
  36. data/lib/protk/data/predefined_db.crap.yaml +19 -0
  37. data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
  38. data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
  39. data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
  40. data/lib/protk/data/tandem_params.xml +56 -0
  41. data/lib/protk/data/taxonomy_template.xml +9 -0
  42. data/lib/protk/data/unimod.xml +16780 -0
  43. data/lib/protk/eupathdb_gene_information_table.rb +158 -0
  44. data/lib/protk/galaxy_stager.rb +24 -0
  45. data/lib/protk/galaxy_util.rb +9 -0
  46. data/lib/protk/manage_db_rakefile.rake +484 -0
  47. data/lib/protk/manage_db_tool.rb +181 -0
  48. data/lib/protk/mascot_util.rb +63 -0
  49. data/lib/protk/omssa_util.rb +57 -0
  50. data/lib/protk/plasmodb.rb +50 -0
  51. data/lib/protk/prophet_tool.rb +85 -0
  52. data/lib/protk/protein_annotator.rb +646 -0
  53. data/lib/protk/protxml.rb +137 -0
  54. data/lib/protk/randomize.rb +7 -0
  55. data/lib/protk/search_tool.rb +182 -0
  56. data/lib/protk/setup_rakefile.rake +245 -0
  57. data/lib/protk/setup_tool.rb +19 -0
  58. data/lib/protk/spreadsheet_extensions.rb +78 -0
  59. data/lib/protk/swissprot_database.rb +38 -0
  60. data/lib/protk/tool.rb +182 -0
  61. data/lib/protk/xtandem_defaults.rb +11 -0
  62. data/lib/protk.rb +18 -0
  63. metadata +256 -0
@@ -0,0 +1,296 @@
1
+ # This file is part of protk
2
+ # Created by Ira Cooke 14/12/2010
3
+ #
4
+ # Initialises global constants.
5
+ # All tools should source this file.
6
+ #
7
+ require 'yaml'
8
+ require 'logger'
9
+ require 'pathname'
10
+ require 'ftools'
11
+
12
# Central access point for protk's global settings.
#
# Settings are read from data/default_config.yml (located next to this file)
# into a Hash. Plain settings are exposed through methods of the same name via
# #method_missing; settings that are paths have explicit accessors which turn
# relative values into paths underneath the protk directory (~/.protk).
#
# Instance state:
#   @env            Hash holding all the constants read from the config file
#   @protk_dir      root directory against which relative paths are resolved
#   @info_level     the 'message_level' setting (e.g. "info", "debug", "warn")
#   @stdout_logger  Logger writing to STDOUT; threshold taken from @info_level
#   @file_logger    Logger writing to #log_file, rotated daily. No threshold is
#                   set on it here, so it records every level it is sent.
#
# Logger thresholds are ordered DEBUG < INFO < WARN < ERROR < FATAL < UNKNOWN.
class Constants
  # Maps the 'message_level' setting onto Logger severities.
  LOG_LEVELS = {
    'debug'   => Logger::DEBUG,
    'info'    => Logger::INFO,
    'warn'    => Logger::WARN,
    'error'   => Logger::ERROR,
    'fatal'   => Logger::FATAL,
    'unknown' => Logger::UNKNOWN
  }.freeze

  attr_reader :info_level, :protk_dir

  # Read the global constants file and initialize the @env Hash.
  # Loggers are created lazily, on the first call to #log.
  #
  # Raises RuntimeError if the config file does not contain a mapping.
  # (A missing file surfaces as the error raised by YAML.load_file.)
  def initialize
    @protk_dir = "#{Dir.home}/.protk"

    config_path = "#{File.dirname(__FILE__)}/data/default_config.yml"
    default_config_yml = YAML.load_file(config_path)
    # An empty file loads as nil/false rather than a Hash
    raise "Unable to read the config file at #{config_path}" unless default_config_yml.is_a?(Hash)

    @env = default_config_yml
    @info_level = default_config_yml['message_level']
  end

  # Provides direct access to constants through methods of the same name.
  # This is used for all constants other than paths.
  #
  # Returns the configured value, or nil when the setting is not defined.
  # Calls that pass arguments can never be settings lookups, so they are
  # handed to the default implementation (NoMethodError).
  def method_missing(method, *args, &block)
    return super unless args.empty? && @env.respond_to?(:[])

    @env[method.to_s]
  end

  # Keeps respond_to? in step with #method_missing: true for every setting
  # that is present in the config Hash.
  def respond_to_missing?(method, include_private = false)
    (@env.respond_to?(:key?) && @env.key?(method.to_s)) || super
  end

  # Some constants are paths. They need to be translated into real paths
  # before being returned.

  # Directory holding protk's own executables.
  def bin
    "#{@protk_dir}/bin"
  end

  # Root of the Trans-Proteomic Pipeline installation.
  def tpp_root
    resolve_path('tpp_root')
  end

  def xinteract
    "#{tpp_root}/bin/xinteract"
  end

  def xtandem
    "#{tpp_root}/bin/tandem"
  end

  def tandem2xml
    "#{tpp_root}/bin/Tandem2XML"
  end

  def interprophetparser
    "#{tpp_root}/bin/InterProphetParser"
  end

  def proteinprophet
    "#{tpp_root}/bin/ProteinProphet"
  end

  def mascot2xml
    "#{tpp_root}/bin/Mascot2XML"
  end

  # Root of the OMSSA installation.
  def omssa_root
    resolve_path('omssa_root')
  end

  def omssacl
    "#{omssa_root}/omssacl"
  end

  def omssa2pepxml
    "#{omssa_root}/omssa2pepXML"
  end

  # Root of the OpenMS installation.
  def openms_root
    resolve_path('openms_root')
  end

  # Root of the MS-GF+ installation.
  def msgfplus_root
    resolve_path('msgfplus_root')
  end

  def msgfplusjar
    "#{msgfplus_root}/MSGFPlus.jar"
  end

  # Directory under which all protein databases are stored.
  def protein_database_root
    resolve_path('protein_database_root')
  end

  def database_downloads
    "#{protein_database_root}/downloads"
  end

  # Root of the NCBI BLAST+ installation.
  def blast_root
    resolve_path('blast_root')
  end

  def makeblastdb
    "#{blast_root}/bin/makeblastdb"
  end

  # Path of the file written to by the file logger.
  def log_file
    resolve_path('log_file')
  end

  # Create the stdout and file loggers, creating the log directory first
  # if it does not exist yet.
  #
  # The stdout threshold follows the 'message_level' setting; a level that is
  # not recognised leaves Logger's own default threshold in place.
  def initialize_loggers
    log_dir = Pathname.new(log_file).dirname
    log_dir.mkpath unless log_dir.exist?

    @stdout_logger = Logger.new(STDOUT)
    @file_logger = Logger.new(log_file, 'daily')

    threshold = LOG_LEVELS[@info_level.to_s.downcase]
    @stdout_logger.level = threshold if threshold
  end

  # Write a message to all logger objects.
  #
  # message - the text to log
  # level   - name of the Logger severity method to use (:debug, :info,
  #           :warn, :error, :fatal)
  def log(message, level)
    initialize_loggers if @stdout_logger.nil? || @file_logger.nil?

    @stdout_logger.public_send(level, message)
    @file_logger.public_send(level, message)
  end

  # Path of the current fasta file of the database called dbname.
  def path_for_builtin_database(dbname)
    "#{protein_database_root}/#{dbname}/current.fasta"
  end

  # True when a directory (or file) named dbname exists in the database root.
  def dbexist?(dbname)
    Pathname.new("#{protein_database_root}/#{dbname}").exist?
  end

  # Based on the database shortname and global database path, return the path
  # of the most current version of the required database
  # (<dbroot>/<dbname>/current.fasta).
  #
  # NOTE(review): earlier documentation said a full path given as dbname is
  # first imported as a temporary database; this method does not do that.
  # See #import_fasta_database for the import step.
  #
  # Raises RuntimeError if the database root directory does not exist.
  def current_database_for_name(dbname)
    dbroot = protein_database_root

    raise "Protein database directory not specified" if dbroot.nil?
    raise "Protein database directory #{dbroot} does not exist" unless Pathname.new(dbroot).exist?

    # Remove any trailing slashes or spaces from the end of dbroot if present.
    # A copy is used so the configured value itself is never modified.
    clean_root = dbroot.sub(%r{/*\s*\z}, '')

    "#{clean_root}/#{dbname}/current.fasta"
  end

  # Runs the given command in a local shell, logging its stdout at :info and
  # its stderr at :warn.
  #
  # Relies on Open4 (the open4 gem), which is not required by this file and
  # must have been loaded by the caller.
  #
  # Returns the exit status object produced by Open4::popen4.
  def run_local(command_string)
    log("Command: #{command_string} started", :info)

    status = Open4::popen4("#{command_string} ") do |pid, stdin, stdout, stderr|
      puts "PID #{pid}"

      # Nothing is ever written to the child; closing stdin lets a child that
      # reads it see end-of-file instead of waiting forever.
      stdin.close

      # Drain stderr concurrently with stdout. Reading them one after the
      # other can deadlock once the child fills the pipe not being read.
      stderr_reader = Thread.new { stderr.each { |line| log(line.chomp, :warn) } }
      stdout.each { |line| log(line.chomp, :info) }
      stderr_reader.join
    end

    # A non-zero status means the command failed, so log it as an error
    outcome_level = status != 0 ? :error : :info
    log("Command: #{command_string} exited with status #{status}", outcome_level)

    status
  end

  # Make the fasta file at path_to_fasta_file available as a temporary
  # database: symlink it into <dbroot>/tmp/ and build a BLAST index for it
  # unless blastdbcmd already reports one.
  #
  # Returns the path of the linked fasta file as a String.
  # Raises RuntimeError if the tmp directory or the link cannot be created.
  def import_fasta_database(dbroot, path_to_fasta_file)
    tmp_dbroot = Pathname.new("#{dbroot}/tmp/")
    dest_fasta_file_name = Pathname.new(path_to_fasta_file).basename
    dest_fasta_file_path = Pathname.new("#{tmp_dbroot}#{dest_fasta_file_name}")

    unless dest_fasta_file_path.exist?
      tmp_dbroot.mkdir unless tmp_dbroot.exist? && tmp_dbroot.directory?
      raise "Unable to make temporary database directory #{tmp_dbroot}" unless tmp_dbroot.exist?

      # Link directly rather than through `ln -s` in a shell, so paths that
      # contain spaces or shell metacharacters are handled correctly.
      unless dest_fasta_file_path.symlink?
        File.symlink(path_to_fasta_file.to_s, dest_fasta_file_path.to_s)
      end
    end

    # 'ncbi_tools_bin' is an optional setting; without it blastdbcmd is
    # expected alongside makeblastdb in <blast_root>/bin.
    ncbi_bin = self.ncbi_tools_bin || "#{blast_root}/bin"
    result = IO.popen(["#{ncbi_bin}/blastdbcmd", '-info', '-db', dest_fasta_file_path.to_s], &:read)

    # No output from blastdbcmd is taken to mean that no index exists yet
    if result == ""
      raise "Unable to create temporary database #{dest_fasta_file_path}" unless dest_fasta_file_path.exist?

      # NOTE(review): run_local goes through a shell, so a fasta path that
      # contains spaces is not quoted here.
      run_local("#{makeblastdb} -in #{dest_fasta_file_path} -parse_seqids")
    end

    dest_fasta_file_path.to_s
  end

  private

  # Translate the path stored under key into a real path: values starting
  # with '/' are returned unchanged, anything else (including a missing
  # value) is placed underneath the protk directory.
  def resolve_path(key)
    path = @env[key]
    return path if path.to_s.start_with?('/')

    "#{@protk_dir}/#{path}"
  end
end
@@ -0,0 +1,63 @@
1
+ <?xml version="1.0" encoding="ISO-8859-1"?>
2
+ <PARAMETERS version="1.3" xsi:noNamespaceSchemaLocation="http://open-ms.sourceforge.net/schemas/Param_1_3.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
3
+ <NODE name="FeatureFinderCentroided" description="Detects two-dimensional features in LC-MS data.">
4
+ <ITEM name="version" value="1.9.0" type="string" description="Version of the tool that generated this parameters file." tags="advanced" />
5
+ <NODE name="1" description="Instance &apos;1&apos; section for &apos;FeatureFinderCentroided&apos;">
6
+ <ITEM name="in" value="" type="string" description="input file" tags="input file,required" restrictions="*.mzML" />
7
+ <ITEM name="out" value="" type="string" description="output file" tags="output file,required" restrictions="*.featureXML" />
8
+ <ITEM name="seeds" value="" type="string" description="User specified seed list" tags="input file" restrictions="*.featureXML" />
9
+ <ITEM name="log" value="" type="string" description="Name of log file (created only when specified)" tags="advanced" />
10
+ <ITEM name="debug" value="0" type="int" description="Sets the debug level" tags="advanced" />
11
+ <ITEM name="threads" value="1" type="int" description="Sets the number of threads allowed to be used by the TOPP tool" />
12
+ <ITEM name="no_progress" value="false" type="string" description="Disables progress logging to command line" tags="advanced" restrictions="true,false" />
13
+ <ITEM name="test" value="false" type="string" description="Enables the test mode (needed for internal use only)" tags="advanced" restrictions="true,false" />
14
+ <NODE name="algorithm" description="Algorithm section">
15
+ <ITEM name="debug" value="false" type="string" description="When debug mode is activated, several files with intermediate results are written to the folder &apos;debug&apos; (do not use in parallel mode)." restrictions="true,false" />
16
+ <NODE name="intensity" description="Settings for the calculation of a score indicating if a peak&apos;s intensity is significant in the local environment (between 0 and 1)">
17
+ <ITEM name="bins" value="10" type="int" description="Number of bins per dimension (RT and m/z). The higher this value, the more local the intensity significance score is.#br#This parameter should be decreased, if the algorithm is used on small regions of a map." restrictions="1:" />
18
+ </NODE>
19
+ <NODE name="mass_trace" description="Settings for the calculation of a score indicating if a peak is part of a mass trace (between 0 and 1).">
20
+ <ITEM name="mz_tolerance" value="0.02" type="float" description="Tolerated m/z deviation of peaks belonging to the same mass trace.#br#It should be larger than the m/z resolution of the instument.#br#This value must be smaller than that 1/charge_high!" restrictions="0:" />
21
+ <ITEM name="min_spectra" value="10" type="int" description="Number of spectra that have to show a similar peak mass in a mass trace." restrictions="1:" />
22
+ <ITEM name="max_missing" value="1" type="int" description="Number of consecutive spectra where a high mass deviation or missing peak is acceptable.#br#This parameter should be well below &apos;min_spectra&apos;!" restrictions="0:" />
23
+ <ITEM name="slope_bound" value="0.1" type="float" description="The maximum slope of mass trace intensities when extending from the highest peak.#br#This parameter is important to seperate overlapping elution peaks.#br#It should be increased if feature elution profiles fluctuate a lot." restrictions="0:" />
24
+ </NODE>
25
+ <NODE name="isotopic_pattern" description="Settings for the calculation of a score indicating if a peak is part of a isotoipic pattern (between 0 and 1).">
26
+ <ITEM name="charge_low" value="1" type="int" description="Lowest charge to search for." restrictions="1:" />
27
+ <ITEM name="charge_high" value="4" type="int" description="Highest charge to search for." restrictions="1:" />
28
+ <ITEM name="mz_tolerance" value="0.04" type="float" description="Tolerated m/z deviation from the theoretical isotopic pattern.#br#It should be larger than the m/z resolution of the instument.#br#This value must be smaller than that 1/charge_high!" restrictions="0:" />
29
+ <ITEM name="intensity_percentage" value="10" type="float" description="Isotopic peaks that contribute more than this percentage to the overall isotope pattern intensity must be present." tags="advanced" restrictions="0:100" />
30
+ <ITEM name="intensity_percentage_optional" value="0.1" type="float" description="Isotopic peaks that contribute more than this percentage to the overall isotope pattern intensity can be missing." tags="advanced" restrictions="0:100" />
31
+ <ITEM name="optional_fit_improvement" value="2" type="float" description="Minimal percental improvement of isotope fit to allow leaving out an optional peak." tags="advanced" restrictions="0:100" />
32
+ <ITEM name="mass_window_width" value="25" type="float" description="Window width in Dalton for precalculation of estimated isotope distribtions." tags="advanced" restrictions="1:200" />
33
+ </NODE>
34
+ <NODE name="seed" description="Settings that determine which peaks are considered a seed">
35
+ <ITEM name="min_score" value="0.8" type="float" description="Minimum seed score a peak has to reach to be used as seed.#br#The seed score is the geometric mean of intensity score, mass trace score and isotope pattern score.#br#If your features show a large deviation from the averagene isotope distribution or from an gaussian elution profile, lower this score." restrictions="0:1" />
36
+ </NODE>
37
+ <NODE name="fit" description="Settings for the model fitting">
38
+ <ITEM name="epsilon_abs" value="0.0001" type="float" description="Absolute epsilon used for convergence of the fit." tags="advanced" restrictions="0:" />
39
+ <ITEM name="epsilon_rel" value="0.0001" type="float" description="Relative epsilon used for convergence of the fit." tags="advanced" restrictions="0:" />
40
+ <ITEM name="max_iterations" value="500" type="int" description="Maximum number of iterations of the fit." tags="advanced" restrictions="1:" />
41
+ </NODE>
42
+ <NODE name="feature" description="Settings for the features (intensity, quality assessment, ...)">
43
+ <ITEM name="min_score" value="0.7" type="float" description="Feature score threshold for a feature to be reported.#br#The feature score is the geometric mean of the average relative deviation and the correlation between the model and the observed peaks." restrictions="0:1" />
44
+ <ITEM name="min_isotope_fit" value="0.8" type="float" description="Minimum isotope fit of the feature before model fitting." tags="advanced" restrictions="0:1" />
45
+ <ITEM name="min_trace_score" value="0.5" type="float" description="Trace score threshold.#br#Traces below this threshold are removed after the model fitting.#br#This parameter is important for features that overlap in m/z dimension." tags="advanced" restrictions="0:1" />
46
+ <ITEM name="min_rt_span" value="0.333" type="float" description="Minimum RT span in relation to extended area that has to remain after model fitting." tags="advanced" restrictions="0:1" />
47
+ <ITEM name="max_rt_span" value="2.5" type="float" description="Maximum RT span in relation to extended area that the model is allowed to have." tags="advanced" restrictions="0.5:" />
48
+ <ITEM name="rt_shape" value="symmetric" type="string" description="Choose model used for RT profile fitting. If set to symmetric a gauss shape is used, in case of asymmetric an EGH shape is used." tags="advanced" restrictions="symmetric,asymmetric" />
49
+ <ITEM name="max_intersection" value="0.35" type="float" description="Maximum allowed intersection of features." tags="advanced" restrictions="0:1" />
50
+ <ITEM name="reported_mz" value="monoisotopic" type="string" description="The mass type that is reported for features.#br#&apos;maximum&apos; returns the m/z value of the highest mass trace.#br#&apos;average&apos; returns the intensity-weighted average m/z value of all contained peaks.#br#&apos;monoisotopic&apos; returns the monoisotopic m/z value derived from the fitted isotope model." restrictions="maximum,average,monoisotopic" />
51
+ </NODE>
52
+ <NODE name="user-seed" description="Settings for user-specified seeds.">
53
+ <ITEM name="rt_tolerance" value="5" type="float" description="Allowed RT deviation of seeds from the user-specified seed position." restrictions="0:" />
54
+ <ITEM name="mz_tolerance" value="1.1" type="float" description="Allowed m/z deviation of seeds from the user-specified seed position." restrictions="0:" />
55
+ <ITEM name="min_score" value="0.5" type="float" description="Overwrites &apos;seed:min_score&apos; for user-specified seeds. The cutoff is typically a bit lower in this case." restrictions="0:1" />
56
+ </NODE>
57
+ <NODE name="debug" description="">
58
+ <ITEM name="pseudo_rt_shift" value="500" type="float" description="Pseudo RT shift used when ." tags="advanced" restrictions="1:" />
59
+ </NODE>
60
+ </NODE>
61
+ </NODE>
62
+ </NODE>
63
+ </PARAMETERS>
@@ -0,0 +1,47 @@
1
+ # Lists applications and their package dependencies
2
+ # For each application here include a shell script named install-appname.sh
3
+ #
4
+
5
+ rvm:
6
+ - curl
7
+ - git
8
+ - patch
9
+ - build-essential
10
+ - openssl
11
+ - libreadline6
12
+ - libreadline6-dev
13
+ - openssl
14
+ - libreadline6
15
+ - libreadline6-dev
16
+ - git-core
17
+ - zlib1g
18
+ - zlib1g-dev
19
+ - libssl-dev
20
+ - autoconf
21
+ - libc6-dev
22
+ - ncurses-dev
23
+ - automake
24
+ - libtool
25
+ - bison
26
+ - subversion
27
+ - pkg-config
28
+
29
+ galaxy:
30
+ - mercurial
31
+
32
+ tpp:
33
+ - cpanminus
34
+ - g++
35
+ - subversion
36
+ - libbz2-dev
37
+ - swig
38
+ - expat
39
+ - libpng12-dev
40
+ - gnuplot
41
+ - libperl-dev
42
+ - build-essential
43
+ - libgd2-xpm
44
+ - libfuse-dev
45
+ - libcurl4-openssl-dev
46
+ - libxml2-dev
47
+ - libgd2-xpm-dev
@@ -0,0 +1,10 @@
1
+ # Lists applications and their package dependencies
2
+ # For each application here include a shell script named install-appname.sh
3
+ #
4
+
5
+ common:
6
+ - wget
7
+ - gd
8
+ - libpng12
9
+ - cpanm
10
+ - libxml2
@@ -0,0 +1,20 @@
1
+ # This file is part of protk
2
+ # Created by Ira Cooke 14/12/2010
3
+ #
4
+ # This file contains default global constants and settings.
5
+ # User editable settings are found in ~/.protk/config.yml
6
+ #
7
+
8
+ message_level: info
9
+ protein_database_root: Databases
10
+ plasmodb_annotation_database: plasmodb_annotation
11
+ uniprot_sprot_annotation_database: swissprot_annotation
12
+ uniprot_trembl_annotation_database: trembl_annotation
13
+ galaxy_root: galaxy
14
+ default_mascot_server: www.matrixscience.com
15
+ tpp_root: tools/tpp
16
+ omssa_root: tools/omssa
17
+ openms_root: tools/openms
18
+ msgfplus_root: tools/msgfplus
19
+ blast_root: tools/blast
20
+ log_file: Logs/protk.log
@@ -0,0 +1,19 @@
1
+ #
2
+ # This is a predefined setup file for manage_db
3
+ #
4
+ # The crap database from gpmdb
5
+ #
6
+ ---
7
+ :description: The crap database from gpmdb
8
+ :decoy_prefix: decoy_
9
+ :make_blast_index: true
10
+ :include_filters: []
11
+ :format: fasta
12
+ :id_regexes:
13
+ - sp\|(.*)\|
14
+ :decoys: true
15
+ :archive_old: true
16
+ :sources:
17
+ - - ftp://ftp.thegpm.org/fasta/crap/crap.fasta
18
+ - none
19
+ :is_annotation_db: false
@@ -0,0 +1,25 @@
1
+ #
2
+ # This is a predefined setup file for manage_db
3
+ #
4
+ # Swissprot database filtered for human entries only and appending the CRAP database from gpmdb
5
+ # Requires that CRAP is installed first
6
+ #
7
+ ---
8
+ :description: Swissprot database filtered for human entries only and appending the CRAP database from gpmdb
9
+ :decoy_prefix: decoy_
10
+ :make_msgf_index: true
11
+ :make_blast_index: true
12
+ :include_filters:
13
+ - - OS=Homo\ssapiens
14
+ - - .*
15
+ :format: fasta
16
+ :id_regexes:
17
+ - sp\|.*\|(.*?)\s
18
+ - sp\|(.*)\|
19
+ :decoys: true
20
+ :archive_old: true
21
+ :sources:
22
+ - - ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
23
+ - ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt
24
+ - downloads/ftp.gpmdb.org/fasta/crap/crap.fasta
25
+ :is_annotation_db: false
@@ -0,0 +1,20 @@
1
+ #
2
+ # This is a predefined setup file for manage_db
3
+ #
4
+ # Swissprot_uniprot annotation database (full entries for each protein)
5
+ #
6
+ ---
7
+ :description: Swissprot_uniprot annotation database (full entries for each protein)
8
+ :archive_old: false
9
+ :is_annotation_db: true
10
+ :decoy_prefix: decoy_
11
+ :include_filters: []
12
+
13
+ :format: dat
14
+ :id_regexes: []
15
+
16
+ :make_blast_index: false
17
+ :sources:
18
+ - - ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz
19
+ - ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt
20
+ :decoys: false
@@ -0,0 +1,20 @@
1
+ #
2
+ # This is a predefined setup file for manage_db
3
+ #
4
+ # Fasta files to be indexed for annotation (by protvis)
5
+ #
6
+ ---
7
+ :description: An indexed version of the swissprot database for annotation (sequences only)
8
+ :archive_old: false
9
+ :is_annotation_db: true
10
+ :decoy_prefix: decoy_
11
+ :make_blast_index: true
12
+ :format: fasta
13
+ :include_filters: []
14
+
15
+ :id_regexes: []
16
+
17
+ :sources:
18
+ - - ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
19
+ - ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt
20
+ :decoys: false
@@ -0,0 +1,56 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <bioml>
3
+
4
+ <note> TAXONOMY FILE. This is a file containing references to the sequence databases. Point it to your own taxonomy.xml if needed.</note>
5
+ <note type="input" label="list path, taxonomy information">Temporary tandem taxonomy file generated for each run</note>
6
+
7
+ <note> PROTEIN SEQUENCE DATABASE. This refers to identifiers in the taxonomy.xml, not the .fasta files themselves! Make sure the database you want is present as an entry in the taxonomy.xml referenced above. This is REQUIRED. </note>
8
+ <note type="input" label="protein, taxon">sphuman</note>
9
+
10
+ <note> FILE LOCATIONS. Replace them with your input (.mzXML) file and output file -- these are REQUIRED. Optionally a log file and a sequence output file of all protein sequences identified in the first-pass can be specified. Use of FULL (not relative) paths is recommended. </note>
11
+ <note type="input" label="spectrum, path">/var/www/ISB/data/Data/microTOF/1010/mt164/mt164-CD14LPS_RD4_01_3725.d_raw.mzML</note>
12
+ <note type="input" label="output, path">/var/www/ISB/data/Data/microTOF/1010/mt164/mt164-CD14LPS_RD4_01_3725.d_raw.tandem</note>
13
+
14
+ <note> DEFAULT PARAMETERS. The value of "isb_default_input_kscore.xml" is recommended. Change to "isb_default_input_native.xml" for native X!Tandem scoring.</note>
15
+ <note type="input" label="list path, default parameters">/usr/local/tpp-4-4-0/bin/isb_default_input_kscore.xml</note>
16
+
17
+ <note> FRAGMENT MASS TOLERANCES </note>
18
+ <note type="input" label="spectrum, fragment monoisotopic mass error">0.65</note>
19
+
20
+ <note> PRECURSOR MASS TOLERANCES. In the example below, a -2.0 Da to 4.0 Da (monoisotopic mass) window is searched for peptide candidates. Since this is a monoisotopic mass, for non-accurate-mass instruments, for which the precursor is often taken nearer to the isotopically averaged mass, an asymmetric tolerance (-2.0 Da to 4.0 Da) is preferable. This somewhat imitates a (-3.0 Da to 3.0 Da) window for averaged mass (but not exactly).</note>
21
+ <note type="input" label="spectrum, parent monoisotopic mass error minus">100</note>
22
+ <note type="input" label="spectrum, parent monoisotopic mass error plus">100</note>
23
+ <note type="input" label="spectrum, parent monoisotopic mass error units">ppm</note>
24
+ <note>The value for this parameter may be 'Daltons' or 'ppm': all other values are ignored</note>
25
+ <note type="input" label="spectrum, parent monoisotopic mass isotope error">yes</note>
26
+ <note>This allows peptide candidates in windows around -1 Da and -2 Da from the acquired mass to be considered. Only applicable when the minus/plus window above is set to less than 0.5 Da. Good for accurate-mass instruments for which the reported precursor mass is not corrected to the monoisotopic mass. </note>
27
+
28
+
29
+ <note> MODIFICATIONS. In the example below, there is a static (carbamidomethyl) modification on C, and variable modifications on M (oxidation). Multiple modifications can be separated by commas, as in "80.0@S,80.0@T". Peptide terminal modifications can be specified with the symbol '[' for N-terminus and ']' for C-terminus, such as 42.0@[ . </note>
30
+ <note id="carbamidomethyl-fixed" type="input" label="residue, modification mass">57.021464@C</note>
31
+ <note id="methionine-oxidation-variable" type="input" label="residue, potential modification mass">15.994915@M</note>
32
+ <note id="glyco-variable" type="input" label="residue, potential modification motif">0.998@N!{P}[ST]</note>
33
+ <note> You can specify a variable modification when present in a motif. For instance, 0.998@N!{P}[ST] is a deamidation modification on N only if it is present in an N[any but P][S or T] motif (N-glycosite). </note>
34
+
35
+ <note type="input" label="protein, N-terminal residue modification mass"></note>
36
+ <note type="input" label="protein, C-terminal residue modification mass"></note>
37
+ <note> These are *static* modifications on the PROTEINS' N or C-termini. </note>
38
+
39
+ <note> SEMI-TRYPTICS AND MISSED CLEAVAGES. In the example below, semitryptic peptides are allowed, and up to 2 missed cleavages are allowed. </note>
40
+ <note type="input" label="protein, cleavage semi">yes</note>
41
+ <note type="input" label="scoring, maximum missed cleavage sites">2</note>
42
+
43
+ <note> REFINEMENT. Do not use unless you know what you are doing. Set "refine" to "yes" and specify what you want to search in the refinement. For non-confusing results, repeat the same modifications you set above for the first-pass here.</note>
44
+ <note type="input" label="refine">no</note>
45
+ <note type="input" label="refine, maximum valid expectation value">0.1</note>
46
+ <note type="input" label="refine, modification mass">57.012@C</note>
47
+ <note type="input" label="refine, potential modification mass">15.994915@M</note>
48
+ <note type="input" label="refine, potential modification motif"></note>
49
+ <note type="input" label="refine, cleavage semi">yes</note>
50
+ <note type="input" label="refine, unanticipated cleavage">no</note>
51
+ <note type="input" label="refine, potential N-terminus modifications"></note>
52
+ <note type="input" label="refine, potential C-terminus modifications"></note>
53
+ <note type="input" label="refine, point mutations">no</note>
54
+ <note type="input" label="refine, use potential modifications for full refinement">no</note>
55
+
56
+ </bioml>
@@ -0,0 +1,9 @@
1
+ <?xml version="1.0"?>
2
+ <bioml label="x! taxon-to-file matching list">
3
+
4
+ <taxon label="database_label">
5
+ <file format="peptide" URL="full_path_to_database_fasta_file" />
6
+ </taxon>
7
+
8
+ </bioml>
9
+