protk 1.1.0.pre

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. data/README.md +85 -0
  2. data/bin/annotate_ids.rb +59 -0
  3. data/bin/big_search.rb +41 -0
  4. data/bin/correct_omssa_retention_times.rb +27 -0
  5. data/bin/feature_finder.rb +76 -0
  6. data/bin/file_convert.rb +157 -0
  7. data/bin/generate_omssa_loc.rb +42 -0
  8. data/bin/interprophet.rb +91 -0
  9. data/bin/make_decoy.rb +64 -0
  10. data/bin/manage_db.rb +123 -0
  11. data/bin/mascot_search.rb +187 -0
  12. data/bin/mascot_to_pepxml.rb +44 -0
  13. data/bin/msgfplus_search.rb +191 -0
  14. data/bin/omssa_search.rb +205 -0
  15. data/bin/peptide_prophet.rb +245 -0
  16. data/bin/pepxml_to_table.rb +78 -0
  17. data/bin/protein_prophet.rb +140 -0
  18. data/bin/protk_setup.rb +31 -0
  19. data/bin/repair_run_summary.rb +113 -0
  20. data/bin/tandem_search.rb +292 -0
  21. data/bin/template_search.rb +144 -0
  22. data/bin/unimod_to_loc.rb +118 -0
  23. data/bin/xls_to_table.rb +46 -0
  24. data/ext/protk/extconf.rb +3 -0
  25. data/ext/protk/protk.c +235 -0
  26. data/lib/protk/big_search_rakefile.rake +16 -0
  27. data/lib/protk/big_search_tool.rb +23 -0
  28. data/lib/protk/bio_sptr_extensions.rb +210 -0
  29. data/lib/protk/biotools_excel_converter.rb +60 -0
  30. data/lib/protk/command_runner.rb +84 -0
  31. data/lib/protk/constants.rb +296 -0
  32. data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
  33. data/lib/protk/data/apt-get_packages.yaml +47 -0
  34. data/lib/protk/data/brew_packages.yaml +10 -0
  35. data/lib/protk/data/default_config.yml +20 -0
  36. data/lib/protk/data/predefined_db.crap.yaml +19 -0
  37. data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
  38. data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
  39. data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
  40. data/lib/protk/data/tandem_params.xml +56 -0
  41. data/lib/protk/data/taxonomy_template.xml +9 -0
  42. data/lib/protk/data/unimod.xml +16780 -0
  43. data/lib/protk/eupathdb_gene_information_table.rb +158 -0
  44. data/lib/protk/galaxy_stager.rb +24 -0
  45. data/lib/protk/galaxy_util.rb +9 -0
  46. data/lib/protk/manage_db_rakefile.rake +484 -0
  47. data/lib/protk/manage_db_tool.rb +181 -0
  48. data/lib/protk/mascot_util.rb +63 -0
  49. data/lib/protk/omssa_util.rb +57 -0
  50. data/lib/protk/plasmodb.rb +50 -0
  51. data/lib/protk/prophet_tool.rb +85 -0
  52. data/lib/protk/protein_annotator.rb +646 -0
  53. data/lib/protk/protxml.rb +137 -0
  54. data/lib/protk/randomize.rb +7 -0
  55. data/lib/protk/search_tool.rb +182 -0
  56. data/lib/protk/setup_rakefile.rake +245 -0
  57. data/lib/protk/setup_tool.rb +19 -0
  58. data/lib/protk/spreadsheet_extensions.rb +78 -0
  59. data/lib/protk/swissprot_database.rb +38 -0
  60. data/lib/protk/tool.rb +182 -0
  61. data/lib/protk/xtandem_defaults.rb +11 -0
  62. data/lib/protk.rb +18 -0
  63. metadata +256 -0
data/README.md ADDED
@@ -0,0 +1,85 @@
1
+ # protk ( Proteomics toolkit )
2
+
3
+
4
+ ***
5
+ ## What is it?
6
+
7
+ Protk is a wrapper for various proteomics tools. Initially it focusses on MS/MS database search and validation.
8
+
9
+ ## Why do we need a wrapper around these tools
10
+
11
+ The aim of protk is to present a consistent interface to numerous proteomics tools that is as uniform as possible. Protk also provides built-in support for managing protein databases.
12
+
13
+ ***
14
+
15
+
16
+
17
+ ## Basic Installation
18
+
19
+ 1. Install rvm
20
+ curl -L https://get.rvm.io | bash -s stable
21
+
22
+ On OSX
23
+ - rvm install 1.9.3 --with-gcc=clang
24
+ - rvm use 1.9.3
25
+ - gem install protk
26
+ - protk_setup.rb all
27
+
28
+ On Linux
29
+ - rvm install 1.9.3
30
+ - rvm use 1.9.3
31
+ - gem install protk
32
+ - sudo protk_setup.rb system_dependencies
33
+ - protk_setup.rb all
34
+
35
+
36
+ ## Sequence databases
37
+
38
+ After running the setup.sh script you should run manage_db.rb to install specific sequence databases for use by the search engines. Protk comes with several predefined database configurations. For example, to install a database consisting of human entries from Swissprot plus known contaminants use the following command;
39
+
40
+ manage_db.rb add sphuman
41
+
42
+ You should now be able to run database searches, specifying this database by using the -d sphuman flag. Every month or so swissprot will release a new database version. You can keep your database up to date using;
43
+
44
+ manage_db.rb update sphuman
45
+
46
+ This will update the database only if any of its source files (or ftp release notes) have changed. The manage_db.rb tool also allows completely custom databases to be configured. Setup requires quite a few command-line options, but once set up, databases can easily be updated without further configuration. The example below shows the command-line arguments required to manually configure the sphuman database.
47
+
48
+ manage_db.rb add --ftp-source 'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt' --include-filters '/OS=Homo\ssapiens/' --id-regex 'sp\|.*\|(.*?)\s' --add-decoys --make-blast-index --archive-old sphuman
49
+
50
+
51
+ ## Galaxy integration
52
+
53
+ Although all the protk tools can be run directly from the command-line a nicer way to run them (and visualise outputs) is to use the galaxy web application.
54
+
55
+ 1. Check out and install the latest stable galaxy [see the official galaxy wiki for more detailed setup instructions](http://wiki.g2.bx.psu.edu/Admin/Get%20Galaxy "galaxy wiki")
56
+
57
+ hg clone https://bitbucket.org/galaxy/galaxy-dist
58
+ cd galaxy-dist
59
+ sh run.sh
60
+
61
+ 2. Make the protk tools available to galaxy.
62
+ - Create a directory for galaxy tool dependencies. It's best if this directory is outside the galaxy-dist directory. I usually create a directory called `tool_depends` alongside `galaxy-dist`.
63
+ - Open the file `universe_wsgi.ini` in the `galaxy-dist` directory and set the configuration option `tool_dependency_dir` to point to the directory you just created
64
+ - Create a symbolic link from the protk directory to the appropriate subdirectory of `<tool_dependency_dir>`. In the instructions below substitute 1.0.0 for the version number of [the protk galaxy tools](https://bitbucket.org/iracooke/protk-toolshed "protk galaxy tools") you are using.
65
+
66
+ cd <tool_dependency_dir>
67
+ mkdir protk
68
+ cd protk
69
+ mkdir 1.0.0
70
+ ln -s 1.0.0 default
71
+ ln -s <path_where_protk_was_installed> 1.0.0/bin
72
+
73
+ 3. Configure the shell in which galaxy tools will run.
74
+ - Create a symlink to the `env.sh` file so it will be sourced by galaxy as it runs each tool. This file should have been autogenerated by `setup.sh`
75
+
76
+ ln -s <path_where_protk_was_installed>/env.sh 1.0.0/env.sh
77
+
78
+ 4. Install the protk galaxy wrapper tools from the galaxy toolshed. You will need to restart galaxy after doing so for the new datatype sniffers to be activated.
79
+
80
+ 5. After installing the protk wrapper tools from the toolshed it will be necessary to tell those tools about databases you have installed. Use the manage_db.rb tool to do this. First edit config.yml to make sure the `galaxy_root` setting points to the root directory of your galaxy installation (this will allow `manage_db.rb` to update the `pepxml_databases.loc` file inside `galaxy_root/tool-data`). Then run the following command and restart the galaxy server;
81
+
82
+ manage_db.rb list -G
83
+
84
+
85
+
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of MSLIMS
4
+ # Created by Ira Cooke 21/7/2011
5
+ #
6
+ # Takes an input file with a list of identified proteins and creates a table with swissprot/uniprot database details in various columns for each protein in the input file.
7
+ #
8
+ #
9
+ require 'protk/constants'
10
+ require 'protk/command_runner'
11
+ require 'protk/prophet_tool'
12
+ require 'protk/protein_annotator'
13
+
14
+
15
+
16
+ # Setup specific command-line options for this tool. Other options are inherited from Tool
17
+ #
18
+ id_tool=ProphetTool.new({:explicit_output=>true,:over_write=>true})
19
+ id_tool.option_parser.banner = "Run ID annotation on a prot.xml input file.\n\nUsage: annotate_ids.rb [options] file1.prot.xml"
20
+ id_tool.options.output_prefix="annotated_"
21
+
22
+
23
+ id_tool.options.input_format=nil
24
+ id_tool.option_parser.on( '-I', '--input-format format', 'Format of input file' ) do |format|
25
+ id_tool.options.input_format = format
26
+ end
27
+
28
+ id_tool.option_parser.parse!
29
+
30
+ # Obtain a global environment object
31
+ genv=Constants.new
32
+
33
+ input_file=ARGV[0]
34
+
35
+ database_file=id_tool.extract_db(input_file)
36
+
37
+ output_file=nil
38
+
39
+ if ( id_tool.explicit_output==nil)
40
+ output_file="#{id_tool.output_prefix}#{input_file}#{id_tool.output_suffix}.xls"
41
+ else
42
+ output_file=id_tool.explicit_output
43
+ end
44
+
45
+ converter=ProteinAnnotator.new
46
+
47
+ begin
48
+ outpath=Pathname.new(output_file)
49
+
50
+ if ( id_tool.over_write || !outpath.exist? )
51
+ converter.convert(input_file,output_file,id_tool.input_format)
52
+ else
53
+ p "Output file #{output_file} already exists"
54
+ end
55
+
56
+ rescue Exception
57
+ p "Couldn't convert #{input_file}"
58
+ raise
59
+ end
data/bin/big_search.rb ADDED
@@ -0,0 +1,41 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 14/12/2010
5
+ #
6
+ # Runs an MS/MS search using multiple search engines on multiple files in parallel
7
+ # Merges results using interprophet to produce a single output file
8
+ #
9
+ # This tool assumes that datasets are from an ESI-QUAD-TOF instrument
10
+ #
11
+ require 'protk/constants'
12
+ require 'protk/command_runner'
13
+ require 'protk/search_tool'
14
+ require 'protk/big_search_tool'
15
+ require 'rest_client'
16
+ require 'rake'
17
+
18
+ # Environment with global constants
19
+ #
20
+ genv=Constants.new
21
+
22
+ # Setup specific command-line options for this tool. Other options are inherited from SearchTool
23
+ #
24
+ search_tool=SearchTool.new({:msms_search=>true,:background=>false,:database=>true,:over_write=>true,:glyco=>true,:explicit_output=>true})
25
+ search_tool.jobid_prefix="b"
26
+
27
+ search_tool.option_parser.banner = "Run a multi-search engine search on a set of input files.\n\nUsage: big_search.rb [options] file1.mzML file2.mzML ..."
28
+ search_tool.options.output_suffix="_multisearch"
29
+
30
+
31
+ search_tool.options.ncpu=1
32
+ search_tool.option_parser.on( '-N', '--ncpu n', 'Split tasks into n separate processes if possible' ) do |n|
33
+ search_tool.options.ncpu=n
34
+ end
35
+
36
+ search_tool.option_parser.parse!
37
+
38
+ bgsrch = BigSearchTool.new
39
+
40
+
41
+ p bgsrch.run ["hi", "howdy"]
@@ -0,0 +1,27 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 14/12/2010
5
+ #
6
+ # Corrects retention times in omssa output
7
+ #
8
+
9
+ $VERBOSE=nil
10
+
11
+ require 'protk/constants'
12
+ require 'protk/command_runner'
13
+ require 'protk/tool'
14
+ require 'protk/omssa_util'
15
+
16
+ # Environment with global constants
17
+ #
18
+ genv=Constants.new
19
+
20
+ tool=Tool.new
21
+ tool.option_parser.banner = "Correct retention times on a pepxml file produced by omssa using information from an mgf file.\n\nUsage: correct_omssa_retention_times.rb [options] file1.pep.xml file2.mgf"
22
+ tool.option_parser.parse!
23
+
24
+
25
+ OMSSAUtil.add_retention_times(ARGV[1],ARGV[0],tool.over_write,true)
26
+
27
+
@@ -0,0 +1,76 @@
1
+ #
2
+ # This file is part of protk
3
+ # Created by Ira Cooke 21/3/2012
4
+ #
5
+ # A wrapper for the OpenMS FeatureFinder tools (FeatureFinderCentroided and FeatureFinderIsotopeWavelet)
6
+ #
7
+ #
8
+ #!/bin/sh
9
+ if [ -z "$PROTK_RUBY_PATH" ] ; then
10
+ PROTK_RUBY_PATH=`which ruby`
11
+ fi
12
+
13
+ eval 'exec "$PROTK_RUBY_PATH" $PROTK_RUBY_FLAGS -rubygems -x -S $0 ${1+"$@"}'
14
+ echo "The 'exec \"$PROTK_RUBY_PATH\" -x -S ...' failed!" >&2
15
+ exit 1
16
+ #! ruby
17
+ #
18
+
19
+ $LOAD_PATH.unshift("#{File.dirname(__FILE__)}/lib/")
20
+
21
+ require 'constants'
22
+ require 'command_runner'
23
+ require 'tool'
24
+
25
+ # Setup specific command-line options for this tool. Other options are inherited from Tool
26
+ #
27
+ tool=Tool.new({:explicit_output=>true, :background=>true,:over_write=>true})
28
+ tool.option_parser.banner = "Find molecular features on a set of input files.\n\nUsage: feature_finder.rb [options] file1.mzML file2.mzML ..."
29
+
30
+ tool.options.profile = false
31
+ tool.option_parser.on( '--profile',"Input files are profile data" ) do
32
+ tool.options.profile = true
33
+ end
34
+
35
+ tool.option_parser.parse!
36
+
37
+ # Obtain a global environment object
38
+ genv=Constants.new
39
+
40
+ def run_ff(genv,tool,cmd,output_path,jobid) # Runs cmd via tool.run unless output_path already exists and overwriting is disabled
41
+ if ( !tool.over_write && Pathname.new(output_path).exist? )
42
+ genv.log("Skipping analysis on existing file #{output_path}",:warn)
43
+ else
44
+ jobscript_path="#{output_path}.pbs.sh" # job script path handed to tool.run, named after the output file
45
+ job_params={:jobid=>jobid, :vmem=>"12Gb", :queue => "sixteen"}
46
+ code=tool.run(cmd,genv,job_params,jobscript_path)
47
+ throw "Command failed with exit code #{code}" unless code==0 # NOTE(review): no matching catch is visible for this throw; raise may be what was intended - confirm
48
+ end
49
+ end
50
+
51
+
52
+ throw "Cannot use explicit output in combination with multiple input files" if ( tool.explicit_output && ARGV.length>1)
53
+ throw "The profile option is not yet implemented" if ( tool.profile )
54
+
55
+ ini_file="#{File.dirname(__FILE__)}/params/FeatureFinderCentroided.ini"
56
+
57
+ ARGV.each do |filen|
58
+ input_file=filen.chomp
59
+ throw "Input must be an mzML file" unless input_file=~/\.mzML$/
60
+
61
+ input_basename=input_file.gsub(/\.mzML$/,'')
62
+ output_filename=tool.explicit_output
63
+ output_file="#{input_basename}.featureXML" if output_filename==nil
64
+
65
+ if ( tool.over_write || !Pathname.new(output_file).exist? )
66
+ output_dir=Pathname.new(output_file).dirname.realpath.to_s
67
+ output_base_filename=Pathname.new(output_file).basename.to_s
68
+ cmd=""
69
+ cmd<<"#{genv.openms_root}/FeatureFinderCentroided -in #{Pathname.new(input_file).realpath.to_s} -out #{output_dir}/#{output_base_filename} -ini #{ini_file}"
70
+
71
+ run_ff(genv,tool,cmd,output_file,tool.jobid_from_filename(input_basename))
72
+
73
+ else
74
+ genv.log("Skipping search on existing file #{output_file}",:warn)
75
+ end
76
+ end
@@ -0,0 +1,157 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 14/12/2010
5
+ #
6
+ # Wrapper for msconvert
7
+ #
8
+
9
+ require 'protk/constants'
10
+ require 'protk/command_runner'
11
+ require 'protk/tool'
12
+ require 'tempfile'
13
+ require 'libxml'
14
+
15
+ include LibXML
16
+
17
+
18
+ # Read the input file and search for an instance of the charge state cvParam inside a precursor tag. Return true if one is found. False otherwise
19
+ #
20
+ def has_charge_information(input_filename) # Returns true if any precursor element in the file mentions accession MS:1000041, false otherwise
21
+ #<precursorList count="1">
22
+ # <precursor spectrumRef="controllerType=0 controllerNumber=1 scan=59">
23
+ # <isolationWindow>
24
+ # <cvParam cvRef="MS" accession="MS:1000827" name="isolation window target m/z" value="939.43" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
25
+ # <cvParam cvRef="MS" accession="MS:1000828" name="isolation window lower offset" value="2.0" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
26
+ # <cvParam cvRef="MS" accession="MS:1000829" name="isolation window upper offset" value="2.0" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
27
+ # </isolationWindow>
28
+ # <selectedIonList count="1">
29
+ # <selectedIon>
30
+ # <cvParam cvRef="MS" accession="MS:1000744" name="selected ion m/z" value="939.432189941406" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
31
+ # <cvParam cvRef="MS" accession="MS:1000041" name="charge state" value="2"/>
32
+ # <cvParam cvRef="MS" accession="MS:1000042" name="peak intensity" value="1321.692016601563" unitCvRef="MS" unitAccession="MS:1000131" unitName="number of counts"/>
33
+ # </selectedIon>
34
+ # </selectedIonList>
35
+ # The sample above shows the structure being searched; MS:1000041 is the accession of the "charge state" cvParam.
36
+ reader=XML::Reader.file(input_filename)
37
+ # Step through the document one node at a time. NOTE(review): the reader is never explicitly closed here - confirm whether that matters.
38
+ while(reader.read)
39
+
40
+ if ( reader.local_name=="precursor")
41
+ # Take the element's inner XML as a string so it can be searched as plain text.
42
+ subdoc=reader.read_inner_xml
43
+
44
+ if ( subdoc =~ /MS:1000041/ )
45
+ return true
46
+ end
47
+
48
+ end
49
+
50
+ end
51
+ # No precursor element contained the charge state accession.
52
+ return false
53
+
54
+ end
55
+
56
+
57
+
58
+ # Setup specific command-line options for this tool. Other options are inherited from Tool
59
+ #
60
+ convert_tool=Tool.new({:explicit_output=>true,:over_write=>true,:maldi=>true})
61
+ convert_tool.option_parser.banner = "Convert files between different formats.\n\nUsage: file_convert.rb [options] input_file output_file"
62
+
63
+ # Special case (usually tool specific options use capitals). Use lowercase l here to mimic the maldi option in the search_tool class
64
+ #
65
+ convert_tool.options.maldi=false
66
+ convert_tool.option_parser.on( '-l', '--maldi', 'Input Files are MALDI Spectra' ) do
67
+ convert_tool.options.maldi=true
68
+ end
69
+
70
+ convert_tool.options.output_format="mgf"
71
+ convert_tool.option_parser.on( '-F', '--format fmt', 'Convert to a specified format' ) do |fmt|
72
+ convert_tool.options.output_format=fmt
73
+ end
74
+
75
+ #convert_tool.options.missing_charge_state="false"
76
+ #convert_tool.option_parser.on( '-C', '--missing-charges', 'No attempt will be made to write charge states. Leads to better looking spectrum names' ) do |fmt|
77
+ # convert_tool.options.output_format=fmt
78
+ #end
79
+ #end
80
+
81
+
82
+
83
+ convert_tool.option_parser.parse!
84
+
85
+
86
+
87
+ # Environment with global constants
88
+ #
89
+ genv=Constants.new
90
+
91
+ filename=ARGV[0]
92
+
93
+
94
+ input_ext=Pathname.new(filename).extname
95
+ input_relative_filename=Pathname.new(filename).basename.to_s
96
+
97
+ base_output_dir=Pathname.new(filename).dirname.realpath.to_s #Default output dir is input dir
98
+
99
+ output_basename=input_relative_filename.gsub(/#{input_ext}$/,"").to_s
100
+
101
+ if ( convert_tool.explicit_output )
102
+ output_filepath=Pathname.new(convert_tool.explicit_output)
103
+ base_output_dir=output_filepath.dirname.to_s
104
+
105
+ if ( convert_tool.explicit_output=~/^\//) # It's an absolute path so use absolute path as output dir
106
+ # Convert base_output_dir to realpath
107
+ #
108
+ base_output_dir=Pathname.new(base_output_dir).realpath.to_s
109
+ end
110
+
111
+ output_filename=output_filepath.basename.to_s
112
+
113
+ end
114
+
115
+ # Create a uniquely named directory to hold the output. This is the only way to know the output of msconvert
116
+ #
117
+ output_dir="#{base_output_dir}/#{Pathname.new(Tempfile.new("file_convert").path).basename.to_s}"
118
+ Dir.mkdir(output_dir)
119
+
120
+
121
+ throw "Input format is the same as output format" if ( input_ext==".#{convert_tool.output_format}" )
122
+
123
+ genv.log("Converting #{filename} to #{convert_tool.output_format}",:info)
124
+ runner=CommandRunner.new(genv)
125
+ basedir=Pathname.new(filename).dirname.to_s #Where we run the tool
126
+
127
+ if ( convert_tool.maldi )
128
+ #For MALDI we know the charge is 1 so set it explicitly. Sometimes it is missing from the data
129
+ runner.run_local("cd #{basedir}; #{genv.tpp_root}/msconvert #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.1\" --#{convert_tool.output_format} -o #{output_dir}")
130
+ else
131
+ if ( has_charge_information(filename) )
132
+ runner.run_local("cd #{basedir}; #{genv.tpp_root}/msconvert #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.<ChargeState>\" --#{convert_tool.output_format} -o #{output_dir}")
133
+ else
134
+ # If input file is missing charges the best we can do is just assign charge=1. Search engines can choose to ignore this value anyway.
135
+ #
136
+ runner.run_local("cd #{basedir}; #{genv.tpp_root}/msconvert #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.1\" --#{convert_tool.output_format} -o #{output_dir}")
137
+ end
138
+ end
139
+
140
+ # Find out what the output name was
141
+ #
142
+ tmp_output_filename=""
143
+ Dir.foreach(output_dir) { |entry_name|
144
+ if ( entry_name=~/^\.$/ || entry_name=~/^\.\.$/ )
145
+ else
146
+ tmp_output_filename=entry_name
147
+ end
148
+ }
149
+
150
+ # Cleanup after converting
151
+ cmd = "cd #{output_dir};pwd; mv #{tmp_output_filename} #{base_output_dir}/#{output_filename}; cd ../; pwd;rm -r #{output_dir}"
152
+
153
+ code =runner.run_local(cmd)
154
+
155
+ throw "Command failed with exit code #{code}" unless code==0
156
+
157
+ throw "Failed to create output file #{base_output_dir}/#{output_filename}" unless ( FileTest.exists?("#{base_output_dir}/#{output_filename}") )
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of MSLIMS
4
+ # Created by Ira Cooke 12/4/2010
5
+ #
6
+ # Generates files required by the omssa galaxy wrapper
7
+ #
8
+
9
+ require 'protk/constants'
10
+ # Environment with global constants
11
+ #
12
+ genv=Constants.new
13
+
14
+ # Set search engine specific parameters on the SearchTool object
15
+ #
16
+ omssa_root="#{genv.omssa_root}/omssacl"
17
+ # Get omssa to print out a list of its acceptable modifications
18
+ acceptable_mods=%x[#{omssa_root} -ml].split(/\n/).collect do |mod|
19
+
20
+ mod_vals=mod.split(":")
21
+ [mod_vals[0].lstrip.rstrip,mod_vals[1].lstrip.rstrip]
22
+
23
+ end
24
+
25
+ # Drop the header
26
+ #
27
+ acceptable_mods.shift
28
+
29
+ loc_output=File.new("omssa_mods.loc",'w')
30
+
31
+ loc_output << "#This file lists the names of chemical modifications accepted by OMMSA\n"
32
+ loc_output << "#\n"
33
+ loc_output << "#\n"
34
+
35
+ acceptable_mods.each { |am|
36
+ key = am[1].downcase.gsub(" ","").gsub("\(","\_").gsub("\)","\_").gsub("\:","\_").gsub("\-\>","\_")
37
+ loc_output << "#{am[1]}\t#{key}_\t#{am[0]}\t#{key}_\n"
38
+ }
39
+
40
+ loc_output.close
41
+
42
+
@@ -0,0 +1,91 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 18/1/2011
5
+ #
6
+ # Runs the InterProphet tool on a set of pep.xml files generated by peptide_prophet
7
+ #
8
+ #
9
+
10
+ require 'protk/constants'
11
+ require 'protk/command_runner'
12
+ require 'protk/prophet_tool'
13
+
14
+ # Setup specific command-line options for this tool. Other options are inherited from ProphetTool
15
+ #
16
+ prophet_tool=ProphetTool.new({:explicit_output=>true})
17
+ prophet_tool.option_parser.banner = "Run InterProphet on a set of pep.xml input files.\n\nUsage: interprophet.rb [options] file1.pep.xml file2.pep.xml ..."
18
+ prophet_tool.options.output_suffix="_iproph"
19
+
20
+
21
+ prophet_tool.options.no_nss=""
22
+ prophet_tool.option_parser.on( '--no-nss', 'Don\'t use NSS (Number of Sibling Searches) in Model' ) do
23
+ prophet_tool.options.no_nss="NONSS"
24
+ end
25
+
26
+ prophet_tool.options.no_nrs=""
27
+ prophet_tool.option_parser.on('--no-nrs', 'Don\'t use NRS (Number of Replicate Spectra) in Model' ) do
28
+ prophet_tool.options.no_nrs="NONRS"
29
+ end
30
+
31
+ prophet_tool.options.no_nse=""
32
+ prophet_tool.option_parser.on('--no-nse', 'Don\'t use NSE (Number of Sibling Experiments) in Model' ) do
33
+ prophet_tool.options.no_nse="NONSE"
34
+ end
35
+
36
+ prophet_tool.options.no_nsi=""
37
+ prophet_tool.option_parser.on("--no-nsi",'Don\'t use NSE (Number of Sibling Ions) in Model' ) do
38
+ prophet_tool.options.no_nsi="NONSI"
39
+ end
40
+
41
+ prophet_tool.options.no_nsm=""
42
+ prophet_tool.option_parser.on("--no-nsm",'Don\'t use NSE (Number of Sibling Modifications) in Model' ) do
43
+ prophet_tool.options.no_nsm="NONSM"
44
+ end
45
+
46
+ prophet_tool.options.min_prob=""
47
+ prophet_tool.option_parser.on("--minprob mp","Minimum probability cutoff ") do |mp|
48
+ prophet_tool.options.min_prob=mp
49
+ end
50
+
51
+ prophet_tool.option_parser.parse!
52
+
53
+
54
+ # Obtain a global environment object
55
+ genv=Constants.new
56
+
57
+ if ( prophet_tool.explicit_output != nil )
58
+ output_file=prophet_tool.explicit_output
59
+ else
60
+ output_file="#{prophet_tool.output_prefix}interact#{prophet_tool.output_suffix}.pep.xml"
61
+ end
62
+
63
+ if ( !Pathname.new(output_file).exist? || prophet_tool.over_write )
64
+
65
+ cmd="#{genv.interprophetparser} #{prophet_tool.options.no_nss} #{prophet_tool.options.no_nrs} #{prophet_tool.options.no_nse} #{prophet_tool.options.no_nsi} #{prophet_tool.options.no_nsm}"
66
+ cmd << " MINPROB=#{min_prob}" if ( prophet_tool.options.min_prob !="" )
67
+
68
+ inputs = ARGV.collect {|file_name|
69
+ file_name.chomp
70
+ }
71
+
72
+ cmd << " #{inputs.join(" ")} #{output_file}"
73
+
74
+ genv.log("Running #{cmd}",:info)
75
+
76
+ # Run the analysis
77
+ #
78
+ jobscript_path="#{output_file}.pbs.sh"
79
+ job_params={:jobid=>"iprophet", :vmem=>"900mb", :queue => "lowmem"}
80
+ code = prophet_tool.run(cmd,genv,job_params,jobscript_path)
81
+ throw "Command failed with exit code #{code}" unless code==0
82
+
83
+ else
84
+ genv.log("Interprophet output file #{output_file} already exists. Run with -r option to replace",:warn)
85
+ end
86
+
87
+
88
+
89
+
90
+
91
+
data/bin/make_decoy.rb ADDED
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 9/3/2012
5
+ #
6
+ # Create a decoy database based on a set of real protein sequences
7
+ #
8
+ #
9
+
10
+ require 'libxml'
11
+ require 'protk/constants'
12
+ require 'protk/command_runner'
13
+ require 'protk/tool'
14
+ require 'bio'
15
+
16
+ include LibXML
17
+
18
+ # Setup specific command-line options for this tool. Other options are inherited from Tool
19
+ #
20
+ tool=Tool.new({:explicit_output=>true})
21
+ tool.option_parser.banner = "Create a decoy database from real protein sequences.\n\nUsage: make_decoy.rb [options] realdb.fasta"
22
+
23
+ tool.options.db_length=0
24
+ tool.option_parser.on('-L len','--db-length len','Number of sequences to generate') do |len|
25
+ tool.options.db_length=len.to_i
26
+ end
27
+
28
+ tool.options.prefix_string="decoy_"
29
+ tool.option_parser.on('-P str','--prefix-string str','String to prepend to sequence ids') do |str|
30
+ tool.options.prefix_string=str
31
+ end
32
+
33
+ tool.options.append=false
34
+ tool.option_parser.on('-A','--append','Append input sequences to the generated database') do
35
+ tool.options.append=true
36
+ end
37
+
38
+
39
+ tool.option_parser.parse!
40
+
41
+ input_file=ARGV[0]
42
+
43
+
44
+ db_length=tool.db_length
45
+ if ( db_length==0) #If no db length was specified use the number of entries in the input file
46
+ db_length=Bio::FastaFormat.open(input_file).count
47
+ p "Found #{db_length} entries in input file"
48
+ end
49
+
50
+ output_file="decoy_#{input_file}"
51
+
52
+ output_file = tool.explicit_output if tool.explicit_output!=nil
53
+
54
+ genv=Constants.new()
55
+ # NOTE(review): the next line looks broken - everything after the first '#' is parsed as a comment, so Randomize.make_decoys is called with no arguments, and cmd is never assigned in this script before it is used below. TODO confirm the intended call.
56
+ Randomize.make_decoys #{input_file} #{db_length} #{output_file} #{tool.prefix_string}"
57
+ cmd << "cat #{input_file} >> #{output_file}" if ( tool.append )
58
+ p cmd
59
+ # Run the conversion
60
+ #
61
+ job_params= {:jobid => tool.jobid_from_filename(input_file) }
62
+ job_params[:queue]="lowmem"
63
+ job_params[:vmem]="900mb"
64
+ tool.run(cmd,genv,job_params)