protk 1.1.0.pre

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. data/README.md +85 -0
  2. data/bin/annotate_ids.rb +59 -0
  3. data/bin/big_search.rb +41 -0
  4. data/bin/correct_omssa_retention_times.rb +27 -0
  5. data/bin/feature_finder.rb +76 -0
  6. data/bin/file_convert.rb +157 -0
  7. data/bin/generate_omssa_loc.rb +42 -0
  8. data/bin/interprophet.rb +91 -0
  9. data/bin/make_decoy.rb +64 -0
  10. data/bin/manage_db.rb +123 -0
  11. data/bin/mascot_search.rb +187 -0
  12. data/bin/mascot_to_pepxml.rb +44 -0
  13. data/bin/msgfplus_search.rb +191 -0
  14. data/bin/omssa_search.rb +205 -0
  15. data/bin/peptide_prophet.rb +245 -0
  16. data/bin/pepxml_to_table.rb +78 -0
  17. data/bin/protein_prophet.rb +140 -0
  18. data/bin/protk_setup.rb +31 -0
  19. data/bin/repair_run_summary.rb +113 -0
  20. data/bin/tandem_search.rb +292 -0
  21. data/bin/template_search.rb +144 -0
  22. data/bin/unimod_to_loc.rb +118 -0
  23. data/bin/xls_to_table.rb +46 -0
  24. data/ext/protk/extconf.rb +3 -0
  25. data/ext/protk/protk.c +235 -0
  26. data/lib/protk/big_search_rakefile.rake +16 -0
  27. data/lib/protk/big_search_tool.rb +23 -0
  28. data/lib/protk/bio_sptr_extensions.rb +210 -0
  29. data/lib/protk/biotools_excel_converter.rb +60 -0
  30. data/lib/protk/command_runner.rb +84 -0
  31. data/lib/protk/constants.rb +296 -0
  32. data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
  33. data/lib/protk/data/apt-get_packages.yaml +47 -0
  34. data/lib/protk/data/brew_packages.yaml +10 -0
  35. data/lib/protk/data/default_config.yml +20 -0
  36. data/lib/protk/data/predefined_db.crap.yaml +19 -0
  37. data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
  38. data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
  39. data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
  40. data/lib/protk/data/tandem_params.xml +56 -0
  41. data/lib/protk/data/taxonomy_template.xml +9 -0
  42. data/lib/protk/data/unimod.xml +16780 -0
  43. data/lib/protk/eupathdb_gene_information_table.rb +158 -0
  44. data/lib/protk/galaxy_stager.rb +24 -0
  45. data/lib/protk/galaxy_util.rb +9 -0
  46. data/lib/protk/manage_db_rakefile.rake +484 -0
  47. data/lib/protk/manage_db_tool.rb +181 -0
  48. data/lib/protk/mascot_util.rb +63 -0
  49. data/lib/protk/omssa_util.rb +57 -0
  50. data/lib/protk/plasmodb.rb +50 -0
  51. data/lib/protk/prophet_tool.rb +85 -0
  52. data/lib/protk/protein_annotator.rb +646 -0
  53. data/lib/protk/protxml.rb +137 -0
  54. data/lib/protk/randomize.rb +7 -0
  55. data/lib/protk/search_tool.rb +182 -0
  56. data/lib/protk/setup_rakefile.rake +245 -0
  57. data/lib/protk/setup_tool.rb +19 -0
  58. data/lib/protk/spreadsheet_extensions.rb +78 -0
  59. data/lib/protk/swissprot_database.rb +38 -0
  60. data/lib/protk/tool.rb +182 -0
  61. data/lib/protk/xtandem_defaults.rb +11 -0
  62. data/lib/protk.rb +18 -0
  63. metadata +256 -0
data/README.md ADDED
@@ -0,0 +1,85 @@
1
+ # protk ( Proteomics toolkit )
2
+
3
+
4
+ ***
5
+ ## What is it?
6
+
7
+ Protk is a wrapper for various proteomics tools. Initially it focusses on MS/MS database search and validation.
8
+
9
+ ## Why do we need a wrapper around these tools?
10
+
11
+ The aim of protk is to present a consistent interface to numerous proteomics tools that is as uniform as possible. Protk also provides built-in support for managing protein databases.
12
+
13
+ ***
14
+
15
+
16
+
17
+ ## Basic Installation
18
+
19
+ 1. Install rvm
20
+ curl -L https://get.rvm.io | bash -s stable
21
+
22
+ On OSX
23
+ - rvm install 1.9.3 --with-gcc=clang
24
+ - rvm use 1.9.3
25
+ - gem install protk
26
+ - protk_setup.rb all
27
+
28
+ On Linux
29
+ - rvm install 1.9.3
30
+ - rvm use 1.9.3
31
+ - gem install protk
32
+ - sudo protk_setup.rb system_dependencies
33
+ - protk_setup.rb all
34
+
35
+
36
+ ## Sequence databases
37
+
38
+ After running the setup.sh script you should run manage_db.rb to install specific sequence databases for use by the search engines. Protk comes with several predefined database configurations. For example, to install a database consisting of human entries from Swissprot plus known contaminants use the following command;
39
+
40
+ manage_db.rb add sphuman
41
+
42
+ You should now be able to run database searches, specifying this database by using the -d sphuman flag. Every month or so swissprot will release a new database version. You can keep your database up to date using;
43
+
44
+ manage_db.rb update sphuman
45
+
46
+ This will update the database only if any of its source files (or ftp release notes) have changed. The manage_db.rb tool also allows completely custom databases to be configured. Setup requires adding quite a few command-line options, but once set up, databases can easily be updated without further configuration. The example below shows the command-line arguments required to manually configure the sphuman database.
47
+
48
+ manage_db.rb add --ftp-source 'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt' --include-filters '/OS=Homo\ssapiens/' --id-regex 'sp\|.*\|(.*?)\s' --add-decoys --make-blast-index --archive-old sphuman
49
+
50
+
51
+ ## Galaxy integration
52
+
53
+ Although all the protk tools can be run directly from the command-line a nicer way to run them (and visualise outputs) is to use the galaxy web application.
54
+
55
+ 1. Check out and install the latest stable galaxy [see the official galaxy wiki for more detailed setup instructions](http://wiki.g2.bx.psu.edu/Admin/Get%20Galaxy "galaxy wiki")
56
+
57
+ hg clone https://bitbucket.org/galaxy/galaxy-dist
58
+ cd galaxy-dist
59
+ sh run.sh
60
+
61
+ 2. Make the protk tools available to galaxy.
62
+ - Create a directory for galaxy tool dependencies. It's best if this directory is outside the galaxy-dist directory. I usually create a directory called `tool_depends` alongside `galaxy-dist`.
63
+ - Open the file `universe_wsgi.ini` in the `galaxy-dist` directory and set the configuration option `tool_dependency_dir` to point to the directory you just created
64
+ - Create a symbolic link from the protk directory to the appropriate subdirectory of `<tool_dependency_dir>`. In the instructions below substitute 1.0.0 for the version number of [the protk galaxy tools](https://bitbucket.org/iracooke/protk-toolshed "protk galaxy tools") you are using.
65
+
66
+ cd <tool_dependency_dir>
67
+ mkdir protk
68
+ cd protk
69
+ mkdir 1.0.0
70
+ ln -s 1.0.0 default
71
+ ln -s <path_where_protk_was_installed> 1.0.0/bin
72
+
73
+ 3. Configure the shell in which galaxy tools will run.
74
+ - Create a symlink to the `env.sh` file so it will be sourced by galaxy as it runs each tool. This file should have been autogenerated by `setup.sh`
75
+
76
+ ln -s <path_where_protk_was_installed>/env.sh 1.0.0/env.sh
77
+
78
+ 4. Install the protk galaxy wrapper tools from the galaxy toolshed. You will need to restart galaxy after doing so for the new datatype sniffers to be activated.
79
+
80
+ 5. After installing the protk wrapper tools from the toolshed it will be necessary to tell those tools about databases you have installed. Use the manage_db.rb tool to do this. First edit config.yml to make sure the `galaxy_root` setting points to the root directory of your galaxy installation (this will allow `manage_db.rb` to update the `pepxml_databases.loc` file inside `galaxy_root/tool-data`). Then run the following command and restart the galaxy server;
81
+
82
+ manage_db.rb list -G
83
+
84
+
85
+
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of MSLIMS
4
+ # Created by Ira Cooke 21/7/2011
5
+ #
6
+ # Takes an input file with a list of identified proteins and creates a table with swissprot/uniprot database details in various columns for each protein in the input file.
7
+ #
8
+ #
9
+ require 'protk/constants'
10
+ require 'protk/command_runner'
11
+ require 'protk/prophet_tool'
12
+ require 'protk/protein_annotator'
13
+
14
+
15
+
16
+ # Setup specific command-line options for this tool. Other options are inherited from Tool
17
+ #
18
+ id_tool=ProphetTool.new({:explicit_output=>true,:over_write=>true})
19
+ id_tool.option_parser.banner = "Run ID annotation on a prot.xml input file.\n\nUsage: annotate_ids.rb [options] file1.prot.xml"
20
+ id_tool.options.output_prefix="annotated_"
21
+
22
+
23
+ id_tool.options.input_format=nil
24
+ id_tool.option_parser.on( '-I', '--input-format format', 'Format of input file' ) do |format|
25
+ id_tool.options.input_format = format
26
+ end
27
+
28
+ id_tool.option_parser.parse!
29
+
30
+ # Obtain a global environment object
31
+ genv=Constants.new
32
+
33
+ input_file=ARGV[0]
34
+
35
+ database_file=id_tool.extract_db(input_file)
36
+
37
+ output_file=nil
38
+
39
+ if ( id_tool.explicit_output==nil)
40
+ output_file="#{id_tool.output_prefix}#{input_file}#{id_tool.output_suffix}.xls"
41
+ else
42
+ output_file=id_tool.explicit_output
43
+ end
44
+
45
+ converter=ProteinAnnotator.new
46
+
47
+ begin
48
+ outpath=Pathname.new(output_file)
49
+
50
+ if ( id_tool.over_write || !outpath.exist? )
51
+ converter.convert(input_file,output_file,id_tool.input_format)
52
+ else
53
+ p "Output file #{output_file} already exists"
54
+ end
55
+
56
+ rescue Exception
57
+ p "Couldn't convert #{input_file}"
58
+ raise
59
+ end
data/bin/big_search.rb ADDED
@@ -0,0 +1,41 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 14/12/2010
5
+ #
6
+ # Runs an MS/MS search using multiple search engines on multiple files in parallel
7
+ # Merges results using interprophet to produce a single output file
8
+ #
9
+ # This tool assumes that datasets are from an ESI-QUAD-TOF instrument
10
+ #
11
+ require 'protk/constants'
12
+ require 'protk/command_runner'
13
+ require 'protk/search_tool'
14
+ require 'protk/big_search_tool'
15
+ require 'rest_client'
16
+ require 'rake'
17
+
18
+ # Environment with global constants
19
+ #
20
+ genv=Constants.new
21
+
22
+ # Setup specific command-line options for this tool. Other options are inherited from SearchTool
23
+ #
24
+ search_tool=SearchTool.new({:msms_search=>true,:background=>false,:database=>true,:over_write=>true,:glyco=>true,:explicit_output=>true})
25
+ search_tool.jobid_prefix="b"
26
+
27
+ search_tool.option_parser.banner = "Run a multi-search engine search on a set of input files.\n\nUsage: big_search.rb [options] file1.mzML file2.mzML ..."
28
+ search_tool.options.output_suffix="_multisearch"
29
+
30
+
31
+ search_tool.options.ncpu=1
32
+ search_tool.option_parser.on( '-N', '--ncpu n', 'Split tasks into n separate processes if possible' ) do |n|
33
+ search_tool.options.ncpu=n
34
+ end
35
+
36
+ search_tool.option_parser.parse!
37
+
38
+ bgsrch = BigSearchTool.new
39
+
40
+
41
+ p bgsrch.run ["hi", "howdy"]
@@ -0,0 +1,27 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 14/12/2010
5
+ #
6
+ # Corrects retention times in omssa output
7
+ #
8
+
9
+ $VERBOSE=nil
10
+
11
+ require 'protk/constants'
12
+ require 'protk/command_runner'
13
+ require 'protk/tool'
14
+ require 'protk/omssa_util'
15
+
16
+ # Environment with global constants
17
+ #
18
+ genv=Constants.new
19
+
20
+ tool=Tool.new
21
+ tool.option_parser.banner = "Correct retention times on a pepxml file produced by omssa using information from an mgf file.\n\nUsage: correct_omssa_retention_times.rb [options] file1.pep.xml file2.mgf"
22
+ tool.option_parser.parse!
23
+
24
+
25
+ OMSSAUtil.add_retention_times(ARGV[1],ARGV[0],tool.over_write,true)
26
+
27
+
@@ -0,0 +1,76 @@
1
+ #
2
+ # This file is part of protk
3
+ # Created by Ira Cooke 21/3/2012
4
+ #
5
+ # A wrapper for the OpenMS FeatureFinder tools (FeatureFinderCentroided and FeatureFinderIsotopeWavelet)
6
+ #
7
+ #
8
+ #!/bin/sh
9
+ if [ -z "$PROTK_RUBY_PATH" ] ; then
10
+ PROTK_RUBY_PATH=`which ruby`
11
+ fi
12
+
13
+ eval 'exec "$PROTK_RUBY_PATH" $PROTK_RUBY_FLAGS -rubygems -x -S $0 ${1+"$@"}'
14
+ echo "The 'exec \"$PROTK_RUBY_PATH\" -x -S ...' failed!" >&2
15
+ exit 1
16
+ #! ruby
17
+ #
18
+
19
+ $LOAD_PATH.unshift("#{File.dirname(__FILE__)}/lib/")
20
+
21
+ require 'constants'
22
+ require 'command_runner'
23
+ require 'tool'
24
+
25
+ # Setup specific command-line options for this tool. Other options are inherited from Tool
26
+ #
27
+ tool=Tool.new({:explicit_output=>true, :background=>true,:over_write=>true})
28
+ tool.option_parser.banner = "Find molecular features on a set of input files.\n\nUsage: feature_finder.rb [options] file1.mzML file2.mzML ..."
29
+
30
+ tool.options.profile = false
31
+ tool.option_parser.on( '--profile',"Input files are profile data" ) do
32
+ tool.options.profile = true
33
+ end
34
+
35
+ tool.option_parser.parse!
36
+
37
+ # Obtain a global environment object
38
+ genv=Constants.new
39
+ # Run cmd through tool.run unless output_path already exists and overwriting is disabled. Throws if the command exits non-zero.
40
+ def run_ff(genv,tool,cmd,output_path,jobid)
41
+ if ( !tool.over_write && Pathname.new(output_path).exist? ) # Output already present and overwrite not requested
42
+ genv.log("Skipping analysis on existing file #{output_path}",:warn)
43
+ else
44
+ jobscript_path="#{output_path}.pbs.sh" # Job script path is derived from the output path
45
+ job_params={:jobid=>jobid, :vmem=>"12Gb", :queue => "sixteen"}
46
+ code=tool.run(cmd,genv,job_params,jobscript_path)
47
+ throw "Command failed with exit code #{code}" unless code==0
48
+ end
49
+ end
50
+
51
+
52
+ throw "Cannot use explicit output in combination with multiple input files" if ( tool.explicit_output && ARGV.length>1)
53
+ throw "The profile option is not yet implemented" if ( tool.profile )
54
+
55
+ ini_file="#{File.dirname(__FILE__)}/params/FeatureFinderCentroided.ini"
56
+
57
+ ARGV.each do |filen|
58
+ input_file=filen.chomp
59
+ throw "Input must be an mzML file" unless input_file=~/\.mzML$/
60
+
61
+ input_basename=input_file.gsub(/\.mzML$/,'')
62
+ output_filename=tool.explicit_output
63
+ output_file="#{input_basename}.featureXML" if output_filename==nil
64
+
65
+ if ( tool.over_write || !Pathname.new(output_file).exist? )
66
+ output_dir=Pathname.new(output_file).dirname.realpath.to_s
67
+ output_base_filename=Pathname.new(output_file).basename.to_s
68
+ cmd=""
69
+ cmd<<"#{genv.openms_root}/FeatureFinderCentroided -in #{Pathname.new(input_file).realpath.to_s} -out #{output_dir}/#{output_base_filename} -ini #{ini_file}"
70
+
71
+ run_ff(genv,tool,cmd,output_file,tool.jobid_from_filename(input_basename))
72
+
73
+ else
74
+ genv.log("Skipping search on existing file #{output_file}",:warn)
75
+ end
76
+ end
@@ -0,0 +1,157 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 14/12/2010
5
+ #
6
+ # Wrapper for msconvert
7
+ #
8
+
9
+ require 'protk/constants'
10
+ require 'protk/command_runner'
11
+ require 'protk/tool'
12
+ require 'tempfile'
13
+ require 'libxml'
14
+
15
+ include LibXML
16
+
17
+
18
+ # Read the input file and search for an instance of the charge state cvParam inside a precursor tag. Return true if one is found. False otherwise
19
+ #
20
+ def has_charge_information(input_filename)
21
+ #<precursorList count="1">
22
+ # <precursor spectrumRef="controllerType=0 controllerNumber=1 scan=59">
23
+ # <isolationWindow>
24
+ # <cvParam cvRef="MS" accession="MS:1000827" name="isolation window target m/z" value="939.43" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
25
+ # <cvParam cvRef="MS" accession="MS:1000828" name="isolation window lower offset" value="2.0" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
26
+ # <cvParam cvRef="MS" accession="MS:1000829" name="isolation window upper offset" value="2.0" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
27
+ # </isolationWindow>
28
+ # <selectedIonList count="1">
29
+ # <selectedIon>
30
+ # <cvParam cvRef="MS" accession="MS:1000744" name="selected ion m/z" value="939.432189941406" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
31
+ # <cvParam cvRef="MS" accession="MS:1000041" name="charge state" value="2"/>
32
+ # <cvParam cvRef="MS" accession="MS:1000042" name="peak intensity" value="1321.692016601563" unitCvRef="MS" unitAccession="MS:1000131" unitName="number of counts"/>
33
+ # </selectedIon>
34
+ # </selectedIonList>
35
+
36
+ reader=XML::Reader.file(input_filename)
37
+
38
+ while(reader.read)
39
+
40
+ if ( reader.local_name=="precursor")
41
+
42
+ subdoc=reader.read_inner_xml
43
+
44
+ if ( subdoc =~ /MS:1000041/ )
45
+ return true
46
+ end
47
+
48
+ end
49
+
50
+ end
51
+
52
+ return false
53
+
54
+ end
55
+
56
+
57
+
58
+ # Setup specific command-line options for this tool. Other options are inherited from Tool
59
+ #
60
+ convert_tool=Tool.new({:explicit_output=>true,:over_write=>true,:maldi=>true})
61
+ convert_tool.option_parser.banner = "Convert files between different formats.\n\nUsage: file_convert.rb [options] input_file output_file"
62
+
63
+ # Special case (usually tool specific options use capitals). Use lowercase l here to mimic the maldi option in the search_tool class
64
+ #
65
+ convert_tool.options.maldi=false
66
+ convert_tool.option_parser.on( '-l', '--maldi', 'Input Files are MALDI Spectra' ) do
67
+ convert_tool.options.maldi=true
68
+ end
69
+
70
+ convert_tool.options.output_format="mgf"
71
+ convert_tool.option_parser.on( '-F', '--format fmt', 'Convert to a specified format' ) do |fmt|
72
+ convert_tool.options.output_format=fmt
73
+ end
74
+
75
+ #convert_tool.options.missing_charge_state="false"
76
+ #convert_tool.option_parser.on( '-C', '--missing-charges', 'No attempt will be made to write charge states. Leads to better looking spectrum names' ) do |fmt|
77
+ # convert_tool.options.output_format=fmt
78
+ #end
79
+ #end
80
+
81
+
82
+
83
+ convert_tool.option_parser.parse!
84
+
85
+
86
+
87
+ # Environment with global constants
88
+ #
89
+ genv=Constants.new
90
+
91
+ filename=ARGV[0]
92
+
93
+
94
+ input_ext=Pathname.new(filename).extname
95
+ input_relative_filename=Pathname.new(filename).basename.to_s
96
+
97
+ base_output_dir=Pathname.new(filename).dirname.realpath.to_s #Default output dir is input dir
98
+
99
+ output_basename=input_relative_filename.gsub(/#{input_ext}$/,"").to_s
100
+
101
+ if ( convert_tool.explicit_output )
102
+ output_filepath=Pathname.new(convert_tool.explicit_output)
103
+ base_output_dir=output_filepath.dirname.to_s
104
+
105
+ if ( convert_tool.explicit_output=~/^\//) # It's an absolute path so use absolute path as output dir
106
+ # Convert base_output_dir to realpath
107
+ #
108
+ base_output_dir=Pathname.new(base_output_dir).realpath.to_s
109
+ end
110
+
111
+ output_filename=output_filepath.basename.to_s
112
+
113
+ end
114
+
115
+ # Create a uniquely named directory to hold the output. This is the only way to know the output of msconvert
116
+ #
117
+ output_dir="#{base_output_dir}/#{Pathname.new(Tempfile.new("file_convert").path).basename.to_s}"
118
+ Dir.mkdir(output_dir)
119
+
120
+
121
+ throw "Input format is the same as output format" if ( input_ext==".#{convert_tool.output_format}" )
122
+
123
+ genv.log("Converting #{filename} to #{convert_tool.output_format}",:info)
124
+ runner=CommandRunner.new(genv)
125
+ basedir=Pathname.new(filename).dirname.to_s #Where we run the tool
126
+
127
+ if ( convert_tool.maldi )
128
+ #For MALDI we know the charge is 1 so set it explicitly. Sometimes it is missing from the data
129
+ runner.run_local("cd #{basedir}; #{genv.tpp_root}/msconvert #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.1\" --#{convert_tool.output_format} -o #{output_dir}")
130
+ else
131
+ if ( has_charge_information(filename) )
132
+ runner.run_local("cd #{basedir}; #{genv.tpp_root}/msconvert #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.<ChargeState>\" --#{convert_tool.output_format} -o #{output_dir}")
133
+ else
134
+ # If input file is missing charges the best we can do is just assign charge=1. Search engines can choose to ignore this value anyway.
135
+ #
136
+ runner.run_local("cd #{basedir}; #{genv.tpp_root}/msconvert #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.1\" --#{convert_tool.output_format} -o #{output_dir}")
137
+ end
138
+ end
139
+
140
+ # Find out what the output name was
141
+ #
142
+ tmp_output_filename=""
143
+ Dir.foreach(output_dir) { |entry_name|
144
+ if ( entry_name=~/^\.$/ || entry_name=~/^\.\.$/ )
145
+ else
146
+ tmp_output_filename=entry_name
147
+ end
148
+ }
149
+
150
+ # Cleanup after converting
151
+ cmd = "cd #{output_dir};pwd; mv #{tmp_output_filename} #{base_output_dir}/#{output_filename}; cd ../; pwd;rm -r #{output_dir}"
152
+
153
+ code =runner.run_local(cmd)
154
+
155
+ throw "Command failed with exit code #{code}" unless code==0
156
+
157
+ throw "Failed to create output file #{base_output_dir}/#{output_filename}" unless ( FileTest.exists?("#{base_output_dir}/#{output_filename}") )
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of MSLIMS
4
+ # Created by Ira Cooke 12/4/2010
5
+ #
6
+ # Generates files required by the omssa galaxy wrapper
7
+ #
8
+
9
+ require 'protk/constants'
10
+ # Environment with global constants
11
+ #
12
+ genv=Constants.new
13
+
14
+ # Set search engine specific parameters on the SearchTool object
15
+ #
16
+ omssa_root="#{genv.omssa_root}/omssacl"
17
+ # Get omssa to print out a list of its acceptable modifications
18
+ acceptable_mods=%x[#{omssa_root} -ml].split(/\n/).collect do |mod|
19
+
20
+ mod_vals=mod.split(":")
21
+ [mod_vals[0].lstrip.rstrip,mod_vals[1].lstrip.rstrip]
22
+
23
+ end
24
+
25
+ # Drop the header
26
+ #
27
+ acceptable_mods.shift
28
+
29
+ loc_output=File.new("omssa_mods.loc",'w')
30
+
31
+ loc_output << "#This file lists the names of chemical modifications accepted by OMMSA\n"
32
+ loc_output << "#\n"
33
+ loc_output << "#\n"
34
+
35
+ acceptable_mods.each { |am|
36
+ key = am[1].downcase.gsub(" ","").gsub("\(","\_").gsub("\)","\_").gsub("\:","\_").gsub("\-\>","\_")
37
+ loc_output << "#{am[1]}\t#{key}_\t#{am[0]}\t#{key}_\n"
38
+ }
39
+
40
+ loc_output.close
41
+
42
+
@@ -0,0 +1,91 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 18/1/2011
5
+ #
6
+ # Runs the InterProphet tool on a set of pep.xml files generated by peptide_prophet
7
+ #
8
+ #
9
+
10
+ require 'protk/constants'
11
+ require 'protk/command_runner'
12
+ require 'protk/prophet_tool'
13
+
14
+ # Setup specific command-line options for this tool. Other options are inherited from ProphetTool
15
+ #
16
+ prophet_tool=ProphetTool.new({:explicit_output=>true})
17
+ prophet_tool.option_parser.banner = "Run InterProphet on a set of pep.xml input files.\n\nUsage: interprophet.rb [options] file1.pep.xml file2.pep.xml ..."
18
+ prophet_tool.options.output_suffix="_iproph"
19
+
20
+
21
+ prophet_tool.options.no_nss=""
22
+ prophet_tool.option_parser.on( '--no-nss', 'Don\'t use NSS (Number of Sibling Searches) in Model' ) do
23
+ prophet_tool.options.no_nss="NONSS"
24
+ end
25
+
26
+ prophet_tool.options.no_nrs=""
27
+ prophet_tool.option_parser.on('--no-nrs', 'Don\'t use NRS (Number of Replicate Spectra) in Model' ) do
28
+ prophet_tool.options.no_nrs="NONRS"
29
+ end
30
+
31
+ prophet_tool.options.no_nse=""
32
+ prophet_tool.option_parser.on('--no-nse', 'Don\'t use NSE (Number of Sibling Experiments) in Model' ) do
33
+ prophet_tool.options.no_nse="NONSE"
34
+ end
35
+
36
+ prophet_tool.options.no_nsi=""
37
+ prophet_tool.option_parser.on("--no-nsi",'Don\'t use NSE (Number of Sibling Ions) in Model' ) do
38
+ prophet_tool.options.no_nsi="NONSI"
39
+ end
40
+
41
+ prophet_tool.options.no_nsm=""
42
+ prophet_tool.option_parser.on("--no-nsm",'Don\'t use NSE (Number of Sibling Modifications) in Model' ) do
43
+ prophet_tool.options.no_nsm="NONSM"
44
+ end
45
+
46
+ prophet_tool.options.min_prob=""
47
+ prophet_tool.option_parser.on("--minprob mp","Minimum probability cutoff ") do |mp|
48
+ prophet_tool.options.min_prob=mp
49
+ end
50
+
51
+ prophet_tool.option_parser.parse!
52
+
53
+
54
+ # Obtain a global environment object
55
+ genv=Constants.new
56
+
57
+ if ( prophet_tool.explicit_output != nil )
58
+ output_file=prophet_tool.explicit_output
59
+ else
60
+ output_file="#{prophet_tool.output_prefix}interact#{prophet_tool.output_suffix}.pep.xml"
61
+ end
62
+
63
+ if ( !Pathname.new(output_file).exist? || prophet_tool.over_write )
64
+
65
+ cmd="#{genv.interprophetparser} #{prophet_tool.options.no_nss} #{prophet_tool.options.no_nrs} #{prophet_tool.options.no_nse} #{prophet_tool.options.no_nsi} #{prophet_tool.options.no_nsm}"
66
+ cmd << " MINPROB=#{min_prob}" if ( prophet_tool.options.min_prob !="" )
67
+
68
+ inputs = ARGV.collect {|file_name|
69
+ file_name.chomp
70
+ }
71
+
72
+ cmd << " #{inputs.join(" ")} #{output_file}"
73
+
74
+ genv.log("Running #{cmd}",:info)
75
+
76
+ # Run the analysis
77
+ #
78
+ jobscript_path="#{output_file}.pbs.sh"
79
+ job_params={:jobid=>"iprophet", :vmem=>"900mb", :queue => "lowmem"}
80
+ code = prophet_tool.run(cmd,genv,job_params,jobscript_path)
81
+ throw "Command failed with exit code #{code}" unless code==0
82
+
83
+ else
84
+ genv.log("Interprophet output file #{output_file} already exists. Run with -r option to replace",:warn)
85
+ end
86
+
87
+
88
+
89
+
90
+
91
+
data/bin/make_decoy.rb ADDED
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 9/3/2012
5
+ #
6
+ # Create a decoy database based on a set of real protein sequences
7
+ #
8
+ #
9
+
10
+ require 'libxml'
11
+ require 'protk/constants'
12
+ require 'protk/command_runner'
13
+ require 'protk/tool'
14
+ require 'bio'
15
+
16
+ include LibXML
17
+
18
+ # Setup specific command-line options for this tool. Other options are inherited from Tool
19
+ #
20
+ tool=Tool.new({:explicit_output=>true})
21
+ tool.option_parser.banner = "Create a decoy database from real protein sequences.\n\nUsage: make_decoy.rb [options] realdb.fasta"
22
+
23
+ tool.options.db_length=0
24
+ tool.option_parser.on('-L len','--db-length len','Number of sequences to generate') do |len|
25
+ tool.options.db_length=len.to_i
26
+ end
27
+
28
+ tool.options.prefix_string="decoy_"
29
+ tool.option_parser.on('-P str','--prefix-string str','String to prepend to sequence ids') do |str|
30
+ tool.options.prefix_string=str
31
+ end
32
+
33
+ tool.options.append=false
34
+ tool.option_parser.on('-A','--append','Append input sequences to the generated database') do
35
+ tool.options.append=true
36
+ end
37
+
38
+
39
+ tool.option_parser.parse!
40
+
41
+ input_file=ARGV[0]
42
+
43
+
44
+ db_length=tool.db_length
45
+ if ( db_length==0) #If no db length was specified use the number of entries in the input file
46
+ db_length=Bio::FastaFormat.open(input_file).count
47
+ p "Found #{db_length} entries in input file"
48
+ end
49
+
50
+ output_file="decoy_#{input_file}"
51
+
52
+ output_file = tool.explicit_output if tool.explicit_output!=nil
53
+
54
+ genv=Constants.new()
55
+
56
+ Randomize.make_decoys #{input_file} #{db_length} #{output_file} #{tool.prefix_string}"
57
+ cmd << "cat #{input_file} >> #{output_file}" if ( tool.append )
58
+ p cmd
59
+ # Run the conversion
60
+ #
61
+ job_params= {:jobid => tool.jobid_from_filename(input_file) }
62
+ job_params[:queue]="lowmem"
63
+ job_params[:vmem]="900mb"
64
+ tool.run(cmd,genv,job_params)