protk 1.1.0.pre
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +85 -0
- data/bin/annotate_ids.rb +59 -0
- data/bin/big_search.rb +41 -0
- data/bin/correct_omssa_retention_times.rb +27 -0
- data/bin/feature_finder.rb +76 -0
- data/bin/file_convert.rb +157 -0
- data/bin/generate_omssa_loc.rb +42 -0
- data/bin/interprophet.rb +91 -0
- data/bin/make_decoy.rb +64 -0
- data/bin/manage_db.rb +123 -0
- data/bin/mascot_search.rb +187 -0
- data/bin/mascot_to_pepxml.rb +44 -0
- data/bin/msgfplus_search.rb +191 -0
- data/bin/omssa_search.rb +205 -0
- data/bin/peptide_prophet.rb +245 -0
- data/bin/pepxml_to_table.rb +78 -0
- data/bin/protein_prophet.rb +140 -0
- data/bin/protk_setup.rb +31 -0
- data/bin/repair_run_summary.rb +113 -0
- data/bin/tandem_search.rb +292 -0
- data/bin/template_search.rb +144 -0
- data/bin/unimod_to_loc.rb +118 -0
- data/bin/xls_to_table.rb +46 -0
- data/ext/protk/extconf.rb +3 -0
- data/ext/protk/protk.c +235 -0
- data/lib/protk/big_search_rakefile.rake +16 -0
- data/lib/protk/big_search_tool.rb +23 -0
- data/lib/protk/bio_sptr_extensions.rb +210 -0
- data/lib/protk/biotools_excel_converter.rb +60 -0
- data/lib/protk/command_runner.rb +84 -0
- data/lib/protk/constants.rb +296 -0
- data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
- data/lib/protk/data/apt-get_packages.yaml +47 -0
- data/lib/protk/data/brew_packages.yaml +10 -0
- data/lib/protk/data/default_config.yml +20 -0
- data/lib/protk/data/predefined_db.crap.yaml +19 -0
- data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
- data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
- data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
- data/lib/protk/data/tandem_params.xml +56 -0
- data/lib/protk/data/taxonomy_template.xml +9 -0
- data/lib/protk/data/unimod.xml +16780 -0
- data/lib/protk/eupathdb_gene_information_table.rb +158 -0
- data/lib/protk/galaxy_stager.rb +24 -0
- data/lib/protk/galaxy_util.rb +9 -0
- data/lib/protk/manage_db_rakefile.rake +484 -0
- data/lib/protk/manage_db_tool.rb +181 -0
- data/lib/protk/mascot_util.rb +63 -0
- data/lib/protk/omssa_util.rb +57 -0
- data/lib/protk/plasmodb.rb +50 -0
- data/lib/protk/prophet_tool.rb +85 -0
- data/lib/protk/protein_annotator.rb +646 -0
- data/lib/protk/protxml.rb +137 -0
- data/lib/protk/randomize.rb +7 -0
- data/lib/protk/search_tool.rb +182 -0
- data/lib/protk/setup_rakefile.rake +245 -0
- data/lib/protk/setup_tool.rb +19 -0
- data/lib/protk/spreadsheet_extensions.rb +78 -0
- data/lib/protk/swissprot_database.rb +38 -0
- data/lib/protk/tool.rb +182 -0
- data/lib/protk/xtandem_defaults.rb +11 -0
- data/lib/protk.rb +18 -0
- metadata +256 -0
data/README.md
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
# protk ( Proteomics toolkit )
|
2
|
+
|
3
|
+
|
4
|
+
***
|
5
|
+
## What is it?
|
6
|
+
|
7
|
+
Protk is a wrapper for various proteomics tools. Initially it focusses on MS/MS database search and validation.
|
8
|
+
|
9
|
+
## Why do we need a wrapper around these tools
|
10
|
+
|
11
|
+
The aim of protk is to present a consistent interface to numerous proteomics tools that is as uniform as possible. Protk also provides built-in support for managing protein databases.
|
12
|
+
|
13
|
+
***
|
14
|
+
|
15
|
+
|
16
|
+
|
17
|
+
## Basic Installation
|
18
|
+
|
19
|
+
1. Install rvm
|
20
|
+
curl -L https://get.rvm.io | bash -s stable
|
21
|
+
|
22
|
+
On OSX
|
23
|
+
- rvm install 1.9.3 --with-gcc=clang
|
24
|
+
- rvm use 1.9.3
|
25
|
+
- gem install protk
|
26
|
+
- protk_setup.rb all
|
27
|
+
|
28
|
+
On Linux
|
29
|
+
- rvm install 1.9.3
|
30
|
+
- rvm use 1.9.3
|
31
|
+
- gem install protk
|
32
|
+
- sudo protk_setup.rb system_dependencies
|
33
|
+
- protk_setup.rb all
|
34
|
+
|
35
|
+
|
36
|
+
## Sequence databases
|
37
|
+
|
38
|
+
After running the setup.sh script you should run manage_db.rb to install specific sequence databases for use by the search engines. Protk comes with several predefined database configurations. For example, to install a database consisting of human entries from Swissprot plus known contaminants use the following command;
|
39
|
+
|
40
|
+
manage_db.rb add sphuman
|
41
|
+
|
42
|
+
You should now be able to run database searches, specifying this database by using the -d sphuman flag. Every month or so swissprot will release a new database version. You can keep your database up to date using;
|
43
|
+
|
44
|
+
manage_db.rb update sphuman
|
45
|
+
|
46
|
+
This will update the database only if any of its source files (or ftp release notes) have changed. The manage_db.rb tool also allows completely custom databases to be configured. Setup requires quite a few command-line options, but once set up, databases can easily be updated without further configuration. The example below shows the command-line arguments required to manually configure the sphuman database.
|
47
|
+
|
48
|
+
manage_db.rb add --ftp-source 'ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt' --include-filters '/OS=Homo\ssapiens/' --id-regex 'sp\|.*\|(.*?)\s' --add-decoys --make-blast-index --archive-old sphuman
|
49
|
+
|
50
|
+
|
51
|
+
## Galaxy integration
|
52
|
+
|
53
|
+
Although all the protk tools can be run directly from the command-line a nicer way to run them (and visualise outputs) is to use the galaxy web application.
|
54
|
+
|
55
|
+
1. Check out and install the latest stable galaxy [see the official galaxy wiki for more detailed setup instructions](http://wiki.g2.bx.psu.edu/Admin/Get%20Galaxy "galaxy wiki")
|
56
|
+
|
57
|
+
hg clone https://bitbucket.org/galaxy/galaxy-dist
|
58
|
+
cd galaxy-dist
|
59
|
+
sh run.sh
|
60
|
+
|
61
|
+
2. Make the protk tools available to galaxy.
|
62
|
+
- Create a directory for galaxy tool dependencies. It's best if this directory is outside the galaxy-dist directory. I usually create a directory called `tool_depends` alongside `galaxy-dist`.
|
63
|
+
- Open the file `universe_wsgi.ini` in the `galaxy-dist` directory and set the configuration option `tool_dependency_dir` to point to the directory you just created
|
64
|
+
- Create a symbolic link from the protk directory to the appropriate subdirectory of `<tool_dependency_dir>`. In the instructions below, replace 1.0.0 with the version number of [the protk galaxy tools](https://bitbucket.org/iracooke/protk-toolshed "protk galaxy tools") you are using.
|
65
|
+
|
66
|
+
cd <tool_dependency_dir>
|
67
|
+
mkdir protk
|
68
|
+
cd protk
|
69
|
+
mkdir 1.0.0
|
70
|
+
ln -s 1.0.0 default
|
71
|
+
ln -s <path_where_protk_was_installed> 1.0.0/bin
|
72
|
+
|
73
|
+
3. Configure the shell in which galaxy tools will run.
|
74
|
+
- Create a symlink to the `env.sh` file so it will be sourced by galaxy as it runs each tool. This file should have been autogenerated by `setup.sh`
|
75
|
+
|
76
|
+
ln -s <path_where_protk_was_installed>/env.sh 1.0.0/env.sh
|
77
|
+
|
78
|
+
4. Install the protk galaxy wrapper tools from the galaxy toolshed. You will need to restart galaxy after doing so for the new datatype sniffers to be activated.
|
79
|
+
|
80
|
+
5. After installing the protk wrapper tools from the toolshed it will be necessary to tell those tools about databases you have installed. Use the manage_db.rb tool to do this. First edit config.yml to make sure the `galaxy_root` setting points to the root directory of your galaxy installation (this will allow `manage_db.rb` to update the `pepxml_databases.loc` file inside `galaxy_root/tool-data`). Then run the following command and restart the galaxy server:
|
81
|
+
|
82
|
+
manage_db.rb list -G
|
83
|
+
|
84
|
+
|
85
|
+
|
data/bin/annotate_ids.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of MSLIMS
|
4
|
+
# Created by Ira Cooke 21/7/2011
|
5
|
+
#
|
6
|
+
# Takes an input file with a list of identified proteins and creates a table with swissprot/uniprot database details in various columns for each protein in the input file.
|
7
|
+
#
|
8
|
+
#
|
9
|
+
require 'protk/constants'
|
10
|
+
require 'protk/command_runner'
|
11
|
+
require 'protk/prophet_tool'
|
12
|
+
require 'protk/protein_annotator'
|
13
|
+
|
14
|
+
|
15
|
+
|
16
|
+
# Setup specific command-line options for this tool. Other options are inherited from Tool
|
17
|
+
#
|
18
|
+
id_tool=ProphetTool.new({:explicit_output=>true,:over_write=>true})
|
19
|
+
id_tool.option_parser.banner = "Run ID annotation on a prot.xml input file.\n\nUsage: annotate_ids.rb [options] file1.prot.xml"
|
20
|
+
id_tool.options.output_prefix="annotated_"
|
21
|
+
|
22
|
+
|
23
|
+
id_tool.options.input_format=nil
|
24
|
+
id_tool.option_parser.on( '-I', '--input-format format', 'Format of input file' ) do |format|
|
25
|
+
id_tool.options.input_format = format
|
26
|
+
end
|
27
|
+
|
28
|
+
id_tool.option_parser.parse!
|
29
|
+
|
30
|
+
# Obtain a global environment object
|
31
|
+
genv=Constants.new
|
32
|
+
|
33
|
+
input_file=ARGV[0]
|
34
|
+
|
35
|
+
database_file=id_tool.extract_db(input_file)
|
36
|
+
|
37
|
+
output_file=nil
|
38
|
+
|
39
|
+
if ( id_tool.explicit_output==nil)
|
40
|
+
output_file="#{id_tool.output_prefix}#{input_file}#{id_tool.output_suffix}.xls"
|
41
|
+
else
|
42
|
+
output_file=id_tool.explicit_output
|
43
|
+
end
|
44
|
+
|
45
|
+
converter=ProteinAnnotator.new
|
46
|
+
|
47
|
+
begin
|
48
|
+
outpath=Pathname.new(output_file)
|
49
|
+
|
50
|
+
if ( id_tool.over_write || !outpath.exist? )
|
51
|
+
converter.convert(input_file,output_file,id_tool.input_format)
|
52
|
+
else
|
53
|
+
p "Output file #{output_file} already exists"
|
54
|
+
end
|
55
|
+
|
56
|
+
rescue Exception
|
57
|
+
p "Couldn't convert #{input_file}"
|
58
|
+
raise
|
59
|
+
end
|
data/bin/big_search.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 14/12/2010
|
5
|
+
#
|
6
|
+
# Runs an MS/MS search using multiple search engines on multiple files in parallel
|
7
|
+
# Merges results using interprophet to produce a single output file
|
8
|
+
#
|
9
|
+
# This tool assumes that datasets are from an ESI-QUAD-TOF instrument
|
10
|
+
#
|
11
|
+
require 'protk/constants'
|
12
|
+
require 'protk/command_runner'
|
13
|
+
require 'protk/search_tool'
|
14
|
+
require 'protk/big_search_tool'
|
15
|
+
require 'rest_client'
|
16
|
+
require 'rake'
|
17
|
+
|
18
|
+
# Environment with global constants
|
19
|
+
#
|
20
|
+
genv=Constants.new
|
21
|
+
|
22
|
+
# Setup specific command-line options for this tool. Other options are inherited from SearchTool
|
23
|
+
#
|
24
|
+
search_tool=SearchTool.new({:msms_search=>true,:background=>false,:database=>true,:over_write=>true,:glyco=>true,:explicit_output=>true})
|
25
|
+
search_tool.jobid_prefix="b"
|
26
|
+
|
27
|
+
search_tool.option_parser.banner = "Run a multi-search engine search on a set of input files.\n\nUsage: big_search.rb [options] file1.mzML file2.mzML ..."
|
28
|
+
search_tool.options.output_suffix="_multisearch"
|
29
|
+
|
30
|
+
|
31
|
+
search_tool.options.ncpu=1
|
32
|
+
search_tool.option_parser.on( '-N', '--ncpu n', 'Split tasks into n separate processes if possible' ) do |n|
|
33
|
+
search_tool.options.ncpu=n
|
34
|
+
end
|
35
|
+
|
36
|
+
search_tool.option_parser.parse!
|
37
|
+
|
38
|
+
bgsrch = BigSearchTool.new
|
39
|
+
|
40
|
+
|
41
|
+
p bgsrch.run ["hi", "howdy"]
|
@@ -0,0 +1,27 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 14/12/2010
|
5
|
+
#
|
6
|
+
# Corrects retention times in omssa output
|
7
|
+
#
|
8
|
+
|
9
|
+
$VERBOSE=nil
|
10
|
+
|
11
|
+
require 'protk/constants'
|
12
|
+
require 'protk/command_runner'
|
13
|
+
require 'protk/tool'
|
14
|
+
require 'protk/omssa_util'
|
15
|
+
|
16
|
+
# Environment with global constants
|
17
|
+
#
|
18
|
+
genv=Constants.new
|
19
|
+
|
20
|
+
tool=Tool.new
|
21
|
+
tool.option_parser.banner = "Correct retention times on a pepxml file produced by omssa using information from an mgf file.\n\nUsage: correct_omssa_retention_times.rb [options] file1.pep.xml file2.mgf"
|
22
|
+
tool.option_parser.parse!
|
23
|
+
|
24
|
+
|
25
|
+
OMSSAUtil.add_retention_times(ARGV[1],ARGV[0],tool.over_write,true)
|
26
|
+
|
27
|
+
|
@@ -0,0 +1,76 @@
|
|
1
|
+
#
|
2
|
+
# This file is part of protk
|
3
|
+
# Created by Ira Cooke 21/3/2012
|
4
|
+
#
|
5
|
+
# A wrapper for the OpenMS FeatureFinder tools (FeatureFinderCentroided and FeatureFinderIsotopeWavelet)
|
6
|
+
#
|
7
|
+
#
|
8
|
+
#!/bin/sh
|
9
|
+
if [ -z "$PROTK_RUBY_PATH" ] ; then
|
10
|
+
PROTK_RUBY_PATH=`which ruby`
|
11
|
+
fi
|
12
|
+
|
13
|
+
eval 'exec "$PROTK_RUBY_PATH" $PROTK_RUBY_FLAGS -rubygems -x -S $0 ${1+"$@"}'
|
14
|
+
echo "The 'exec \"$PROTK_RUBY_PATH\" -x -S ...' failed!" >&2
|
15
|
+
exit 1
|
16
|
+
#! ruby
|
17
|
+
#
|
18
|
+
|
19
|
+
$LOAD_PATH.unshift("#{File.dirname(__FILE__)}/lib/")
|
20
|
+
|
21
|
+
require 'constants'
|
22
|
+
require 'command_runner'
|
23
|
+
require 'tool'
|
24
|
+
|
25
|
+
# Setup specific command-line options for this tool. Other options are inherited from Tool
|
26
|
+
#
|
27
|
+
tool=Tool.new({:explicit_output=>true, :background=>true,:over_write=>true})
|
28
|
+
tool.option_parser.banner = "Find molecular features on a set of input files.\n\nUsage: feature_finder.rb [options] file1.mzML file2.mzML ..."
|
29
|
+
|
30
|
+
tool.options.profile = false
|
31
|
+
tool.option_parser.on( '--profile',"Input files are profile data" ) do
|
32
|
+
tool.options.profile = true
|
33
|
+
end
|
34
|
+
|
35
|
+
tool.option_parser.parse!
|
36
|
+
|
37
|
+
# Obtain a global environment object
|
38
|
+
genv=Constants.new
|
39
|
+
|
40
|
+
def run_ff(genv,tool,cmd,output_path,jobid)
|
41
|
+
if ( !tool.over_write && Pathname.new(output_path).exist? )
|
42
|
+
genv.log("Skipping analysis on existing file #{output_path}",:warn)
|
43
|
+
else
|
44
|
+
jobscript_path="#{output_path}.pbs.sh"
|
45
|
+
job_params={:jobid=>jobid, :vmem=>"12Gb", :queue => "sixteen"}
|
46
|
+
code=tool.run(cmd,genv,job_params,jobscript_path)
|
47
|
+
raise "Command failed with exit code #{code}" unless code==0
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
|
52
|
+
raise "Cannot use explicit output in combination with multiple input files" if ( tool.explicit_output && ARGV.length>1)
|
53
|
+
raise "The profile option is not yet implemented" if ( tool.profile )
|
54
|
+
|
55
|
+
ini_file="#{File.dirname(__FILE__)}/params/FeatureFinderCentroided.ini"
|
56
|
+
|
57
|
+
ARGV.each do |filen|
|
58
|
+
input_file=filen.chomp
|
59
|
+
raise "Input must be an mzML file" unless input_file=~/\.mzML$/
|
60
|
+
|
61
|
+
input_basename=input_file.gsub(/\.mzML$/,'')
|
62
|
+
output_filename=tool.explicit_output
|
63
|
+
output_file=output_filename.nil? ? "#{input_basename}.featureXML" : output_filename # honour an explicit output path instead of leaving output_file nil
|
64
|
+
|
65
|
+
if ( tool.over_write || !Pathname.new(output_file).exist? )
|
66
|
+
output_dir=Pathname.new(output_file).dirname.realpath.to_s
|
67
|
+
output_base_filename=Pathname.new(output_file).basename.to_s
|
68
|
+
cmd=""
|
69
|
+
cmd<<"#{genv.openms_root}/FeatureFinderCentroided -in #{Pathname.new(input_file).realpath.to_s} -out #{output_dir}/#{output_base_filename} -ini #{ini_file}"
|
70
|
+
|
71
|
+
run_ff(genv,tool,cmd,output_file,tool.jobid_from_filename(input_basename))
|
72
|
+
|
73
|
+
else
|
74
|
+
genv.log("Skipping search on existing file #{output_file}",:warn)
|
75
|
+
end
|
76
|
+
end
|
data/bin/file_convert.rb
ADDED
@@ -0,0 +1,157 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 14/12/2010
|
5
|
+
#
|
6
|
+
# Wrapper for msconvert
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'protk/constants'
|
10
|
+
require 'protk/command_runner'
|
11
|
+
require 'protk/tool'
|
12
|
+
require 'tempfile'
|
13
|
+
require 'libxml'
|
14
|
+
|
15
|
+
include LibXML
|
16
|
+
|
17
|
+
|
18
|
+
# Read the input file and search for an instance of the charge state cvParam inside a precursor tag. Return true if one is found. False otherwise
|
19
|
+
#
|
20
|
+
def has_charge_information(input_filename)
|
21
|
+
#<precursorList count="1">
|
22
|
+
# <precursor spectrumRef="controllerType=0 controllerNumber=1 scan=59">
|
23
|
+
# <isolationWindow>
|
24
|
+
# <cvParam cvRef="MS" accession="MS:1000827" name="isolation window target m/z" value="939.43" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
|
25
|
+
# <cvParam cvRef="MS" accession="MS:1000828" name="isolation window lower offset" value="2.0" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
|
26
|
+
# <cvParam cvRef="MS" accession="MS:1000829" name="isolation window upper offset" value="2.0" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
|
27
|
+
# </isolationWindow>
|
28
|
+
# <selectedIonList count="1">
|
29
|
+
# <selectedIon>
|
30
|
+
# <cvParam cvRef="MS" accession="MS:1000744" name="selected ion m/z" value="939.432189941406" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
|
31
|
+
# <cvParam cvRef="MS" accession="MS:1000041" name="charge state" value="2"/>
|
32
|
+
# <cvParam cvRef="MS" accession="MS:1000042" name="peak intensity" value="1321.692016601563" unitCvRef="MS" unitAccession="MS:1000131" unitName="number of counts"/>
|
33
|
+
# </selectedIon>
|
34
|
+
# </selectedIonList>
|
35
|
+
|
36
|
+
reader=XML::Reader.file(input_filename)
|
37
|
+
|
38
|
+
while(reader.read)
|
39
|
+
|
40
|
+
if ( reader.local_name=="precursor")
|
41
|
+
|
42
|
+
subdoc=reader.read_inner_xml
|
43
|
+
|
44
|
+
if ( subdoc =~ /MS:1000041/ )
|
45
|
+
return true
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
51
|
+
|
52
|
+
return false
|
53
|
+
|
54
|
+
end
|
55
|
+
|
56
|
+
|
57
|
+
|
58
|
+
# Setup specific command-line options for this tool. Other options are inherited from Tool
|
59
|
+
#
|
60
|
+
convert_tool=Tool.new({:explicit_output=>true,:over_write=>true,:maldi=>true})
|
61
|
+
convert_tool.option_parser.banner = "Convert files between different formats.\n\nUsage: file_convert.rb [options] input_file output_file"
|
62
|
+
|
63
|
+
# Special case (usually tool specific options use capitals). Use lowercase l here to mimic the maldi option in the search_tool class
|
64
|
+
#
|
65
|
+
convert_tool.options.maldi=false
|
66
|
+
convert_tool.option_parser.on( '-l', '--maldi', 'Input Files are MALDI Spectra' ) do
|
67
|
+
convert_tool.options.maldi=true
|
68
|
+
end
|
69
|
+
|
70
|
+
convert_tool.options.output_format="mgf"
|
71
|
+
convert_tool.option_parser.on( '-F', '--format fmt', 'Convert to a specified format' ) do |fmt|
|
72
|
+
convert_tool.options.output_format=fmt
|
73
|
+
end
|
74
|
+
|
75
|
+
#convert_tool.options.missing_charge_state="false"
|
76
|
+
#convert_tool.option_parser.on( '-C', '--missing-charges', 'No attempt will be made to write charge states. Leads to better looking spectrum names' ) do |fmt|
|
77
|
+
# convert_tool.options.output_format=fmt
|
78
|
+
#end
|
79
|
+
#end
|
80
|
+
|
81
|
+
|
82
|
+
|
83
|
+
convert_tool.option_parser.parse!
|
84
|
+
|
85
|
+
|
86
|
+
|
87
|
+
# Environment with global constants
|
88
|
+
#
|
89
|
+
genv=Constants.new
|
90
|
+
|
91
|
+
filename=ARGV[0]
|
92
|
+
|
93
|
+
|
94
|
+
input_ext=Pathname.new(filename).extname
|
95
|
+
input_relative_filename=Pathname.new(filename).basename.to_s
|
96
|
+
|
97
|
+
base_output_dir=Pathname.new(filename).dirname.realpath.to_s #Default output dir is input dir
|
98
|
+
|
99
|
+
output_basename=input_relative_filename.gsub(/#{input_ext}$/,"").to_s
|
100
|
+
|
101
|
+
if ( convert_tool.explicit_output )
|
102
|
+
output_filepath=Pathname.new(convert_tool.explicit_output)
|
103
|
+
base_output_dir=output_filepath.dirname.to_s
|
104
|
+
|
105
|
+
if ( convert_tool.explicit_output=~/^\//) # It's an absolute path so use absolute path as output dir
|
106
|
+
# Convert base_output_dir to realpath
|
107
|
+
#
|
108
|
+
base_output_dir=Pathname.new(base_output_dir).realpath.to_s
|
109
|
+
end
|
110
|
+
|
111
|
+
output_filename=output_filepath.basename.to_s
|
112
|
+
|
113
|
+
end
|
114
|
+
|
115
|
+
# Create a uniquely named directory to hold the output. This is the only way to know the output of msconvert
|
116
|
+
#
|
117
|
+
output_dir="#{base_output_dir}/#{Pathname.new(Tempfile.new("file_convert").path).basename.to_s}"
|
118
|
+
Dir.mkdir(output_dir)
|
119
|
+
|
120
|
+
|
121
|
+
raise "Input format is the same as output format" if ( input_ext==".#{convert_tool.output_format}" )
|
122
|
+
|
123
|
+
genv.log("Converting #{filename} to #{convert_tool.output_format}",:info)
|
124
|
+
runner=CommandRunner.new(genv)
|
125
|
+
basedir=Pathname.new(filename).dirname.to_s #Where we run the tool
|
126
|
+
|
127
|
+
if ( convert_tool.maldi )
|
128
|
+
#For MALDI we know the charge is 1 so set it explicitly. Sometimes it is missing from the data
|
129
|
+
runner.run_local("cd #{basedir}; #{genv.tpp_root}/msconvert #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.1\" --#{convert_tool.output_format} -o #{output_dir}")
|
130
|
+
else
|
131
|
+
if ( has_charge_information(filename) )
|
132
|
+
runner.run_local("cd #{basedir}; #{genv.tpp_root}/msconvert #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.<ChargeState>\" --#{convert_tool.output_format} -o #{output_dir}")
|
133
|
+
else
|
134
|
+
# If input file is missing charges the best we can do is just assign charge=1. Search engines can choose to ignore this value anyway.
|
135
|
+
#
|
136
|
+
runner.run_local("cd #{basedir}; #{genv.tpp_root}/msconvert #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.1\" --#{convert_tool.output_format} -o #{output_dir}")
|
137
|
+
end
|
138
|
+
end
|
139
|
+
|
140
|
+
# Find out what the output name was
|
141
|
+
#
|
142
|
+
tmp_output_filename=""
|
143
|
+
Dir.foreach(output_dir) { |entry_name|
|
144
|
+
if ( entry_name=~/^\.$/ || entry_name=~/^\.\.$/ )
|
145
|
+
else
|
146
|
+
tmp_output_filename=entry_name
|
147
|
+
end
|
148
|
+
}
|
149
|
+
|
150
|
+
# Cleanup after converting
|
151
|
+
cmd = "cd #{output_dir};pwd; mv #{tmp_output_filename} #{base_output_dir}/#{output_filename}; cd ../; pwd;rm -r #{output_dir}"
|
152
|
+
|
153
|
+
code =runner.run_local(cmd)
|
154
|
+
|
155
|
+
raise "Command failed with exit code #{code}" unless code==0
|
156
|
+
|
157
|
+
raise "Failed to create output file #{base_output_dir}/#{output_filename}" unless File.exist?("#{base_output_dir}/#{output_filename}")
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of MSLIMS
|
4
|
+
# Created by Ira Cooke 12/4/2010
|
5
|
+
#
|
6
|
+
# Generates files required by the omssa galaxy wrapper
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'protk/constants'
|
10
|
+
# Environment with global constants
|
11
|
+
#
|
12
|
+
genv=Constants.new
|
13
|
+
|
14
|
+
# Set search engine specific parameters on the SearchTool object
|
15
|
+
#
|
16
|
+
omssa_root="#{genv.omssa_root}/omssacl"
|
17
|
+
# Get omssa to print out a list of its acceptable modifications
|
18
|
+
acceptable_mods=%x[#{omssa_root} -ml].split(/\n/).collect do |mod|
|
19
|
+
|
20
|
+
mod_vals=mod.split(":",2) # limit 2: keep any further colons as part of the modification name
|
21
|
+
[mod_vals[0].lstrip.rstrip,mod_vals[1].lstrip.rstrip]
|
22
|
+
|
23
|
+
end
|
24
|
+
|
25
|
+
# Drop the header
|
26
|
+
#
|
27
|
+
acceptable_mods.shift
|
28
|
+
|
29
|
+
loc_output=File.new("omssa_mods.loc",'w')
|
30
|
+
|
31
|
+
loc_output << "#This file lists the names of chemical modifications accepted by OMMSA\n"
|
32
|
+
loc_output << "#\n"
|
33
|
+
loc_output << "#\n"
|
34
|
+
|
35
|
+
acceptable_mods.each { |am|
|
36
|
+
key = am[1].downcase.gsub(" ","").gsub("\(","\_").gsub("\)","\_").gsub("\:","\_").gsub("\-\>","\_")
|
37
|
+
loc_output << "#{am[1]}\t#{key}_\t#{am[0]}\t#{key}_\n"
|
38
|
+
}
|
39
|
+
|
40
|
+
loc_output.close
|
41
|
+
|
42
|
+
|
data/bin/interprophet.rb
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 18/1/2011
|
5
|
+
#
|
6
|
+
# Runs the InterProphet tool on a set of pep.xml files generated by peptide_prophet
|
7
|
+
#
|
8
|
+
#
|
9
|
+
|
10
|
+
require 'protk/constants'
|
11
|
+
require 'protk/command_runner'
|
12
|
+
require 'protk/prophet_tool'
|
13
|
+
|
14
|
+
# Setup specific command-line options for this tool. Other options are inherited from ProphetTool
|
15
|
+
#
|
16
|
+
prophet_tool=ProphetTool.new({:explicit_output=>true})
|
17
|
+
prophet_tool.option_parser.banner = "Run InterProphet on a set of pep.xml input files.\n\nUsage: interprophet.rb [options] file1.pep.xml file2.pep.xml ..."
|
18
|
+
prophet_tool.options.output_suffix="_iproph"
|
19
|
+
|
20
|
+
|
21
|
+
prophet_tool.options.no_nss=""
|
22
|
+
prophet_tool.option_parser.on( '--no-nss', 'Don\'t use NSS (Number of Sibling Searches) in Model' ) do
|
23
|
+
prophet_tool.options.no_nss="NONSS"
|
24
|
+
end
|
25
|
+
|
26
|
+
prophet_tool.options.no_nrs=""
|
27
|
+
prophet_tool.option_parser.on('--no-nrs', 'Don\'t use NRS (Number of Replicate Spectra) in Model' ) do
|
28
|
+
prophet_tool.options.no_nrs="NONRS"
|
29
|
+
end
|
30
|
+
|
31
|
+
prophet_tool.options.no_nse=""
|
32
|
+
prophet_tool.option_parser.on('--no-nse', 'Don\'t use NSE (Number of Sibling Experiments) in Model' ) do
|
33
|
+
prophet_tool.options.no_nse="NONSE"
|
34
|
+
end
|
35
|
+
|
36
|
+
prophet_tool.options.no_nsi=""
|
37
|
+
prophet_tool.option_parser.on("--no-nsi",'Don\'t use NSE (Number of Sibling Ions) in Model' ) do
|
38
|
+
prophet_tool.options.no_nsi="NONSI"
|
39
|
+
end
|
40
|
+
|
41
|
+
prophet_tool.options.no_nsm=""
|
42
|
+
prophet_tool.option_parser.on("--no-nsm",'Don\'t use NSE (Number of Sibling Modifications) in Model' ) do
|
43
|
+
prophet_tool.options.no_nsm="NONSM"
|
44
|
+
end
|
45
|
+
|
46
|
+
prophet_tool.options.min_prob=""
|
47
|
+
prophet_tool.option_parser.on("--minprob mp","Minimum probability cutoff ") do |mp|
|
48
|
+
prophet_tool.options.min_prob=mp
|
49
|
+
end
|
50
|
+
|
51
|
+
prophet_tool.option_parser.parse!
|
52
|
+
|
53
|
+
|
54
|
+
# Obtain a global environment object
|
55
|
+
genv=Constants.new
|
56
|
+
|
57
|
+
if ( prophet_tool.explicit_output != nil )
|
58
|
+
output_file=prophet_tool.explicit_output
|
59
|
+
else
|
60
|
+
output_file="#{prophet_tool.output_prefix}interact#{prophet_tool.output_suffix}.pep.xml"
|
61
|
+
end
|
62
|
+
|
63
|
+
if ( !Pathname.new(output_file).exist? || prophet_tool.over_write )
|
64
|
+
|
65
|
+
cmd="#{genv.interprophetparser} #{prophet_tool.options.no_nss} #{prophet_tool.options.no_nrs} #{prophet_tool.options.no_nse} #{prophet_tool.options.no_nsi} #{prophet_tool.options.no_nsm}"
|
66
|
+
cmd << " MINPROB=#{prophet_tool.options.min_prob}" if ( prophet_tool.options.min_prob !="" )
|
67
|
+
|
68
|
+
inputs = ARGV.collect {|file_name|
|
69
|
+
file_name.chomp
|
70
|
+
}
|
71
|
+
|
72
|
+
cmd << " #{inputs.join(" ")} #{output_file}"
|
73
|
+
|
74
|
+
genv.log("Running #{cmd}",:info)
|
75
|
+
|
76
|
+
# Run the analysis
|
77
|
+
#
|
78
|
+
jobscript_path="#{output_file}.pbs.sh"
|
79
|
+
job_params={:jobid=>"iprophet", :vmem=>"900mb", :queue => "lowmem"}
|
80
|
+
code = prophet_tool.run(cmd,genv,job_params,jobscript_path)
|
81
|
+
raise "Command failed with exit code #{code}" unless code==0
|
82
|
+
|
83
|
+
else
|
84
|
+
genv.log("Interprophet output file #{output_file} already exists. Run with -r option to replace",:warn)
|
85
|
+
end
|
86
|
+
|
87
|
+
|
88
|
+
|
89
|
+
|
90
|
+
|
91
|
+
|
data/bin/make_decoy.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 9/3/2012
|
5
|
+
#
|
6
|
+
# Create a decoy database based on a set of real protein sequences
|
7
|
+
#
|
8
|
+
#
|
9
|
+
|
10
|
+
require 'libxml'
|
11
|
+
require 'protk/constants'
|
12
|
+
require 'protk/command_runner'
|
13
|
+
require 'protk/tool'
|
14
|
+
require 'bio'
|
15
|
+
|
16
|
+
include LibXML
|
17
|
+
|
18
|
+
# Setup specific command-line options for this tool. Other options are inherited from Tool
|
19
|
+
#
|
20
|
+
tool=Tool.new({:explicit_output=>true})
|
21
|
+
tool.option_parser.banner = "Create a decoy database from real protein sequences.\n\nUsage: make_decoy.rb [options] realdb.fasta"
|
22
|
+
|
23
|
+
tool.options.db_length=0
|
24
|
+
tool.option_parser.on('-L len','--db-length len','Number of sequences to generate') do |len|
|
25
|
+
tool.options.db_length=len.to_i
|
26
|
+
end
|
27
|
+
|
28
|
+
tool.options.prefix_string="decoy_"
|
29
|
+
tool.option_parser.on('-P str','--prefix-string str','String to prepend to sequence ids') do |str|
|
30
|
+
tool.options.prefix_string=str
|
31
|
+
end
|
32
|
+
|
33
|
+
tool.options.append=false
|
34
|
+
tool.option_parser.on('-A','--append','Append input sequences to the generated database') do
|
35
|
+
tool.options.append=true
|
36
|
+
end
|
37
|
+
|
38
|
+
|
39
|
+
tool.option_parser.parse!
|
40
|
+
|
41
|
+
input_file=ARGV[0]
|
42
|
+
|
43
|
+
|
44
|
+
db_length=tool.db_length
|
45
|
+
if ( db_length==0) #If no db length was specified use the number of entries in the input file
|
46
|
+
db_length=Bio::FastaFormat.open(input_file).count
|
47
|
+
p "Found #{db_length} entries in input file"
|
48
|
+
end
|
49
|
+
|
50
|
+
output_file="decoy_#{input_file}"
|
51
|
+
|
52
|
+
output_file = tool.explicit_output if tool.explicit_output!=nil
|
53
|
+
|
54
|
+
genv=Constants.new()
|
55
|
+
|
56
|
+
Randomize.make_decoys input_file, db_length, output_file, tool.prefix_string; cmd="" # NOTE(review): confirm argument order against Randomize.make_decoys; cmd starts empty so the optional append step below can extend it
|
57
|
+
cmd << "cat #{input_file} >> #{output_file}" if ( tool.append )
|
58
|
+
p cmd
|
59
|
+
# Run the conversion
|
60
|
+
#
|
61
|
+
job_params= {:jobid => tool.jobid_from_filename(input_file) }
|
62
|
+
job_params[:queue]="lowmem"
|
63
|
+
job_params[:vmem]="900mb"
|
64
|
+
tool.run(cmd,genv,job_params)
|