protk 1.1.0.pre
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +85 -0
- data/bin/annotate_ids.rb +59 -0
- data/bin/big_search.rb +41 -0
- data/bin/correct_omssa_retention_times.rb +27 -0
- data/bin/feature_finder.rb +76 -0
- data/bin/file_convert.rb +157 -0
- data/bin/generate_omssa_loc.rb +42 -0
- data/bin/interprophet.rb +91 -0
- data/bin/make_decoy.rb +64 -0
- data/bin/manage_db.rb +123 -0
- data/bin/mascot_search.rb +187 -0
- data/bin/mascot_to_pepxml.rb +44 -0
- data/bin/msgfplus_search.rb +191 -0
- data/bin/omssa_search.rb +205 -0
- data/bin/peptide_prophet.rb +245 -0
- data/bin/pepxml_to_table.rb +78 -0
- data/bin/protein_prophet.rb +140 -0
- data/bin/protk_setup.rb +31 -0
- data/bin/repair_run_summary.rb +113 -0
- data/bin/tandem_search.rb +292 -0
- data/bin/template_search.rb +144 -0
- data/bin/unimod_to_loc.rb +118 -0
- data/bin/xls_to_table.rb +46 -0
- data/ext/protk/extconf.rb +3 -0
- data/ext/protk/protk.c +235 -0
- data/lib/protk/big_search_rakefile.rake +16 -0
- data/lib/protk/big_search_tool.rb +23 -0
- data/lib/protk/bio_sptr_extensions.rb +210 -0
- data/lib/protk/biotools_excel_converter.rb +60 -0
- data/lib/protk/command_runner.rb +84 -0
- data/lib/protk/constants.rb +296 -0
- data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
- data/lib/protk/data/apt-get_packages.yaml +47 -0
- data/lib/protk/data/brew_packages.yaml +10 -0
- data/lib/protk/data/default_config.yml +20 -0
- data/lib/protk/data/predefined_db.crap.yaml +19 -0
- data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
- data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
- data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
- data/lib/protk/data/tandem_params.xml +56 -0
- data/lib/protk/data/taxonomy_template.xml +9 -0
- data/lib/protk/data/unimod.xml +16780 -0
- data/lib/protk/eupathdb_gene_information_table.rb +158 -0
- data/lib/protk/galaxy_stager.rb +24 -0
- data/lib/protk/galaxy_util.rb +9 -0
- data/lib/protk/manage_db_rakefile.rake +484 -0
- data/lib/protk/manage_db_tool.rb +181 -0
- data/lib/protk/mascot_util.rb +63 -0
- data/lib/protk/omssa_util.rb +57 -0
- data/lib/protk/plasmodb.rb +50 -0
- data/lib/protk/prophet_tool.rb +85 -0
- data/lib/protk/protein_annotator.rb +646 -0
- data/lib/protk/protxml.rb +137 -0
- data/lib/protk/randomize.rb +7 -0
- data/lib/protk/search_tool.rb +182 -0
- data/lib/protk/setup_rakefile.rake +245 -0
- data/lib/protk/setup_tool.rb +19 -0
- data/lib/protk/spreadsheet_extensions.rb +78 -0
- data/lib/protk/swissprot_database.rb +38 -0
- data/lib/protk/tool.rb +182 -0
- data/lib/protk/xtandem_defaults.rb +11 -0
- data/lib/protk.rb +18 -0
- metadata +256 -0
@@ -0,0 +1,181 @@
|
|
1
|
+
#
|
2
|
+
# This file is part of protk
|
3
|
+
# Created by Ira Cooke 13/3/2012
|
4
|
+
#
|
5
|
+
# Provides support for the manage_db tool
|
6
|
+
#
|
7
|
+
|
8
|
+
require 'fileutils'
require 'optparse'
require 'ostruct'
require 'protk/tool'
|
11
|
+
|
12
|
+
# Command-line support for the manage_db tool. Provides helpers for creating
# database directories, looking up protk's predefined database definitions,
# listing installed databases, and the option parsing for the "add", "list"
# and "update" sub-commands.
class ManageDBTool < Tool

  # Creates the directory for a database under the protein database root and
  # writes its specification to ".protkdb.yaml" inside that directory.
  #
  # dbspec - object whose #to_yaml output is stored as the specification
  # dbname - database name; used as the directory name under the root
  def add(dbspec, dbname)
    genv = Constants.new
    dbdir = "#{genv.protein_database_root}/#{dbname}"
    # Create the directory without going through a shell so that names
    # containing spaces or shell metacharacters are taken literally.
    FileUtils.mkdir_p(dbdir)

    File.open("#{dbdir}/.protkdb.yaml", "w") { |file| file.puts(dbspec.to_yaml) }
  end

  # Builds the help text listing every predefined database definition found
  # in the "data" directory next to this file, one "name : description" line
  # per definition (indented with tabs to line up with the option help).
  #
  # Returns a String ("" when there are no definition files).
  def predefined_databases_help
    predefined_definition_files.map do |fn|
      # NOTE(review): definition files are read with YAML.load; they are taken
      # from the data directory next to this file, not from user input.
      desc = YAML.load(File.read(fn))[:description]
      "\t\t\t\t\t#{predefined_name(fn)} : #{desc}\n"
    end.join
  end

  # Returns an Array with the name of every predefined database definition,
  # i.e. the second dot-separated component of each "predefined_db.*" file.
  def predefined_names
    predefined_definition_files.map { |fn| predefined_name(fn) }
  end

  # Loads the predefined definition called +name+.
  #
  # Returns the parsed YAML content of "data/predefined_db.<name>.yaml", or an
  # empty Hash when that file does not exist or +name+ is not one of
  # #predefined_names.
  def get_predefined_definition(name)
    filename = "#{File.dirname(__FILE__)}/data/predefined_db.#{name}.yaml"
    return {} unless File.exist?(filename) && predefined_names.include?(name)

    YAML.load(File.read(filename))
  end

  # Lists installed databases.
  #
  # genv - object responding to #protein_database_root
  #
  # Returns an Array with the name of every entry of the database root that
  # contains a ".protkdb.yaml" specification file.
  def all_database_names(genv)
    Dir.foreach(genv.protein_database_root).select do |db_subdir|
      File.exist?("#{genv.protein_database_root}/#{db_subdir}/.protkdb.yaml")
    end
  end

  # Returns the path of the rakefile that sits next to this file.
  def rakefile_path
    "#{File.dirname(__FILE__)}/manage_db_rakefile.rake"
  end

  # Initializes the commandline options
  #
  # command - name of the sub-command: "add", "list" or "update". Any other
  #           value registers no options here and leaves the banner empty.
  def initialize(command)
    super({:help=>false})

    @option_parser.banner=""

    case command
    when "add"

      @options.sources=[]

      @options.predefined=false
      @option_parser.on( '--predefined', "Install a database from one of protk\'s predefined definitions.\n\t\t\t\t\tAvailable definitions are;\n#{predefined_databases_help}" ) do
        @options.predefined=true
      end

      @option_parser.on( '--db-source dbname', 'A named database to use as an input source. Multiple db sources may be specified' ) do |db|
        @options.sources.push db
      end

      @option_parser.on( '--file-source fs', 'A file path to a fasta file to use as an input source. Multiple file sources may be specified' ) do |fs|
        @options.sources.push fs
      end

      # An ftp source is stored as an Array (the whitespace separated urls),
      # unlike db and file sources which are stored as plain Strings.
      @option_parser.on( '--ftp-source fs', "A space separated pair of urls. \n\t\t\t\t\tThe first is an ftp url to a fasta file to use as an input source.\n\t\t\t\t\tThe second is an ftp url to release notes file or other file which can be checked to see if the database requires an update. If no such url exists type \"none\" \n\t\t\t\t\tMultiple ftp sources may be specified" ) do |ftps|
        @options.sources.push ftps.split(/\s+/)
      end

      @options.include_filters=[]
      @option_parser.on( '--include-filters rx', "A comma separated series of regular expressions to use as filters. \n\t\t\t\t\tEach time this argument is encountered it adds a set of filters for another source file, in the order that source files were added. \n\t\t\t\t\tIf you use multiple source files you will need multiple --include-filters" ) do |tx|
        # Strip the outermost slashes, then split "re1/,/re2" into the
        # individual expressions.
        match = tx.match(/\/(.*)\//)
        # ArgumentError keeps this rescuable the same way as the uncaught
        # `throw` it replaces (UncaughtThrowError is an ArgumentError).
        raise ArgumentError, "Specified include filter #{tx} is not in the format /regex1/,/regex2/" unless match

        @options.include_filters.push match[1].split(/\/,\//)
      end

      @options.id_regexes=[]
      @option_parser.on( '--id-regex rx', 'A regular expression with a single capture group for capturing the protein ID from a fasta description line' ) do |rx|
        # Drop an optional leading and trailing slash without mutating the
        # argument string handed over by the option parser.
        @options.id_regexes.push rx.gsub(/^\//,'').gsub(/\/$/,'')
      end

      @options.make_blast_index=false
      @option_parser.on( '--make-blast-index', 'Create a blast index of the database (required for OMSSA searches)' ) do
        @options.make_blast_index=true
      end

      @options.make_msgf_index=false
      @option_parser.on( '--make-msgf-index', 'Create an index suitable for msgf plus (required for msgfplus searches)' ) do
        @options.make_msgf_index=true
      end

      @options.decoys=false
      @option_parser.on( '--add-decoys', 'Add random sequences to be used as decoys to the database (required for OMSSA searches)' ) do
        @options.decoys=true
      end

      @options.archive_old=false
      @option_parser.on( '--archive-old', 'Don\'t delete old fasta files when updating to a newer version' ) do
        @options.archive_old=true
      end

      @options.decoy_prefix="decoy_"
      @option_parser.on( '--decoy-prefix pref', 'Define a prefix string to prepend to protein ID\'s used as decoys' ) do |pref|
        @options.decoy_prefix=pref
      end

      @options.update_spec=false
      @option_parser.on( '--update-spec', 'Change the specification for an existing database by updating its spec file' ) do
        @options.update_spec=true
      end

      @options.is_annotation_db=false
      @option_parser.on( '--annotation-db', 'This database is not for searching but for annotating search results (eg Swissprot .dat file)' ) do
        @options.is_annotation_db=true
      end

      @options.db_format="fasta"
      @option_parser.on( '--db-format format', 'Format of the database file (fasta or dat). Default is fasta' ) do |format|
        @options.db_format=format
      end

      @option_parser.banner = "Add new protein databases.\nUsage: manage_db.rb add [options] <dbname>"

    when "list"
      @option_parser.banner = "List protein databases.\nUsage: manage_db.rb list"

      @options.verbose=false
      @option_parser.on('-v', '--verbose', 'Display detailed specification for each installed database' ) do
        @options.verbose=true
      end

      @options.galaxy=false
      @option_parser.on('-g' ,'--generate-loc-file', 'Generate a galaxy loc file' ) do
        @options.galaxy=true
      end

      @options.galaxy_write=false
      @option_parser.on('-G' ,'--write-loc-file', 'Update the pepxml_databases.loc file in galaxy if a galaxy_root directory has been configured and the file exists' ) do
        @options.galaxy_write=true
      end

    when "update"
      @option_parser.banner = "Update protein databases.\nUsage: manage_db.rb update <dbname>"
    end

  end

  private

  # Paths of all "predefined_db.*" files in the data directory next to this file.
  def predefined_definition_files
    Dir.glob("#{File.dirname(__FILE__)}/data/predefined_db.*")
  end

  # Definition name encoded in a file name such as "predefined_db.<name>.yaml".
  def predefined_name(definition_file)
    File.basename(definition_file).split(".")[1]
  end

end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
|
2
|
+
# Helpers for reading Mascot result (dat) files and MGF spectrum files.
class MascotUtil

  # Reads a mascot dat file and returns the basename of the original search file
  #
  # dat_file - path of the dat file to scan
  #
  # Every line starting with "File=" (case-insensitive) is examined; the last
  # one whose value ends in an extension matching [md][ga][tf] (e.g. ".mgf" or
  # ".dat") wins. Directory components (separated by "/" or "\") and the
  # extension are stripped.
  #
  # Returns the basename as a String, or "" when no line matches.
  def self.input_basename(dat_file)
    filename = ""
    # File.foreach closes the file once iteration finishes.
    File.foreach(dat_file) do |line|
      # The optional greedy prefix swallows everything up to the last path
      # separator so that only the file name itself is captured.
      m = line.match(/^File=(?:.*[\/\\])?(.*)\.[md][ga][tf]/i)
      filename = m[1] if m
    end

    filename
  end

  # Strips the trailing charge component from a spectrum title.
  #
  # tstring - title containing at least two dots followed by either a
  #           trailing "." or a trailing ".<digit>"
  #
  # Returns a new String without the trailing "." or ".<digit>"; the argument
  # itself is left untouched.
  # Raises ArgumentError when the title has neither form. (ArgumentError is
  # what rescuers of the former uncaught `throw` would have matched.)
  def self.remove_charge_from_title_string(tstring)
    return tstring.chop if tstring =~ /(.*)\..*?\..*?\.$/
    return tstring.chop.chop if tstring =~ /(.*)\..*?\..*?\.\d$/

    raise ArgumentError, "Unrecognised title string format #{tstring}"
  end

  # Create a hashtable which maps spectrum references to retention times for an mgf file
  #
  # mgf_file - path of the MGF file
  #
  # The file is read in records terminated by "END IONS". Records lacking
  # either a TITLE= or an RTINSECONDS= line are skipped.
  #
  # Returns a Hash mapping the title (with its charge suffix removed) to the
  # retention time, both as Strings exactly as they appear in the file.
  # Raises ArgumentError when a title is not in a recognised format.
  def self.index_mgf_times(mgf_file)
    rt_table = {}

    File.foreach(mgf_file, "END IONS") do |record|
      spec = record.match(/TITLE=(.*?)$/)
      rt = record.match(/RTINSECONDS=(.*?)$/)
      next unless spec && rt

      # Remove charge from the end of the title
      rt_table[remove_charge_from_title_string(spec[1])] = rt[1]
    end

    rt_table
  end

end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'libxml'
|
2
|
+
require 'protk/mascot_util'
|
3
|
+
include LibXML
|
4
|
+
|
5
|
+
|
6
|
+
# Helpers for post-processing pepXML produced from OMSSA searches.
class OMSSAUtil

  # Reads a pepxml file and modifies it to include retention time info.
  #
  # mgf_file    - path of the MGF file the retention times are read from
  #               (indexed with MascotUtil.index_mgf_times)
  # pepxml_file - path of the pepXML file to parse
  # over_write  - when false, an existing retention_time_sec attribute is an
  #               error; when true it is replaced
  # save        - when true, the modified document is written back to
  #               pepxml_file
  #
  # Every element found by the XPath '//spectrum_query' gets a
  # "retention_time_sec" attribute looked up by its "spectrum" attribute with
  # the last two characters (the charge suffix) removed.
  #
  # Returns the modified document.
  # Raises ArgumentError when a query has no spectrum attribute, when no
  # retention time is known for a spectrum, or when a retention time is
  # already present and over_write is false. (ArgumentError is what rescuers
  # of the former uncaught `throw` would have matched.)
  def self.add_retention_times(mgf_file,pepxml_file,over_write=false,save=false)
    pepxml_doc = XML::Parser.file(pepxml_file).parse
    rt_table = MascotUtil.index_mgf_times(mgf_file)

    # Elements are looked up without a namespace prefix. Namespaced alternative:
    # pepxml_doc.find('//x:spectrum_query','x:http://regis-web.systemsbiology.net/pepXML')
    pepxml_doc.find('//spectrum_query').each do |query|
      attributes = query.attributes

      spect = attributes["spectrum"]
      # Check for a missing attribute before touching the string.
      raise ArgumentError, "No spectrum found for spectrum_query #{query}" if spect.nil?

      spect = spect.chop.chop # Remove charge ... presume it isn't greater than 9!

      retention_time = rt_table[spect]
      if retention_time.nil?
        raise ArgumentError, "No retention time found for spectrum #{spect}. Most likely MALDI data was converted without specifying MALDI option."
      end

      if !attributes["retention_time_sec"].nil? && !over_write
        raise ArgumentError, "A retention time value is already present"
      end

      attributes["retention_time_sec"] = retention_time
    end

    pepxml_doc.save(pepxml_file) if save

    pepxml_doc
  end

end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bio'
|
3
|
+
require 'protk/constants'
|
4
|
+
require 'protk/eupathdb_gene_information_table'
|
5
|
+
|
6
|
+
# Provides lookup of gene information in a PlasmoDB annotation database,
# read from the "raw.txt" file of the configured annotation database via
# EuPathDBGeneInformationFileExtractor.
#
class PlasmoDB

  # env - environment object providing #protein_database_root,
  #       #plasmodb_annotation_database and #log; a new Constants instance is
  #       used when nil
  #
  # Raises ArgumentError when the database file does not exist.
  # (ArgumentError is what rescuers of the former uncaught `throw` would have
  # matched.)
  def initialize(env=nil)
    @genv = env.nil? ? Constants.new : env

    database_file="#{@genv.protein_database_root}/#{@genv.plasmodb_annotation_database}/raw.txt"

    unless File.exist?(database_file)
      raise ArgumentError, "The plasmodb database at \"#{database_file}\" does not exist"
    end

    @db_object=EuPathDBGeneInformationFileExtractor.new(database_file)
  end

  # Looks up the database entry for a gene.
  #
  # name - gene name handed to the extractor
  #
  # Returns whatever the extractor returns for the gene, or nil when the
  # lookup raises. A warning is logged whenever no entry is obtained, whether
  # the extractor returned nil or raised.
  def get_entry_for_name(name)
    @genv.log("Getting entry for #{name}",:info)

    result =
      begin
        # NOTE(review): the meaning of 10000 is defined by
        # EuPathDBGeneInformationFileExtractor#extract_gene_info - confirm there.
        @db_object.extract_gene_info(name,10000)
      rescue
        # Best-effort lookup: a failing extractor counts as "not found".
        nil
      end

    @genv.log("Failed to find PlasmoDB entry for gene named #{name} in database",:warn) if result.nil?

    result
  end

end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
#
|
2
|
+
# This file is part of protk
|
3
|
+
# Created by Ira Cooke 16/12/2010
|
4
|
+
#
|
5
|
+
# Provides common functionality used by xinteract tools provided by the TPP. Includes PeptideProphet, InterProphet and ProteinProphet
|
6
|
+
#
|
7
|
+
|
8
|
+
require 'optparse'
|
9
|
+
require 'ostruct'
|
10
|
+
require 'pathname'
|
11
|
+
require 'libxml'
|
12
|
+
require 'protk/search_tool'
|
13
|
+
|
14
|
+
# Base class for the prophet tools. Enables the prefix/suffix and over-write
# option groups and offers helpers that read the database and search engine
# names out of an XML results file.
class ProphetTool < SearchTool

  include LibXML

  # Initializes the commandline options
  #
  # option_support - Hash of option groups to enable; :prefix_suffix and
  #                  :over_write are always switched on (the Hash passed in
  #                  is updated in place) before it is handed to the superclass
  def initialize(option_support={})
    option_support[:prefix_suffix]=true
    option_support[:over_write]=true

    super(option_support)
  end

  # Obtain the database name from the given input file
  #
  # file_name - path of a pep.xml or prot.xml file
  #
  # Returns the "local_path" attribute of the first search_database element
  # (pep.xml) or the "reference_database" attribute of the first
  # protein_summary_header element (prot.xml), whichever is met first, or nil
  # when the file contains neither.
  # Raises ArgumentError when the file could not be opened for reading.
  def extract_db(file_name)
    first_attribute_value(file_name,
                          "search_database" => "local_path",
                          "protein_summary_header" => "reference_database")
  end

  # Obtain the search engine name from the input file
  # The name of the engine is returned in lowercase with spaces replaced by
  # underscores and the characters "(", ")" and "!" removed.
  #
  # file_name - path of the XML file to read
  #
  # Returns the simplified engine name, or nil when the file has no
  # search_summary element or that element has no search_engine attribute.
  # Raises ArgumentError when the file could not be opened for reading.
  def extract_engine(file_name)
    engine = first_attribute_value(file_name, "search_summary" => "search_engine")
    return nil if engine.nil?

    engine.tr(" ", "_").delete("()!").downcase
  end

  private

  # Streams through file_name and returns the value of the mapped attribute
  # for the first element whose name is a key of attribute_for_element (nil
  # when none occurs). The reader is closed on every exit path.
  def first_attribute_value(file_name, attribute_for_element)
    reader = XML::Reader.file(file_name)
    # ArgumentError keeps this rescuable the same way as the uncaught `throw`
    # it replaces (UncaughtThrowError is an ArgumentError).
    raise ArgumentError, "Failed to open xml file #{file_name}" if reader.nil?

    begin
      while reader.read
        attribute = attribute_for_element[reader.name]
        # The attribute value is read before the ensure clause closes the reader.
        return reader.expand[attribute] if attribute
      end

      nil
    ensure
      reader.close
    end
  end

end
|