protk 1.1.0.pre

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. data/README.md +85 -0
  2. data/bin/annotate_ids.rb +59 -0
  3. data/bin/big_search.rb +41 -0
  4. data/bin/correct_omssa_retention_times.rb +27 -0
  5. data/bin/feature_finder.rb +76 -0
  6. data/bin/file_convert.rb +157 -0
  7. data/bin/generate_omssa_loc.rb +42 -0
  8. data/bin/interprophet.rb +91 -0
  9. data/bin/make_decoy.rb +64 -0
  10. data/bin/manage_db.rb +123 -0
  11. data/bin/mascot_search.rb +187 -0
  12. data/bin/mascot_to_pepxml.rb +44 -0
  13. data/bin/msgfplus_search.rb +191 -0
  14. data/bin/omssa_search.rb +205 -0
  15. data/bin/peptide_prophet.rb +245 -0
  16. data/bin/pepxml_to_table.rb +78 -0
  17. data/bin/protein_prophet.rb +140 -0
  18. data/bin/protk_setup.rb +31 -0
  19. data/bin/repair_run_summary.rb +113 -0
  20. data/bin/tandem_search.rb +292 -0
  21. data/bin/template_search.rb +144 -0
  22. data/bin/unimod_to_loc.rb +118 -0
  23. data/bin/xls_to_table.rb +46 -0
  24. data/ext/protk/extconf.rb +3 -0
  25. data/ext/protk/protk.c +235 -0
  26. data/lib/protk/big_search_rakefile.rake +16 -0
  27. data/lib/protk/big_search_tool.rb +23 -0
  28. data/lib/protk/bio_sptr_extensions.rb +210 -0
  29. data/lib/protk/biotools_excel_converter.rb +60 -0
  30. data/lib/protk/command_runner.rb +84 -0
  31. data/lib/protk/constants.rb +296 -0
  32. data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
  33. data/lib/protk/data/apt-get_packages.yaml +47 -0
  34. data/lib/protk/data/brew_packages.yaml +10 -0
  35. data/lib/protk/data/default_config.yml +20 -0
  36. data/lib/protk/data/predefined_db.crap.yaml +19 -0
  37. data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
  38. data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
  39. data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
  40. data/lib/protk/data/tandem_params.xml +56 -0
  41. data/lib/protk/data/taxonomy_template.xml +9 -0
  42. data/lib/protk/data/unimod.xml +16780 -0
  43. data/lib/protk/eupathdb_gene_information_table.rb +158 -0
  44. data/lib/protk/galaxy_stager.rb +24 -0
  45. data/lib/protk/galaxy_util.rb +9 -0
  46. data/lib/protk/manage_db_rakefile.rake +484 -0
  47. data/lib/protk/manage_db_tool.rb +181 -0
  48. data/lib/protk/mascot_util.rb +63 -0
  49. data/lib/protk/omssa_util.rb +57 -0
  50. data/lib/protk/plasmodb.rb +50 -0
  51. data/lib/protk/prophet_tool.rb +85 -0
  52. data/lib/protk/protein_annotator.rb +646 -0
  53. data/lib/protk/protxml.rb +137 -0
  54. data/lib/protk/randomize.rb +7 -0
  55. data/lib/protk/search_tool.rb +182 -0
  56. data/lib/protk/setup_rakefile.rake +245 -0
  57. data/lib/protk/setup_tool.rb +19 -0
  58. data/lib/protk/spreadsheet_extensions.rb +78 -0
  59. data/lib/protk/swissprot_database.rb +38 -0
  60. data/lib/protk/tool.rb +182 -0
  61. data/lib/protk/xtandem_defaults.rb +11 -0
  62. data/lib/protk.rb +18 -0
  63. metadata +256 -0
@@ -0,0 +1,181 @@
1
+ #
2
+ # This file is part of protk
3
+ # Created by Ira Cooke 13/3/2012
4
+ #
5
+ # Provides support for the manage_db tool
6
+ #
7
+
8
require 'fileutils'
require 'optparse'
require 'ostruct'
require 'yaml'
require 'protk/tool'
11
+
12
# Command line support for the manage_db tool.
#
# Sets up the option parser for the "add", "list" and "update" sub-commands
# and provides helpers for locating the predefined database definitions that
# live in the data directory next to this file, and the databases installed
# beneath the configured protein database root.
class ManageDBTool < Tool

  # Creates the directory for a database and writes its specification file.
  #
  # dbspec - the specification; serialised with to_yaml into ".protkdb.yaml"
  # dbname - name of the database, used as the directory name beneath
  #          Constants#protein_database_root
  def add(dbspec, dbname)
    genv = Constants.new
    dbdir = "#{genv.protein_database_root}/#{dbname}"

    # FileUtils does not spawn a shell, so names containing spaces or shell
    # metacharacters are created literally instead of being interpreted.
    FileUtils.mkdir_p(dbdir)

    File.open("#{dbdir}/.protkdb.yaml", "w") { |file| file.puts(dbspec.to_yaml) }
  end

  # Builds the help text that lists every predefined database.
  #
  # Returns a String with one "name : description" line per definition file;
  # the description is the :description entry of that file.
  def predefined_databases_help
    predefined_definition_files.map do |fn|
      # Definition files ship with protk itself, so they are loaded as trusted YAML.
      desc = YAML.load(File.read(fn))[:description]
      "\t\t\t\t\t#{predefined_name_for(fn)} : #{desc}\n"
    end.join
  end

  # Returns an Array with the names of all predefined database definitions.
  def predefined_names
    predefined_definition_files.map { |fn| predefined_name_for(fn) }
  end

  # Loads the predefined definition called name.
  #
  # Returns the parsed YAML content of "data/predefined_db.<name>.yaml", or an
  # empty Hash when no such definition exists.
  def get_predefined_definition(name)
    filename = "#{File.dirname(__FILE__)}/data/predefined_db.#{name}.yaml"
    return {} unless File.exist?(filename) && predefined_names.include?(name)

    YAML.load(File.read(filename))
  end

  # Lists the installed databases.
  #
  # genv - object responding to protein_database_root
  #
  # Returns an Array with the name of every sub-directory of the database
  # root that contains a ".protkdb.yaml" spec file.
  def all_database_names(genv)
    root = genv.protein_database_root
    Dir.entries(root).select do |db_subdir|
      # "." and ".." are directory bookkeeping entries, never databases
      next false if db_subdir == "." || db_subdir == ".."

      File.exist?("#{root}/#{db_subdir}/.protkdb.yaml")
    end
  end

  # Returns the path of the rakefile that sits next to this file.
  def rakefile_path
    "#{File.dirname(__FILE__)}/manage_db_rakefile.rake"
  end

  # Initializes the commandline options
  #
  # command - the manage_db sub-command: "add", "list" or "update". Any other
  #           value leaves the parser with an empty banner and no extra options.
  def initialize(command)
    super({:help=>false})

    @option_parser.banner=""

    case command
    when "add"
      configure_add_options
    when "list"
      configure_list_options
    when "update"
      @option_parser.banner = "Update protein databases.\nUsage: manage_db.rb update <dbname>"
    end
  end

  private

  # Paths of all predefined database definition files shipped alongside this file.
  def predefined_definition_files
    Dir.glob("#{File.dirname(__FILE__)}/data/predefined_db.*")
  end

  # Extracts the database name (the second dot separated field) from the path
  # of a definition file, e.g. "predefined_db.crap.yaml" gives "crap".
  def predefined_name_for(definition_file)
    File.basename(definition_file).split(".")[1]
  end

  # Registers an on/off switch: the option starts out false and is set to true
  # when the switch is present on the command line.
  def define_boolean_switch(attribute, *switch_args)
    @options.send("#{attribute}=", false)
    @option_parser.on(*switch_args) { @options.send("#{attribute}=", true) }
  end

  # Registers the options of the "add" sub-command. Options are declared in
  # the order in which they should appear in the generated help.
  def configure_add_options
    @options.sources=[]

    define_boolean_switch(:predefined, '--predefined', "Install a database from one of protk's predefined definitions.\n\t\t\t\t\tAvailable definitions are;\n#{predefined_databases_help}")

    @option_parser.on( '--db-source dbname', 'A named database to use as an input source. Multiple db sources may be specified' ) do |db|
      @options.sources.push db
    end

    @option_parser.on( '--file-source fs', 'A file path to a fasta file to use as an input source. Multiple file sources may be specified' ) do |fs|
      @options.sources.push fs
    end

    @option_parser.on( '--ftp-source fs', "A space separated pair of urls. \n\t\t\t\t\tThe first is an ftp url to a fasta file to use as an input source.\n\t\t\t\t\tThe second is an ftp url to release notes file or other file which can be checked to see if the database requires an update. If no such url exists type \"none\" \n\t\t\t\t\tMultiple ftp sources may be specified" ) do |ftps|
      # An ftp source is stored as a [fasta_url, release_notes_url] pair
      @options.sources.push ftps.split(/\s+/)
    end

    @options.include_filters=[]
    @option_parser.on( '--include-filters rx', "A comma separated series of regular expressions to use as filters. \n\t\t\t\t\tEach time this argument is encountered it adds a set of filters for another source file, in the order that source files were added. \n\t\t\t\t\tIf you use multiple source files you will need multiple --include-filters" ) do |tx|
      match = tx.match(/\/(.*)\//)
      # ArgumentError keeps "rescue ArgumentError" callers working; the previous
      # throw of a String surfaced as an uncaught-throw error of that family.
      raise ArgumentError, "Specified include filter #{tx} is not in the format /regex1/,/regex2/" unless match

      # Strip the outermost slashes, then split "re1/,/re2" into its parts
      @options.include_filters.push match[1].split(/\/,\//)
    end

    @options.id_regexes=[]
    @option_parser.on( '--id-regex rx', 'A regular expression with a single capture group for capturing the protein ID from a fasta description line' ) do |rx|
      # Drop optional surrounding slashes without modifying the caller's string
      @options.id_regexes.push rx.gsub(/^\//,'').gsub(/\/$/,'')
    end

    define_boolean_switch(:make_blast_index, '--make-blast-index', 'Create a blast index of the database (required for OMSSA searches)')
    define_boolean_switch(:make_msgf_index, '--make-msgf-index', 'Create an index suitable for msgf plus (required for msgfplus searches)')
    define_boolean_switch(:decoys, '--add-decoys', 'Add random sequences to be used as decoys to the database (required for OMSSA searches)')
    define_boolean_switch(:archive_old, '--archive-old', 'Don\'t delete old fasta files when updating to a newer version')

    @options.decoy_prefix="decoy_"
    @option_parser.on( '--decoy-prefix pref', 'Define a prefix string to prepend to protein ID\'s used as decoys' ) do |pref|
      @options.decoy_prefix=pref
    end

    define_boolean_switch(:update_spec, '--update-spec', 'Change the specification for an existing database by updating its spec file')
    define_boolean_switch(:is_annotation_db, '--annotation-db', 'This database is not for searching but for annotating search results (eg Swissprot .dat file)')

    @options.db_format="fasta"
    @option_parser.on( '--db-format format', 'Format of the database file (fasta or dat). Default is fasta' ) do |format|
      @options.db_format=format
    end

    @option_parser.banner = "Add new protein databases.\nUsage: manage_db.rb add [options] <dbname>"
  end

  # Registers the options of the "list" sub-command.
  def configure_list_options
    @option_parser.banner = "List protein databases.\nUsage: manage_db.rb list"

    define_boolean_switch(:verbose, '-v', '--verbose', 'Display detailed specification for each installed database')
    define_boolean_switch(:galaxy, '-g' ,'--generate-loc-file', 'Generate a galaxy loc file')
    define_boolean_switch(:galaxy_write, '-G' ,'--write-loc-file', 'Update the pepxml_databases.loc file in galaxy if a galaxy_root directory has been configured and the file exists')
  end

end
@@ -0,0 +1,63 @@
1
+
2
# Helpers for working with Mascot result (.dat) files and the MGF spectrum
# files that were searched.
class MascotUtil

  # Reads a mascot dat file and returns the basename of the original search file
  #
  # dat_file - path of the dat file
  #
  # Returns the file name taken from the "File=" line with its directory
  # (either "/" or "\" separated) and its extension removed. When several
  # lines match the last one wins; an empty String is returned when none does.
  def self.input_basename(dat_file)
    filename = ""

    # foreach closes the file for us once all lines have been read
    File.foreach(dat_file) do |line|
      # The greedy optional prefix swallows everything up to the LAST path
      # separator, so the capture group only holds the bare file name.
      m = line.match(/^File=(?:.*[\/\\])?(.*)\.[md][ga][tf]/i)
      filename = m[1] if m
    end

    filename
  end

  # Strips the charge state from the end of a spectrum title.
  #
  # tstring - a title with at least three "." separated fields followed either
  #           by a bare "." or by "." and a single digit charge
  #
  # Returns a new String without the trailing "." / ".<digit>"; tstring itself
  # is left unmodified.
  # Raises ArgumentError when the title has neither format.
  def self.remove_charge_from_title_string(tstring)
    return tstring.chop if tstring =~ /(.*)\..*?\..*?\.$/

    # chop (not chop!) so that the caller's string is not changed behind its back
    return tstring.chop.chop if tstring =~ /(.*)\..*?\..*?\.\d$/

    # ArgumentError keeps "rescue ArgumentError" callers working; the previous
    # throw of a String surfaced as an uncaught-throw error of that family.
    raise ArgumentError, "Unrecognised title string format #{tstring}"
  end

  # Create a hashtable which maps spectrum references to retention times for an mgf file
  #
  # mgf_file - path of the mgf file
  #
  # Returns a Hash whose keys are spectrum titles with the charge removed and
  # whose values are the RTINSECONDS values as Strings. Spectra lacking either
  # a TITLE or an RTINSECONDS line are skipped.
  def self.index_mgf_times(mgf_file)
    rt_table = {}

    # Read the file one spectrum at a time; each record ends with "END IONS"
    File.foreach(mgf_file, "END IONS") do |record|
      spec = record.match(/TITLE=(.*?)$/)
      rt = record.match(/RTINSECONDS=(.*?)$/)
      next if spec.nil? || rt.nil?

      # Files with Windows line endings leave a "\r" on the captured values
      title = spec[1].chomp("\r")

      # Remove charge from the end of the title
      rt_table[remove_charge_from_title_string(title)] = rt[1].chomp("\r")
    end

    rt_table
  end

end
@@ -0,0 +1,57 @@
1
+ require 'libxml'
2
+ require 'protk/mascot_util'
3
+ include LibXML
4
+
5
+
6
# Helpers for post-processing OMSSA search results.
class OMSSAUtil

  # Reads a pepxml file and modifies it to include retention time info.
  #
  # Each spectrum_query element gets a "retention_time_sec" attribute whose
  # value is looked up in the mgf file, using the element's "spectrum"
  # attribute with its last two characters (the charge) removed as the key.
  #
  # mgf_file    - path of the mgf file the search was run on
  # pepxml_file - path of the pepxml file to annotate
  # over_write  - when true, existing retention_time_sec values are replaced;
  #               when false, an existing value is treated as an error
  # save        - when true the modified document is written back to pepxml_file
  #
  # Returns the modified xml document.
  # Raises ArgumentError when a query has no spectrum attribute, when no
  # retention time is known for a spectrum, or when a retention time is
  # already present and over_write is false.
  #
  # NOTE(review): the XPath below is not namespace qualified; documents that
  # declare the pepXML default namespace may need the namespaced form - confirm.
  def self.add_retention_times(mgf_file,pepxml_file,over_write=false,save=false)
    pepxml_doc = XML::Parser.file(pepxml_file).parse
    rt_table = MascotUtil.index_mgf_times(mgf_file)

    pepxml_doc.find('//spectrum_query').each do |query|
      spectrum = query.attributes["spectrum"]

      # Check for a missing attribute BEFORE touching the value, otherwise the
      # caller gets a NoMethodError instead of this explanation.
      # (ArgumentError keeps "rescue ArgumentError" callers working; the previous
      # throw of a String surfaced as an uncaught-throw error of that family.)
      raise ArgumentError, "No spectrum found for spectrum_query #{query}" if spectrum.nil?

      # Remove charge ... presume it isn't greater than 9!
      spect = spectrum.chop.chop

      retention_time = rt_table[spect]
      raise ArgumentError, "No retention time found for spectrum #{spect}. Most likely MALDI data was converted without specifying MALDI option." if retention_time.nil?

      unless query.attributes["retention_time_sec"].nil? || over_write
        raise ArgumentError, "A retention time value is already present"
      end

      query.attributes["retention_time_sec"] = retention_time
    end

    pepxml_doc.save(pepxml_file) if save

    pepxml_doc
  end

end
@@ -0,0 +1,50 @@
1
+ require 'rubygems'
2
+ require 'bio'
3
+ require 'protk/constants'
4
+ require 'protk/eupathdb_gene_information_table'
5
+
6
# Provides access to PlasmoDB gene annotations stored in a EuPathDB gene
# information table (the "raw.txt" file of the configured plasmodb annotation
# database).
#
class PlasmoDB

  # Opens the gene information table.
  #
  # env - the protk environment to use for configuration and logging; a new
  #       Constants instance is created when nil
  #
  # Raises ArgumentError when the database file does not exist.
  def initialize(env=nil)
    @genv = env.nil? ? Constants.new : env

    database_file="#{@genv.protein_database_root}/#{@genv.plasmodb_annotation_database}/raw.txt"

    # ArgumentError keeps "rescue ArgumentError" callers working; the previous
    # throw of a String surfaced as an uncaught-throw error of that family.
    raise ArgumentError, "The plasmodb database at \"#{database_file}\" does not exist" unless File.exist?(database_file)

    @db_object=EuPathDBGeneInformationFileExtractor.new(database_file)
  end

  # Looks up the gene information for a gene.
  #
  # name - name of the gene
  #
  # Returns whatever the extractor returns for the gene, or nil when the
  # lookup fails. Failures - an error raised by the extractor as well as a
  # nil result - are logged as a warning rather than raised.
  def get_entry_for_name(name)
    @genv.log("Getting entry for #{name}",:info)

    result = begin
      # NOTE(review): the meaning of the 10000 argument is defined by
      # EuPathDBGeneInformationFileExtractor#extract_gene_info - confirm there.
      @db_object.extract_gene_info(name,10000)
    rescue StandardError
      # Deliberately best effort: a failed lookup is reported below, not raised
      nil
    end

    # The check lives outside the rescue clause so that a nil result without
    # an exception is reported as well.
    if result.nil?
      @genv.log("Failed to find PlasmoDB entry for gene named #{name} in database",:warn)
    end

    result
  end

end
@@ -0,0 +1,85 @@
1
+ #
2
+ # This file is part of protk
3
+ # Created by Ira Cooke 16/12/2010
4
+ #
5
+ # Provides common functionality used by xinteract tools provided by the TPP. Includes PeptideProphet, InterProphet and ProteinProphet
6
+ #
7
+
8
+ require 'optparse'
9
+ require 'ostruct'
10
+ require 'pathname'
11
+ require 'libxml'
12
+ require 'protk/search_tool'
13
+
14
# Common functionality for the xinteract based tools (PeptideProphet,
# InterProphet and ProteinProphet): shared commandline options plus helpers
# that read the database and search engine names from an xml input file.
class ProphetTool < SearchTool

  include LibXML

  # Initializes the commandline options
  #
  # option_support - Hash of options passed on to SearchTool; :prefix_suffix
  #                  and :over_write are always switched on
  def initialize(option_support={})
    # merge builds a new Hash, so the caller's Hash is not modified
    super(option_support.merge(:prefix_suffix=>true, :over_write=>true))
  end

  # Obtain the database name from the given input file
  #
  # file_name - path of a pep.xml or prot.xml file
  #
  # Returns the local_path attribute of the first search_database element
  # (pep.xml) or the reference_database attribute of the first
  # protein_summary_header element (prot.xml), whichever comes first in the
  # file; nil when the file contains neither.
  def extract_db(file_name)
    first_attribute_value(file_name,
                          "search_database"=>"local_path",
                          "protein_summary_header"=>"reference_database")
  end

  # Obtain the search engine name from the input file
  # The name of the engine is returned in lowercase and should contain no spaces
  #
  # file_name - path of a pep.xml file
  #
  # Returns the search_engine attribute of the first search_summary element
  # with spaces replaced by "_", the characters "(", ")" and "!" removed, and
  # converted to lowercase; nil when the element or the attribute is missing.
  def extract_engine(file_name)
    engine_name = first_attribute_value(file_name, "search_summary"=>"search_engine")
    return nil if engine_name.nil?

    engine_name.tr(" ", "_").delete("()!").downcase
  end

  private

  # Scans an xml file and returns an attribute of the first matching element.
  #
  # file_name             - path of the xml file
  # attribute_for_element - Hash mapping element names to the name of the
  #                         attribute wanted from that element
  #
  # Returns the attribute value of the first element whose name is a key of
  # attribute_for_element, or nil when there is no such element.
  # Raises ArgumentError when no reader could be created for the file.
  def first_attribute_value(file_name, attribute_for_element)
    reader = XML::Reader.file(file_name)
    # ArgumentError keeps "rescue ArgumentError" callers working; the previous
    # throw of a String surfaced as an uncaught-throw error of that family.
    raise ArgumentError, "Failed to open xml file #{file_name}" if reader.nil?

    begin
      while reader.read
        attribute = attribute_for_element[reader.name]
        # The value is read before the ensure clause closes the reader
        return reader.expand[attribute] if attribute
      end
      nil
    ensure
      # Close the reader on every path, including "element not found"
      reader.close
    end
  end

end