protk 1.2.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +69 -36
- data/bin/gffmerge.rb +16 -5
- data/bin/mascot_search.rb +174 -97
- data/bin/omssa_search.rb +31 -8
- data/bin/protk_setup.rb +0 -3
- data/bin/sixframe.rb +0 -6
- data/bin/tandem_search.rb +97 -6
- data/bin/template_search.rb +144 -0
- data/bin/uniprot_annotation.rb +130 -0
- data/lib/convert_util.rb +27 -0
- data/lib/pepxml.rb +22 -0
- data/lib/protk/big_search_rakefile.rake +16 -0
- data/lib/protk/big_search_tool.rb +23 -0
- data/lib/protk/bio_sptr_extensions.rb +41 -2
- data/lib/protk/constants.rb +13 -0
- data/lib/protk/data/apt-get_packages.yaml +4 -0
- data/lib/protk/data/default_config.yml +1 -0
- data/lib/protk/data/make_uniprot_table.rb +29 -0
- data/lib/protk/data/predefined_db.trembl_annotation.yaml +20 -0
- data/lib/protk/data/uniprot_accessions.loc +96 -0
- data/lib/protk/data/uniprot_accessions_table.txt +97 -0
- data/lib/protk/data/uniprot_input_accessions.loc +95 -0
- data/lib/protk/manage_db_rakefile.rake +25 -11
- data/lib/protk/omssa_util.rb +1 -1
- data/lib/protk/setup_rakefile.rake +39 -2
- metadata +13 -1
data/bin/omssa_search.rb
CHANGED
@@ -40,6 +40,20 @@ search_tool.option_parser.on( '--galaxy-index-dir dir', 'Specify galaxy index di
|
|
40
40
|
search_tool.options.galaxy_index_dir=dir
|
41
41
|
end
|
42
42
|
|
43
|
+
search_tool.options.omx_output=nil
|
44
|
+
search_tool.option_parser.on( '--omx-output path', 'Specify path for additional OMX output (optional).' ) do |path|
|
45
|
+
search_tool.options.omx_output=path
|
46
|
+
end
|
47
|
+
|
48
|
+
if ( ENV['PROTK_OMSSA_NTHREADS'] )
|
49
|
+
search_tool.options.nthreads=ENV['PROTK_OMSSA_NTHREADS']
|
50
|
+
else
|
51
|
+
search_tool.options.nthreads=0
|
52
|
+
end
|
53
|
+
search_tool.option_parser.on( '--nthreads num', 'Number of search threads to use. Default is to use the value in environment variable PROTK_OMSSA_NTHREADS or else to autodetect' ) do |num|
|
54
|
+
search_tool.options.nthreads=num
|
55
|
+
end
|
56
|
+
|
43
57
|
search_tool.option_parser.parse!
|
44
58
|
|
45
59
|
# Environment with global constants
|
@@ -96,7 +110,7 @@ ARGV.each do |filename|
|
|
96
110
|
|
97
111
|
# The basic command
|
98
112
|
#
|
99
|
-
cmd = "#{make_blastdb_cmd} #{genv.omssacl} -d #{current_db} -fm #{input_path} -op #{output_path} -w"
|
113
|
+
cmd = "#{make_blastdb_cmd} #{genv.omssacl} -nt #{search_tool.nthreads} -d #{current_db} -fm #{input_path} -op #{output_path} -w"
|
100
114
|
|
101
115
|
#Missed cleavages
|
102
116
|
#
|
@@ -118,6 +132,10 @@ ARGV.each do |filename|
|
|
118
132
|
end
|
119
133
|
end
|
120
134
|
|
135
|
+
if ( search_tool.omx_output )
|
136
|
+
cmd << " -ox #{search_tool.omx_output} "
|
137
|
+
end
|
138
|
+
|
121
139
|
|
122
140
|
# Precursor tolerance
|
123
141
|
#
|
@@ -150,8 +168,9 @@ ARGV.each do |filename|
|
|
150
168
|
|
151
169
|
# Variable Modifications
|
152
170
|
#
|
153
|
-
if ( search_tool.var_mods !="" && !search_tool.var_mods =~/None/) # Checking for none is to cope with galaxy input
|
171
|
+
if ( search_tool.var_mods !="" && !(search_tool.var_mods =~/None/)) # Checking for none is to cope with galaxy input
|
154
172
|
var_mods = search_tool.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }.join(",")
|
173
|
+
|
155
174
|
if ( var_mods !="" )
|
156
175
|
cmd << " -mv #{var_mods}"
|
157
176
|
end
|
@@ -163,7 +182,7 @@ ARGV.each do |filename|
|
|
163
182
|
end
|
164
183
|
end
|
165
184
|
|
166
|
-
if ( search_tool.fix_mods !="" && !search_tool.fix_mods=~/None/)
|
185
|
+
if ( search_tool.fix_mods !="" && !(search_tool.fix_mods=~/None/))
|
167
186
|
fix_mods = search_tool.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }.join(",")
|
168
187
|
if ( fix_mods !="")
|
169
188
|
cmd << " -mf #{fix_mods}"
|
@@ -182,7 +201,7 @@ ARGV.each do |filename|
|
|
182
201
|
end
|
183
202
|
end
|
184
203
|
|
185
|
-
if ( search_tool.searched_ions !="" && !search_tool.searched_ions=~/None/)
|
204
|
+
if ( search_tool.searched_ions !="" && !(search_tool.searched_ions=~/None/))
|
186
205
|
searched_ions=search_tool.searched_ions.split(",").collect{ |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }.join(",")
|
187
206
|
if ( searched_ions!="")
|
188
207
|
cmd << " -i #{searched_ions}"
|
@@ -204,19 +223,23 @@ ARGV.each do |filename|
|
|
204
223
|
# Intensity cut-off
|
205
224
|
cmd << " -ci #{search_tool.intensity_cut_off}"
|
206
225
|
|
226
|
+
# Send output to logfile. OMSSA Logging does not play well with Ruby Open4
|
227
|
+
cmd << " -logfile omssa.log"
|
228
|
+
|
207
229
|
# Up to here we've formulated the omssa command. The rest is cleanup
|
208
230
|
p "Running:#{cmd}"
|
209
231
|
|
210
232
|
# Add retention time corrections
|
211
233
|
#
|
212
234
|
if (search_tool.options.add_retention_times)
|
213
|
-
|
235
|
+
# TODO: Really correct rts
|
236
|
+
# cmd << "; #{rt_correct_bin} #{output_path} #{input_path} "
|
214
237
|
end
|
215
238
|
|
216
239
|
# Correct the pepXML file
|
217
240
|
#
|
218
|
-
cmd << "; #{repair_script_bin} -N #{input_path} -R mgf #{output_path} --omssa-itol #{search_tool.fragment_tol}"
|
219
|
-
genv.log("Running repair script command #{cmd}",:info)
|
241
|
+
# cmd << "; #{repair_script_bin} -N #{input_path} -R mgf #{output_path} --omssa-itol #{search_tool.fragment_tol}"
|
242
|
+
# genv.log("Running repair script command #{cmd}",:info)
|
220
243
|
|
221
244
|
# Run the search
|
222
245
|
#
|
@@ -234,4 +257,4 @@ ARGV.each do |filename|
|
|
234
257
|
#
|
235
258
|
make_blastdb_cmd=""
|
236
259
|
|
237
|
-
end
|
260
|
+
end
|
data/bin/protk_setup.rb
CHANGED
data/bin/sixframe.rb
CHANGED
@@ -40,12 +40,6 @@ file.each do |entry|
|
|
40
40
|
position_start = position
|
41
41
|
position_end = position_start + orf.length*3 -1
|
42
42
|
|
43
|
-
if ( frame > 3)
|
44
|
-
position_start = length - (position - 1)
|
45
|
-
position_end = position_start - orf.length * 3 + 1
|
46
|
-
end
|
47
|
-
|
48
|
-
|
49
43
|
# Create accession compliant with NCBI naming standard
|
50
44
|
# See http://www.ncbi.nlm.nih.gov/books/NBK7183/?rendertype=table&id=ch_demo.T5
|
51
45
|
ncbi_scaffold_id = entry.entry_id.gsub('|','_').gsub(' ','_')
|
data/bin/tandem_search.rb
CHANGED
@@ -48,7 +48,18 @@ search_tool.option_parser.on( '--tandem-output tandem_output', 'Keep X! Tandem O
|
|
48
48
|
end
|
49
49
|
|
50
50
|
search_tool.options.thresholds_type = 'isb_kscore'
|
51
|
-
search_tool.option_parser.on( '--thresholds-type thresholds_type', 'Threshold Type (tandem_default, isb_native, isb_kscore, scaffold)' ) do |thresholds_type|
|
51
|
+
search_tool.option_parser.on( '--thresholds-type thresholds_type', 'Threshold Type (tandem_default, isb_native, isb_kscore, scaffold, system_default)' ) do |thresholds_type|
|
52
|
+
# This option sets up various X! Tandem thresholds.
|
53
|
+
# - system_default: Don't change any defaults just use
|
54
|
+
# the defaults for this TPP install as is.
|
55
|
+
# - tandem_default: These thresholds are found on the
|
56
|
+
# tandem api page. http://www.thegpm.org/tandem/api/index.html
|
57
|
+
# - isb_native: These are the defaults found in
|
58
|
+
# isb_default_input_native.xml distributed with TPP 4.6.
|
59
|
+
# - isb_kscore: These are the defaults found in
|
60
|
+
# isb_default_input_kscore.xml distributed with TPP 4.6.
|
61
|
+
# - scaffold: These are the defaults recommended by Proteome Software
|
62
|
+
# for use with Scaffold.
|
52
63
|
search_tool.options.thresholds_type = thresholds_type
|
53
64
|
end
|
54
65
|
|
@@ -57,11 +68,12 @@ search_tool.option_parser.on( '--algorithm algorithm', "Scoring algorithm (kscor
|
|
57
68
|
search_tool.options.algorithm = algorithm
|
58
69
|
end
|
59
70
|
|
60
|
-
search_tool.options.cleavage_semi =
|
61
|
-
search_tool.option_parser.on( '--cleavage-semi' ) do
|
62
|
-
search_tool.options.cleavage_semi =
|
71
|
+
search_tool.options.cleavage_semi = true
|
72
|
+
search_tool.option_parser.on( '--no-cleavage-semi' ) do
|
73
|
+
search_tool.options.cleavage_semi = false
|
63
74
|
end
|
64
75
|
|
76
|
+
|
65
77
|
search_tool.options.n_terminal_mod_mass=nil
|
66
78
|
search_tool.option_parser.on('--n-terminal-mod-mass mass') do |mass|
|
67
79
|
search_tool.options.n_terminal_mod_mass = mass
|
@@ -122,9 +134,16 @@ def decode_modification_string(mstring)
|
|
122
134
|
mstring
|
123
135
|
end
|
124
136
|
|
137
|
+
def set_option(std_params, tandem_key, value)
|
138
|
+
notes = std_params.find("/bioml/note[@type=\"input\" and @label=\"#{tandem_key}\"]")
|
139
|
+
throw "Exactly one parameter named (#{tandem_key}) is required in parameter file" unless notes.length==1
|
140
|
+
notes[0].content=value
|
141
|
+
end
|
142
|
+
|
125
143
|
def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_db,search_tool,genv)
|
126
|
-
|
127
|
-
|
144
|
+
set_option(std_params, "protein, cleavage semi", search_tool.cleavage_semi ? "yes" : "no")
|
145
|
+
set_option(std_params, "scoring, maximum missed cleavage sites", search_tool.missed_cleavages)
|
146
|
+
|
128
147
|
# Set the input and output paths
|
129
148
|
#
|
130
149
|
input_notes=std_params.find('/bioml/note[@type="input" and @label="spectrum, path"]')
|
@@ -184,6 +203,78 @@ def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_d
|
|
184
203
|
isotopic_error[0].content="no"
|
185
204
|
end
|
186
205
|
|
206
|
+
if search_tool.tandem_output
|
207
|
+
# If one is interested in the tandem output (e.g. for consumption by Scaffold)
|
208
|
+
# we want to store additional information.
|
209
|
+
set_option(std_params, "output, spectra", "yes")
|
210
|
+
end
|
211
|
+
|
212
|
+
thresholds_type = search_tool.thresholds_type
|
213
|
+
|
214
|
+
if thresholds_type != "system_default"
|
215
|
+
|
216
|
+
maximum_valid_expectation_value = "0.1"
|
217
|
+
if thresholds_type == "scaffold"
|
218
|
+
maximum_valid_expectation_value = "1000"
|
219
|
+
end
|
220
|
+
|
221
|
+
minimum_ion_count = "4"
|
222
|
+
case thresholds_type
|
223
|
+
when "isb_kscore", "isb_native"
|
224
|
+
minimum_ion_count = "1"
|
225
|
+
when "scaffold"
|
226
|
+
minimum_ion_count = "0"
|
227
|
+
end
|
228
|
+
|
229
|
+
minimum_peaks = "15"
|
230
|
+
case thresholds_type
|
231
|
+
when "isb_native"
|
232
|
+
minimum_peaks = "6"
|
233
|
+
when "isb_kscore"
|
234
|
+
minimum_peaks = "10"
|
235
|
+
when "scaffold"
|
236
|
+
minimum_peaks = "0"
|
237
|
+
end
|
238
|
+
|
239
|
+
minimum_fragement_mz = "150"
|
240
|
+
case thresholds_type
|
241
|
+
when "isb_native"
|
242
|
+
minimum_fragement_mz = "50"
|
243
|
+
when "isb_kscore"
|
244
|
+
minimum_fragement_mz = "125"
|
245
|
+
when "scaffold"
|
246
|
+
minimum_fragement_mz = "0"
|
247
|
+
end
|
248
|
+
|
249
|
+
minimum_parent_mh = "500" # tandem and isb_native defaults
|
250
|
+
case thresholds_type
|
251
|
+
when "isb_kscore"
|
252
|
+
minimum_parent_mh = "600"
|
253
|
+
when "scaffold"
|
254
|
+
minimum_parent_mh = "0"
|
255
|
+
end
|
256
|
+
|
257
|
+
use_noise_suppression = "yes"
|
258
|
+
if thresholds_type == "isb_kscore" or thresholds_type == "scaffold"
|
259
|
+
use_noise_suppression = "no"
|
260
|
+
end
|
261
|
+
|
262
|
+
dynamic_range = "100.0"
|
263
|
+
case thresholds_type
|
264
|
+
when "isb_kscore"
|
265
|
+
dynamic_range = "10000.0"
|
266
|
+
when "scaffold"
|
267
|
+
dynamic_range = "1000.0"
|
268
|
+
end
|
269
|
+
|
270
|
+
set_option(std_params, "spectrum, dynamic range", dynamic_range)
|
271
|
+
set_option(std_params, "spectrum, use noise suppression", use_noise_suppression)
|
272
|
+
set_option(std_params, "spectrum, minimum parent m+h", minimum_parent_mh)
|
273
|
+
set_option(std_params, "spectrum, minimum fragment mz", minimum_fragement_mz)
|
274
|
+
set_option(std_params, "spectrum, minimum peaks", minimum_peaks)
|
275
|
+
set_option(std_params, "scoring, minimum ion count", minimum_ion_count)
|
276
|
+
set_option(std_params, "output, maximum valid expectation value", maximum_valid_expectation_value)
|
277
|
+
end
|
187
278
|
|
188
279
|
# Fixed and Variable Modifications
|
189
280
|
#
|
@@ -0,0 +1,144 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 14/12/2010
|
5
|
+
#
|
6
|
+
# Runs an MS/MS search using the MSGFPlus search engine
|
7
|
+
#
|
8
|
+
require 'protk/search_tool'
|
9
|
+
|
10
|
+
|
11
|
+
# Setup specific command-line options for this tool. Other options are inherited from SearchTool
|
12
|
+
#
|
13
|
+
search_tool=SearchTool.new({:msms_search=>true,:background=>false,:glyco=>true,:database=>true,:explicit_output=>true,:over_write=>true,:msms_search_detailed_options=>true})
|
14
|
+
search_tool.option_parser.banner = "Run an msms search on a set of msms spectrum input files.\n\nUsage: template_search.rb [options] file1.mzML file2.mzML ..."
|
15
|
+
search_tool.options.output_suffix="_template"
|
16
|
+
|
17
|
+
search_tool.options.custom_option="default"
|
18
|
+
search_tool.option_parser.on('--custom-opt value','Custom option relevant to this tool only (Default default)') do |val|
|
19
|
+
search_tool.options.custom_option=val
|
20
|
+
end
|
21
|
+
|
22
|
+
search_tool.option_parser.parse!
|
23
|
+
|
24
|
+
# Set search engine specific parameters on the SearchTool object
|
25
|
+
#
|
26
|
+
msgf_bin="#{genv.msgf_bin}/MSGFPlus.jar"
|
27
|
+
|
28
|
+
case
|
29
|
+
when Pathname.new(search_tool.database).exist? # It's an explicitly named db
|
30
|
+
current_db=Pathname.new(search_tool.database).realpath.to_s
|
31
|
+
else
|
32
|
+
current_db=search_tool.current_database :fasta
|
33
|
+
end
|
34
|
+
|
35
|
+
fragment_tol = search_tool.fragment_tol
|
36
|
+
precursor_tol = search_tool.precursor_tol
|
37
|
+
|
38
|
+
|
39
|
+
throw "When --output is set only one file at a time can be run" if ( ARGV.length> 1 ) && ( search_tool.explicit_output!=nil )
|
40
|
+
|
41
|
+
# Run the search engine on each input file
|
42
|
+
#
|
43
|
+
ARGV.each do |filename|
|
44
|
+
|
45
|
+
if ( search_tool.explicit_output!=nil)
|
46
|
+
output_path=search_tool.explicit_output
|
47
|
+
else
|
48
|
+
output_path="#{search_tool.output_base_path(filename.chomp)}.mzid"
|
49
|
+
end
|
50
|
+
|
51
|
+
# (*.mzML, *.mzXML, *.mgf, *.ms2, *.pkl or *_dta.txt)
|
52
|
+
# Get the input file extension
|
53
|
+
ext = Pathname.new(filename).extname
|
54
|
+
|
55
|
+
|
56
|
+
|
57
|
+
input_path="#{search_tool.input_base_path(filename.chomp)}#{ext}"
|
58
|
+
|
59
|
+
# Only proceed if the output file is not present or we have opted to over-write it
|
60
|
+
#
|
61
|
+
if ( search_tool.over_write || !Pathname.new(output_path).exist? )
|
62
|
+
|
63
|
+
# The basic command
|
64
|
+
#
|
65
|
+
cmd= "java -Xmx#{search_tool.java_mem} -jar #{msgf_bin} -d #{current_db} -s #{input_path} -o #{output_path} "
|
66
|
+
|
67
|
+
#Missed cleavages
|
68
|
+
#
|
69
|
+
throw "Maximum value for missed cleavages is 2" if ( search_tool.missed_cleavages > 2)
|
70
|
+
cmd << " -ntt #{search_tool.missed_cleavages}"
|
71
|
+
|
72
|
+
# Precursor tolerance
|
73
|
+
#
|
74
|
+
cmd << " -t #{search_tool.precursor_tol}#{search_tool.precursor_tolu}"
|
75
|
+
|
76
|
+
# Instrument type
|
77
|
+
#
|
78
|
+
cmd << " -inst 2"
|
79
|
+
|
80
|
+
# cmd << " -m 4"
|
81
|
+
|
82
|
+
cmd << " -addFeatures 1"
|
83
|
+
|
84
|
+
# Enzyme
|
85
|
+
#
|
86
|
+
# if ( search_tool.enzyme!="Trypsin")
|
87
|
+
# cmd << " -e #{search_tool.enzyme}"
|
88
|
+
# end
|
89
|
+
|
90
|
+
mods_path="#{search_tool.input_base_path(filename.chomp)}.msgfplus_mods.txt"
|
91
|
+
mods_file=File.open(mods_path,'w+')
|
92
|
+
|
93
|
+
# Variable Modifications
|
94
|
+
#
|
95
|
+
if ( search_tool.var_mods !="" && !search_tool.var_mods =~/None/) # Checking for none is to cope with galaxy input
|
96
|
+
var_mods = search_tool.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }.join(",")
|
97
|
+
if ( var_mods !="" )
|
98
|
+
cmd << " -mv #{var_mods}"
|
99
|
+
end
|
100
|
+
else
|
101
|
+
# Add options related to peptide modifications
|
102
|
+
#
|
103
|
+
if ( search_tool.glyco )
|
104
|
+
cmd << " -mv 119 "
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
# Fixed modifications
|
109
|
+
#
|
110
|
+
if ( search_tool.fix_mods !="" && !search_tool.fix_mods=~/None/)
|
111
|
+
fix_mods = search_tool.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }.join(",")
|
112
|
+
if ( fix_mods !="")
|
113
|
+
cmd << " -mf #{fix_mods}"
|
114
|
+
end
|
115
|
+
else
|
116
|
+
if ( search_tool.has_modifications )
|
117
|
+
cmd << " -mf "
|
118
|
+
if ( search_tool.carbamidomethyl )
|
119
|
+
cmd<<"3 "
|
120
|
+
end
|
121
|
+
|
122
|
+
if ( search_tool.methionine_oxidation )
|
123
|
+
cmd<<"1 "
|
124
|
+
end
|
125
|
+
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
# Up to here we've formulated the MSGFPlus command. The rest is cleanup
|
130
|
+
p "Running:#{cmd}"
|
131
|
+
|
132
|
+
# Run the search
|
133
|
+
#
|
134
|
+
job_params= {:jobid => search_tool.jobid_from_filename(filename) }
|
135
|
+
job_params[:queue]="lowmem"
|
136
|
+
job_params[:vmem]="900mb"
|
137
|
+
search_tool.run(cmd,genv,job_params)
|
138
|
+
|
139
|
+
|
140
|
+
else
|
141
|
+
genv.log("Skipping search on existing file #{output_path}",:warn)
|
142
|
+
end
|
143
|
+
|
144
|
+
end
|
@@ -0,0 +1,130 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of Protk
|
4
|
+
# Created by Ira Cooke 24/3/2013
|
5
|
+
#
|
6
|
+
# Retrieve annotation information for proteins from the Uniprot Swissprot database
|
7
|
+
#
|
8
|
+
#
|
9
|
+
require 'protk/constants'
|
10
|
+
require 'protk/command_runner'
|
11
|
+
require 'protk/tool'
|
12
|
+
require 'protk/swissprot_database'
|
13
|
+
require 'protk/bio_sptr_extensions'
|
14
|
+
|
15
|
+
|
16
|
+
# Setup specific command-line options for this tool. Other options are inherited from Tool
|
17
|
+
#
|
18
|
+
tool=Tool.new({:explicit_output=>true})
|
19
|
+
tool.option_parser.banner = "Retrieve information from the Uniprot database given a list of ID's.\n\n\
|
20
|
+
Usage: uniprot_annotation.rb [options] input.tsv"
|
21
|
+
|
22
|
+
tool.options.id_column=1
|
23
|
+
tool.option_parser.on( '--id-column num', 'Specify a column for ids (default is column 1)' ) do |col|
|
24
|
+
tool.options.id_column=col.to_i
|
25
|
+
end
|
26
|
+
|
27
|
+
tool.options.fields=nil
|
28
|
+
tool.option_parser.on( '--fields flds', 'A comma separated list of fields to extract' ) do |flds|
|
29
|
+
tool.options.fields=flds
|
30
|
+
end
|
31
|
+
|
32
|
+
tool.option_parser.parse!
|
33
|
+
|
34
|
+
# Obtain a global environment object
|
35
|
+
genv=Constants.new
|
36
|
+
|
37
|
+
input_file=ARGV[0]
|
38
|
+
|
39
|
+
swissprotdb=SwissprotDatabase.new(genv)
|
40
|
+
|
41
|
+
output_file=nil
|
42
|
+
|
43
|
+
if ( tool.explicit_output==nil)
|
44
|
+
output_file=$stdout
|
45
|
+
else
|
46
|
+
output_file=File.open(tool.explicit_output,'w+')
|
47
|
+
end
|
48
|
+
|
49
|
+
ac_column = tool.id_column-1
|
50
|
+
|
51
|
+
db_fields = {
|
52
|
+
'recname'=>"Primary Name",
|
53
|
+
'cd'=>"CD Antigen Name",
|
54
|
+
'altnames'=>"Alternate Names",
|
55
|
+
'location' => "Subcellular Location",
|
56
|
+
'function' => "Known Function",
|
57
|
+
'similarity' => "Similarity",
|
58
|
+
'tissues' => "Tissue Specificity",
|
59
|
+
'disease' => "Disease Association",
|
60
|
+
'domain' => "Domain",
|
61
|
+
'subunit' => "Sub Unit",
|
62
|
+
'nextbio' => "NextBio",
|
63
|
+
'ipi' => "IPI",
|
64
|
+
'intact' => "Interactions",
|
65
|
+
'pride' => 'Pride',
|
66
|
+
'ensembl'=> 'Ensembl',
|
67
|
+
'num_transmem'=>"Transmembrane Regions",
|
68
|
+
'signalp'=>'Signal Peptide',
|
69
|
+
'ref_dump'=>'References',
|
70
|
+
'tax_dump'=>'Taxonomy Cross Ref',
|
71
|
+
'species_dump'=>'Species',
|
72
|
+
'feature_dump'=>'Feature Table',
|
73
|
+
'seq_dump' => 'AA Sequence'
|
74
|
+
}
|
75
|
+
|
76
|
+
hyperlink_fields = {
|
77
|
+
'uniprot_link'=>"Uniprot Link",
|
78
|
+
'nextbio_link'=>'NextBio Link',
|
79
|
+
'intact_link'=>"Interactions Link",
|
80
|
+
'pride_link'=>"Pride Link",
|
81
|
+
'ensembl_link'=>"Ensembl Link"
|
82
|
+
}
|
83
|
+
|
84
|
+
if tool.fields !=nil
|
85
|
+
fields = tool.fields.split(",").collect { |f| f.lstrip.rstrip }.reject {|e| e.empty? }
|
86
|
+
db_fields = db_fields.select { |k| fields.include? k }
|
87
|
+
hyperlink_fields = hyperlink_fields.select { |k| fields.include? k}
|
88
|
+
end
|
89
|
+
|
90
|
+
output_file.write db_fields.values.join("\t")
|
91
|
+
if ( hyperlink_fields.count > 0 )
|
92
|
+
output_file.write("\t")
|
93
|
+
output_file.write hyperlink_fields.values.join("\t")
|
94
|
+
end
|
95
|
+
output_file.write("\n")
|
96
|
+
|
97
|
+
line_num=0
|
98
|
+
File.foreach(input_file) { |line|
|
99
|
+
input_cols=line.split("\t")
|
100
|
+
throw "Not enough columns in line #{line_num}" unless input_cols.count > ac_column
|
101
|
+
accession=input_cols[ac_column].chomp
|
102
|
+
|
103
|
+
sptr_entry=swissprotdb.get_entry_for_name(accession)
|
104
|
+
|
105
|
+
if ( sptr_entry==nil)
|
106
|
+
genv.log("No entry for #{accession} in uniprot database",:warn)
|
107
|
+
else
|
108
|
+
|
109
|
+
db_values = db_fields.collect { |key,value|
|
110
|
+
sptr_entry.send(key)
|
111
|
+
}
|
112
|
+
|
113
|
+
hyperlink_values = hyperlink_fields.collect { |key,value|
|
114
|
+
sptr_entry.send(key)
|
115
|
+
}
|
116
|
+
|
117
|
+
output_file.write db_values.join("\t")
|
118
|
+
if ( hyperlink_fields.count > 0 )
|
119
|
+
output_file.write("\t")
|
120
|
+
output_file.write hyperlink_values.join("\t")
|
121
|
+
end
|
122
|
+
output_file.write "\n"
|
123
|
+
end
|
124
|
+
|
125
|
+
line_num+=1
|
126
|
+
|
127
|
+
}
|
128
|
+
|
129
|
+
|
130
|
+
|