protk 1.1.0.pre

Files changed (63)
  1. data/README.md +85 -0
  2. data/bin/annotate_ids.rb +59 -0
  3. data/bin/big_search.rb +41 -0
  4. data/bin/correct_omssa_retention_times.rb +27 -0
  5. data/bin/feature_finder.rb +76 -0
  6. data/bin/file_convert.rb +157 -0
  7. data/bin/generate_omssa_loc.rb +42 -0
  8. data/bin/interprophet.rb +91 -0
  9. data/bin/make_decoy.rb +64 -0
  10. data/bin/manage_db.rb +123 -0
  11. data/bin/mascot_search.rb +187 -0
  12. data/bin/mascot_to_pepxml.rb +44 -0
  13. data/bin/msgfplus_search.rb +191 -0
  14. data/bin/omssa_search.rb +205 -0
  15. data/bin/peptide_prophet.rb +245 -0
  16. data/bin/pepxml_to_table.rb +78 -0
  17. data/bin/protein_prophet.rb +140 -0
  18. data/bin/protk_setup.rb +31 -0
  19. data/bin/repair_run_summary.rb +113 -0
  20. data/bin/tandem_search.rb +292 -0
  21. data/bin/template_search.rb +144 -0
  22. data/bin/unimod_to_loc.rb +118 -0
  23. data/bin/xls_to_table.rb +46 -0
  24. data/ext/protk/extconf.rb +3 -0
  25. data/ext/protk/protk.c +235 -0
  26. data/lib/protk/big_search_rakefile.rake +16 -0
  27. data/lib/protk/big_search_tool.rb +23 -0
  28. data/lib/protk/bio_sptr_extensions.rb +210 -0
  29. data/lib/protk/biotools_excel_converter.rb +60 -0
  30. data/lib/protk/command_runner.rb +84 -0
  31. data/lib/protk/constants.rb +296 -0
  32. data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
  33. data/lib/protk/data/apt-get_packages.yaml +47 -0
  34. data/lib/protk/data/brew_packages.yaml +10 -0
  35. data/lib/protk/data/default_config.yml +20 -0
  36. data/lib/protk/data/predefined_db.crap.yaml +19 -0
  37. data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
  38. data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
  39. data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
  40. data/lib/protk/data/tandem_params.xml +56 -0
  41. data/lib/protk/data/taxonomy_template.xml +9 -0
  42. data/lib/protk/data/unimod.xml +16780 -0
  43. data/lib/protk/eupathdb_gene_information_table.rb +158 -0
  44. data/lib/protk/galaxy_stager.rb +24 -0
  45. data/lib/protk/galaxy_util.rb +9 -0
  46. data/lib/protk/manage_db_rakefile.rake +484 -0
  47. data/lib/protk/manage_db_tool.rb +181 -0
  48. data/lib/protk/mascot_util.rb +63 -0
  49. data/lib/protk/omssa_util.rb +57 -0
  50. data/lib/protk/plasmodb.rb +50 -0
  51. data/lib/protk/prophet_tool.rb +85 -0
  52. data/lib/protk/protein_annotator.rb +646 -0
  53. data/lib/protk/protxml.rb +137 -0
  54. data/lib/protk/randomize.rb +7 -0
  55. data/lib/protk/search_tool.rb +182 -0
  56. data/lib/protk/setup_rakefile.rake +245 -0
  57. data/lib/protk/setup_tool.rb +19 -0
  58. data/lib/protk/spreadsheet_extensions.rb +78 -0
  59. data/lib/protk/swissprot_database.rb +38 -0
  60. data/lib/protk/tool.rb +182 -0
  61. data/lib/protk/xtandem_defaults.rb +11 -0
  62. data/lib/protk.rb +18 -0
  63. metadata +256 -0
data/lib/protk/eupathdb_gene_information_table.rb
@@ -0,0 +1,158 @@
+ # Code for interacting with EuPathDB gene information files e.g. http://cryptodb.org/common/downloads/release-4.3/Cmuris/txt/CmurisGene_CryptoDB-4.3.txt
+ # These gene information files contain a large amount of information about individual genes/proteins in EuPathDBs.
+
+ require 'tempfile'
+
+ # A class for extracting the information about a single gene from a gene information file
+ class EuPathDBGeneInformationFileExtractor
+   # A filename path to the gene information file
+   attr_accessor :filename
+
+   def initialize(filename = nil)
+     @filename = filename
+   end
+
+   # Returns a EuPathDBGeneInformation object corresponding to the wanted gene id. If
+   # there are multiple matches in the file, only the first is returned. If none are found, nil is returned.
+   #
+   # If grep_hack_lines is defined (as an integer), a shortcut is applied to speed things up:
+   # before parsing the gene info file, grep that many lines after the "Gene Id: .." line and
+   # feed only those lines to the parser.
+   def extract_gene_info(wanted_gene_id, grep_hack_lines = nil)
+     inside_iterator = lambda do |gene|
+       return gene if wanted_gene_id == gene.info['Gene Id']
+     end
+
+     filename = @filename
+     p @filename
+     if grep_hack_lines and grep_hack_lines.to_i != 0
+       tempfile = Tempfile.new('reubypathdb_grep_hack')
+       # grep the given number of lines past the matching point. Rather dodgy, but faster.
+       raise Exception, "grep_hack_lines should be an integer" unless grep_hack_lines.is_a?(Integer)
+       `grep -A #{grep_hack_lines} 'Gene Id: #{wanted_gene_id}' '#{@filename}' >#{tempfile.path}`
+       EuPathDBGeneInformationTable.new(File.open(tempfile.path)).each do |gene|
+         found = inside_iterator.call(gene)
+         return found unless found.nil?
+       end
+     else
+       # no grep hack. Parse the whole gene information file
+       EuPathDBGeneInformationTable.new(File.open(@filename)).each do |gene|
+         found = inside_iterator.call(gene)
+         return found unless found.nil?
+       end
+     end
+     return nil
+   end
+ end
+
+ # A class for parsing the 'gene information table' files from EuPathDB, such
+ # as http://cryptodb.org/common/downloads/release-4.3/Cmuris/txt/CmurisGene_CryptoDB-4.3.txt
+ #
+ # The usual way of interacting with these is the use of the each method,
+ # which returns a EuPathDBGeneInformation object with all of the recorded
+ # information in it.
+ class EuPathDBGeneInformationTable
+   include Enumerable
+
+   # Initialise using an IO object, say File.open('/path/to/CmurisGene_CryptoDB-4.3.txt'). After opening, the #each method can be used to iterate over the genes that are present in the file
+   def initialize(io)
+     @io = io
+   end
+
+   # Return a EuPathDBGeneInformation object with
+   # the contained info in it, one at a time
+   def each
+     while g = next_gene
+       yield g
+     end
+   end
+
+   # Returns a EuPathDBGeneInformation object with all the data you could
+   # possibly want.
+   def next_gene
+     info = EuPathDBGeneInformation.new
+
+     # first, read the table, which should start with the ID column
+     line = @io.readline.strip
+     while line == ''
+       return nil if @io.eof?
+       line = @io.readline.strip
+     end
+
+     while line != ''
+       if matches = line.match(/^(.*?)\: (.*)$/)
+         info.add_information(matches[1], matches[2])
+       else
+         raise Exception, "EuPathDBGeneInformationTable Couldn't parse this line: #{line}"
+       end
+
+       line = @io.readline.strip
+     end
+
+     # now read each of the tables, which should start with the
+     # 'TABLE: <name>' entry
+     line = @io.readline.strip
+     table_name = nil
+     headers = nil
+     data = []
+     while line != '------------------------------------------------------------'
+       if line == ''
+         # add it to the stack unless we are just starting out
+         info.add_table(table_name, headers, data) unless table_name.nil?
+
+         # reset things
+         table_name = nil
+         headers = nil
+         data = []
+       elsif matches = line.match(/^TABLE\: (.*)$/)
+         # name of a table
+         table_name = matches[1]
+       elsif line.match(/^\[.*\]/)
+         # headings of the table
+         headers = line.split("\t").collect do |header|
+           header.gsub(/^\[/,'').gsub(/\]$/,'')
+         end
+       else
+         # a proper data row
+         data.push line.split("\t")
+       end
+       line = @io.readline.strip
+     end
+
+     # return the object that has been created
+     return info
+   end
+ end
+
+ # Each gene in the gene information table is represented
+ # by 2 types of information - info and tables.
+ # info are 1 line data, whereas tables are tables of
+ # data with possibly multiple rows
+ class EuPathDBGeneInformation
+   def info
+     @info
+   end
+
+   def get_info(key)
+     @info[key]
+   end
+   alias_method :[], :get_info
+
+   def get_table(table_name)
+     @tables[table_name]
+   end
+
+   def add_information(key, value)
+     @info ||= {}
+     @info[key] = value
+     "Added info #{key}, now is #{@info[key]}"
+   end
+
+   def add_table(name, headers, data)
+     @tables ||= {}
+     @tables[name] = []
+     data.each do |row|
+       final = {}
+       row.each_with_index do |cell, i|
+         final[headers[i]] = cell
+       end
+       @tables[name].push final
+     end
+   end
+ end
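
To make the API above concrete, here is a minimal usage sketch (the file path, gene id, and the 'Gene Type' and 'GO Terms' field/table names are illustrative only, not values taken from protk):

    require 'protk/eupathdb_gene_information_table'

    # Iterate over every gene record in a downloaded gene information file
    EuPathDBGeneInformationTable.new(File.open('CmurisGene_CryptoDB-4.3.txt')).each do |gene|
      puts gene.info['Gene Id']          # one-line info field
      puts gene['Gene Type']             # [] is an alias for get_info
      puts gene.get_table('GO Terms')    # table rows come back as an array of {header => value} hashes
    end

    # Or pull out a single gene, grepping 500 lines around its "Gene Id:" entry to avoid a full parse
    extractor = EuPathDBGeneInformationFileExtractor.new('CmurisGene_CryptoDB-4.3.txt')
    gene = extractor.extract_gene_info('cgd7_230', 500)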
data/lib/protk/galaxy_stager.rb
@@ -0,0 +1,24 @@
+ require 'pathname'
+
+ class GalaxyStager
+   attr_accessor :staged_path
+
+   def initialize(original_path, options = {})
+     options = { :name => nil, :extension => '' }.merge(options)
+     @original_path = Pathname.new(original_path)
+     @wd = Dir.pwd
+     staged_name = options[:name] || @original_path.basename
+     @staged_path = File.join(@wd, "#{staged_name}#{options[:extension]}")
+     File.symlink(@original_path, @staged_path)
+   end
+
+   def restore_references(in_file)
+     GalaxyStager.replace_references(in_file, @staged_path, @original_path)
+   end
+
+   def self.replace_references(in_file, from_path, to_path)
+     cmd = "ruby -pi -e \"gsub('#{from_path}', '#{to_path}')\" #{in_file}"
+     %x[#{cmd}]
+   end
+
+ end
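
GalaxyStager appears to exist so that the bin/ wrappers can work on Galaxy datasets under friendlier local names. A minimal sketch of the intended flow (the paths and the .mzML extension are invented for illustration):

    require 'protk/galaxy_stager'

    # Symlink the Galaxy dataset into the working directory under a predictable name/extension
    stager = GalaxyStager.new('/galaxy/database/files/dataset_001.dat',
                              :name => 'input', :extension => '.mzML')
    puts stager.staged_path   # => "<cwd>/input.mzML"

    # ... run a tool whose output embeds stager.staged_path ...

    # Rewrite references to the staged path back to the original Galaxy path
    stager.restore_references('output.pep.xml')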
data/lib/protk/galaxy_util.rb
@@ -0,0 +1,9 @@
+ class GalaxyUtil
+
+   def self.for_galaxy
+     for_galaxy = ARGV[0] == "--galaxy"
+     ARGV.shift if for_galaxy
+     return for_galaxy
+   end
+
+ end
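
A small helper the bin/ scripts can call before option parsing. Sketch of its use (the script name and arguments are hypothetical):

    require 'protk/galaxy_util'

    # e.g. invoked as: some_tool.rb --galaxy input.mzML
    for_galaxy = GalaxyUtil.for_galaxy   # true if ARGV[0] == "--galaxy"; also shifts the flag off ARGV
    # ARGV now starts at the real arguments, and staging can be enabled when for_galaxy is true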
data/lib/protk/manage_db_rakefile.rake
@@ -0,0 +1,484 @@
+ require 'protk/constants'
+ require 'protk/randomize'
+ require 'uri'
+ require 'digest/md5'
+ require 'net/ftp'
+ require 'net/ftp/list'
+ require 'bio'
+ require 'tempfile'
+ require 'pp'
+ require 'set'
+
+ dbname=ARGV[0]
+
+ # Load database spec file
+ #
+ $genv=Constants.new()
+ dbdir="#{$genv.protein_database_root}/#{dbname}"
+
+ dbspec_file="#{dbdir}/.protkdb.yaml"
+ dbspec=YAML.load_file "#{dbspec_file}"
+
+ format = dbspec[:format]!=nil ? dbspec[:format] : "fasta"
+
+ # Output database filename
+ #
+ db_filename="#{dbdir}/current.#{format}"
+
+ #####################
+ # Utility Functions #
+ #####################
+
+
+ def check_ftp_release_notes(release_notes)
+   rn_uri = URI.parse(release_notes)
+
+   rn_path="#{$genv.database_downloads}/#{rn_uri.host}/#{rn_uri.path}"
+
+   host=rn_uri.host
+   Net::FTP.open(host) do |ftp|
+
+     ftp.login
+     rn_dir=Pathname.new(rn_uri.path).dirname.to_s
+     rn_file=Pathname.new(rn_uri.path).basename.to_s
+     ftp.chdir(rn_dir)
+
+     ftp.passive=true
+
+     p "Checking release notes"
+
+     # Is the last path component a wildcard expression (we only allow *)
+     # If so we need to find the file with the most recent modification time
+     #
+     if ( rn_file =~ /\*/)
+       entries=ftp.list(rn_file)
+       p entries
+       latest_file=nil
+       latest_file_mtime=nil
+       entries.each do |dir_entry|
+         info=Net::FTP::List.parse(dir_entry)
+         if ( info.file? )
+           latest_file_mtime = info.mtime if ( latest_file_mtime == nil )
+           latest_file = info.basename if ( latest_file == nil )
+
+           if ( info.mtime > latest_file_mtime ) # entry's mtime is later
+             latest_file_mtime=info.mtime
+             latest_file=info.basename
+           end
+         end
+       end
+
+       throw "No release notes found" if ( latest_file ==nil)
+
+       rn_file=latest_file
+
+       # Adjust the rn_path to be the path of the latest file
+       #
+       rn_path="#{Pathname.new(rn_path).dirname}/#{latest_file}"
+     end
+
+     # Hash existing release notes data if it exists
+     #
+     existing_digest=nil
+     existing_digest=Digest::MD5.hexdigest(File.read(rn_path)) if Pathname.new(rn_path).exist?
+
+     rn_data=""
+     dl_file=Tempfile.new("rn_file")
+
+     ftp.getbinaryfile(rn_file,dl_file.path) { |data| rn_data << data }
+
+     rn_digest=Digest::MD5.hexdigest(rn_data)
+
+     p "Done Downloading release notes #{ftp} #{rn_file} to #{dl_file.path} #{ftp.pwd}"
+
+     throw "No release notes data at #{release_notes}" unless rn_digest!=nil
+
+     # Update release notes data
+     case
+     when ( existing_digest != rn_digest )
+       FileUtils.mkpath(Pathname.new(rn_path).dirname.to_s)
+       File.open(rn_path, "w") {|file| file.puts(rn_data) }
+     else
+       p "Release notes are up to date"
+     end
+   end
+ end
+
+ def download_ftp_file(ftp,file_name,dest_dir)
+   dest_path="#{dest_dir}/#{file_name}"
+
+   download_size=ftp.size(file_name)
+   mod_time=ftp.mtime(file_name,true)
+
+   percent_size=download_size/100
+   i=1
+   pc_complete=0
+   last_time=Time.new
+   p "Downloading #{file_name}"
+   ftp.passive=true
+
+   ftp.getbinaryfile(file_name,dest_path,1024) { |data|
+     progress=i*1024
+     if ( pc_complete < progress.divmod(percent_size)[0] && ( Time.new - last_time) > 10 )
+       pc_complete=progress.divmod(percent_size)[0]
+       p "Downloading #{file_name} #{pc_complete} percent complete"
+       last_time=Time.new
+     end
+     i=i+1
+   }
+
+ end
+
+ def download_ftp_source(source)
+
+   data_uri = URI.parse(source)
+
+   data_path="#{$genv.database_downloads}/#{data_uri.host}/#{data_uri.path}"
+   # Make sure our destination dir is available
+   #
+   FileUtils.mkpath(Pathname.new(data_path).dirname.to_s)
+
+   Net::FTP.open(data_uri.host) do |ftp|
+     p "Connected to #{data_uri.host}"
+     ftp.login
+
+     ftp.chdir(Pathname.new(data_uri.path).dirname.to_s)
+
+     last_path_component=Pathname.new(data_uri.path).basename.to_s
+
+     case
+     when last_path_component=~/\*/ # A wildcard match. Need to download them all
+       p "Getting directory listing for #{last_path_component}"
+       ftp.passive=true
+       matching_items=ftp.list(last_path_component)
+
+       PP.pp(matching_items)
+
+       matching_items.each do |dir_entry|
+         info=Net::FTP::List.parse(dir_entry)
+         download_ftp_file(ftp,info.basename,Pathname.new(data_path).dirname)
+       end
+
+     else # Just one file to download
+       download_ftp_file(ftp,last_path_component,Pathname.new(data_path).dirname)
+     end
+
+   end
+
+ end
+
+ def archive_fasta_file(filename)
+   if ( Pathname.new(filename).exist? )
+     mt=File.new(filename).mtime
+     timestamp="#{mt.year}_#{mt.month}_#{mt.day}"
+     archive_filename="#{filename.gsub(/.fasta$/,'')}_#{timestamp}.fasta"
+     p "Moving old database to #{archive_filename}"
+     FileUtils.mv(filename,archive_filename)
+   end
+ end
+
+ #####################
+ # Source Files      #
+ #####################
+
+ def file_source(raw_source)
+   full_path=raw_source
+   full_path = "#{$genv.protein_database_root}/#{raw_source}" unless ( raw_source =~ /^\//) # relative paths should be relative to the databases dir
+   throw "File source #{full_path} does not exist" unless Pathname.new(full_path).exist?
+   full_path
+ end
+
+ def db_source(db_source)
+   current_release_path = "#{$genv.protein_database_root}/#{db_source}/current.fasta"
+   throw "Database source #{current_release_path} does not exist" unless Pathname.new(current_release_path).exist?
+   current_release_path
+ end
+
+ def ftp_source(ftpsource)
+
+   data_uri=URI.parse(ftpsource[0])
+   data_file_path="#{$genv.database_downloads}/#{data_uri.host}/#{data_uri.path}"
+   unpacked_data_path=data_file_path.gsub(/\.gz$/,'')
+
+   release_notes_url=ftpsource[1]
+   release_notes_exist=true
+   release_notes_exist=false if release_notes_url =~ /^\s*none\s*$/
+   if release_notes_exist
+     data_rn=URI.parse(release_notes_url)
+     release_notes_file_path="#{$genv.database_downloads}/#{data_rn.host}/#{data_rn.path}"
+
+     task :check_rn do
+       check_ftp_release_notes(release_notes_url)
+     end
+
+     file release_notes_file_path => :check_rn
+   else
+     task :check_date do
+
+     end
+   end
+
+   if ( data_file_path=~/\*/) # A wildcard
+     unpacked_data_path=data_file_path.gsub(/\*/,"_all_").gsub(/\.gz$/,'')
+   end
+
+   file unpacked_data_path do # Unpacking. Includes unzipping and/or concatenating
+     download_ftp_source(ftpsource[0])
+
+     case
+     when data_file_path=~/\*/ # Multiple files to unzip/concatenate and we don't know what they are yet
+       file_pattern = Pathname.new(data_file_path).basename.to_s
+       if file_pattern =~ /\.gz$/
+         unzipcmd="gunzip -vdf #{file_pattern}"
+         p "Unzipping #{unzipcmd} ... this could take a while"
+         sh %{ cd #{Pathname.new(data_file_path).dirname}; #{unzipcmd} }
+       end
+
+       file_pattern.gsub!(/\.gz$/,'')
+       catcmd="cat #{file_pattern} > #{unpacked_data_path}"
+
+       p "Concatenating files #{catcmd} ... this could take a while"
+       sh %{ cd #{Pathname.new(data_file_path).dirname}; #{catcmd} }
+
+     else # Simple case. A single file
+       if data_file_path =~ /\.gz$/
+         p "Unzipping #{Pathname.new(data_file_path).basename} ... "
+         sh %{ cd #{Pathname.new(data_file_path).dirname}; gunzip -f #{Pathname.new(data_file_path).basename} }
+       end
+     end
+   end
+
+   file unpacked_data_path => release_notes_file_path if release_notes_exist
+
+   unpacked_data_path
+ end
+
+ source_files=dbspec[:sources].collect do |raw_source|
+   sf=""
+   case
+   when raw_source.class==Array
+     sf=ftp_source(raw_source)
+   when (raw_source =~ /\.fasta$/ || raw_source =~ /\.txt$/ || raw_source =~ /\.dat$/ )
+     sf=file_source(raw_source)
+   else
+     sf=db_source(raw_source)
+   end
+   sf
+ end
+
+ ########################
+ # Concat Filter Copy   #
+ ########################
+
+ raw_db_filename = "#{dbdir}/raw.#{format}"
+
+ file raw_db_filename => [source_files,dbspec_file].flatten do
+
+   source_filters=dbspec[:include_filters]
+
+   if ( format == "fasta" && source_filters.length > 0 ) # We can perform concat and filter for fasta only
+
+     archive_fasta_file(raw_db_filename) if dbspec[:archive_old]
+
+     output_fh=File.open(raw_db_filename, "w")
+
+     id_regexes=dbspec[:id_regexes]
+     source_i=0
+     throw "The number of source files #{source_files.length} should equal the number of source filters #{source_filters.length}" unless source_filters.length == source_files.length
+     throw "The number of source files #{source_files.length} should equal the number of id regexes #{id_regexes.length}" unless source_filters.length == id_regexes.length
+
+     added_ids=Set.new
+
+     source_files.each do |source|
+       # Open source as Fasta
+       #
+       Bio::FlatFile.open(Bio::FastaFormat, source) do |ff|
+         p "Reading source file #{source}"
+
+         n_match=0
+
+         filters=source_filters[source_i] # An array of filters for this input file
+         id_regex=/#{id_regexes[source_i]}/
+
+         ff.each do |entry|
+           filters.each do |filter|
+             if ( entry.definition =~ /#{filter}/)
+               n_match=n_match+1
+               idmatch=id_regex.match(entry.definition)
+               case
+               when idmatch==nil || idmatch[1]==nil
+                 p "No match to id regex #{id_regex} for #{entry.definition}. Skipping this entry"
+               else
+                 new_def="#{idmatch[1]}"
+                 if ( added_ids.include?(new_def) )
+                   p "Warning: Skipping duplicate definition for #{new_def}"
+                 else
+                   entry.definition=new_def
+                   output_fh.puts(entry.to_s)
+                   added_ids.add new_def
+                 end
+                 # p entry.definition.to_s
+               end
+               break
+             end
+           end
+         end
+         p "Warning no match to any filter in #{filters} for source file #{source}" unless n_match > 0
+       end
+       source_i=source_i+1
+     end
+     output_fh.close
+   else # Other formats just copy a file across ... must be a single source
+
+     throw "Only a single source file is permitted for formats other than fasta" unless source_files.length == 1
+
+     sh "cp #{source_files[0]} #{raw_db_filename}" do |ok,res|
+       if ! ok
+         puts "Unable to copy #{source_files[0]} to #{raw_db_filename}"
+       end
+     end
+
+   end
+ end
+
+ #####################
+ # Decoys            #
+ #####################
+
+ decoy_db_filename = "#{dbdir}/with_decoys.fasta"
+ file decoy_db_filename => raw_db_filename do
+
+   archive_fasta_file(decoy_db_filename) if dbspec[:archive_old]
+
+   decoys_filename = "#{dbdir}/decoys_only.fasta"
+   decoy_prefix=dbspec[:decoy_prefix]
+
+   # Count entries in the raw input file
+   #
+   ff=Bio::FlatFile.open(Bio::FastaFormat, raw_db_filename)
+   db_length=0
+   ff.each do |entry|
+     db_length=db_length+1
+   end
+
+   p "Generating decoy sequences ... this could take a while"
+   # Make decoys, concatenate and delete decoy only file
+   Randomize.make_decoys raw_db_filename, db_length, decoys_filename, decoy_prefix
+   cmd = "cat #{raw_db_filename} #{decoys_filename} >> #{decoy_db_filename}; rm #{decoys_filename}"
+   sh %{ #{cmd} }
+ end
+
+ # Adjust dependencies depending on whether we're making decoys
+ #
+ case dbspec[:decoys]
+ when true
+   throw "Decoys are only supported for fasta formatted databases" unless format=="fasta"
+   file db_filename => decoy_db_filename
+ else
+   file db_filename => raw_db_filename
+ end
+
+
+ ###################
+ # Symlink Current #
+ ###################
+
+ # Current database file should symlink to raw or decoy
+ #
+ file db_filename do
+   if ( dbspec[:is_annotation_db])
+     db_filename=raw_db_filename # For annotation databases we don't use symlinks at all
+   else
+     # Not an annotation db, so symlink current to either the decoy or raw database
+
+     # source db filename is either decoy or raw
+     #
+     case dbspec[:decoys]
+     when true
+       source_db_filename = decoy_db_filename
+     when false
+       source_db_filename = raw_db_filename
+     end
+
+     p "Current db links to #{source_db_filename}"
+
+     # Symlink to the source file
+     #
+     File.symlink(source_db_filename,db_filename)
+   end
+ end
+
+
+ ###################
+ # Indexing        #
+ ###################
+ if dbspec[:make_blast_index]
+   blast_index_files=FileList.new([".phr"].collect {|ext| "#{db_filename}#{ext}" })
+   # task :make_blast_index => blast_index_files do
+   blast_index_files.each do |indfile|
+     file indfile => db_filename do
+       cmd="cd #{dbdir}; #{$genv.makeblastdb} -in #{db_filename} -parse_seqids -dbtype prot"
+       p "Creating blast index"
+       sh %{ #{cmd} }
+     end
+   end
+
+   task dbname => blast_index_files
+
+ end
+
+ if dbspec[:make_msgf_index]
+   msgf_index_files=FileList.new([".canno"].collect {|ext| "#{db_filename}#{ext}" })
+   # task :make_blast_index => blast_index_files do
+   msgf_index_files.each do |indfile|
+     file indfile => db_filename do
+       cmd="cd #{dbdir}; java -Xmx3500M -cp #{$genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{db_filename} -tda 0"
+       p "Creating msgf index"
+       sh %{ #{cmd} }
+     end
+   end
+
+   task dbname => msgf_index_files
+ end
+
+ if format=="dat" && dbspec[:is_annotation_db]
+   dat_index_files=FileList.new(["config.dat","id_AC.index","key_ID.key"].collect {|file| "#{dbdir}/#{file}"} )
+
+   dat_index_files.each do |indexfile|
+     file indexfile => db_filename do
+       puts "Indexing annotation database"
+       dbclass=Bio::SPTR
+       parser = Bio::FlatFileIndex::Indexer::Parser.new(dbclass, nil, nil)
+       Bio::FlatFileIndex::Indexer::makeindexFlat(dbdir, parser, {}, db_filename)
+     end
+   end
+
+   task dbname => dat_index_files
+
+ end
+
+ #################
+ # Root task     #
+ #################
+
+ task dbname => db_filename
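
The whole rakefile is driven by the .protkdb.yaml spec loaded at the top: sources are fetched or located, concatenated and filtered into raw.<format>, optionally extended with decoys into with_decoys.fasta, symlinked as current.<format>, and then indexed. Below is a sketch of a spec using the keys read above (all values, the URL, and the output path are illustrative only, and in practice the file is written by the manage_db tool rather than by hand):

    require 'yaml'

    dbspec = {
      :format => "fasta",
      # Array sources are [ftp_url, release_notes_url]; plain .fasta/.txt/.dat paths are local
      # files; any other string names another managed database whose current.fasta is reused.
      :sources => [["ftp://example.org/pub/uniprot_sprot.fasta.gz", "none"]],
      :include_filters => [["_HUMAN"]],        # one array of description filters per source
      :id_regexes => ['\|(\S+)\|'],            # one capture group per source; becomes the new definition line
      :decoys => false,
      :decoy_prefix => "decoy_",
      :archive_old => false,
      :is_annotation_db => false,
      :make_blast_index => true,
      :make_msgf_index => false
    }

    File.write("/path/to/protein_databases/sphuman/.protkdb.yaml", dbspec.to_yaml)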