protk 1.1.0.pre

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. data/README.md +85 -0
  2. data/bin/annotate_ids.rb +59 -0
  3. data/bin/big_search.rb +41 -0
  4. data/bin/correct_omssa_retention_times.rb +27 -0
  5. data/bin/feature_finder.rb +76 -0
  6. data/bin/file_convert.rb +157 -0
  7. data/bin/generate_omssa_loc.rb +42 -0
  8. data/bin/interprophet.rb +91 -0
  9. data/bin/make_decoy.rb +64 -0
  10. data/bin/manage_db.rb +123 -0
  11. data/bin/mascot_search.rb +187 -0
  12. data/bin/mascot_to_pepxml.rb +44 -0
  13. data/bin/msgfplus_search.rb +191 -0
  14. data/bin/omssa_search.rb +205 -0
  15. data/bin/peptide_prophet.rb +245 -0
  16. data/bin/pepxml_to_table.rb +78 -0
  17. data/bin/protein_prophet.rb +140 -0
  18. data/bin/protk_setup.rb +31 -0
  19. data/bin/repair_run_summary.rb +113 -0
  20. data/bin/tandem_search.rb +292 -0
  21. data/bin/template_search.rb +144 -0
  22. data/bin/unimod_to_loc.rb +118 -0
  23. data/bin/xls_to_table.rb +46 -0
  24. data/ext/protk/extconf.rb +3 -0
  25. data/ext/protk/protk.c +235 -0
  26. data/lib/protk/big_search_rakefile.rake +16 -0
  27. data/lib/protk/big_search_tool.rb +23 -0
  28. data/lib/protk/bio_sptr_extensions.rb +210 -0
  29. data/lib/protk/biotools_excel_converter.rb +60 -0
  30. data/lib/protk/command_runner.rb +84 -0
  31. data/lib/protk/constants.rb +296 -0
  32. data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
  33. data/lib/protk/data/apt-get_packages.yaml +47 -0
  34. data/lib/protk/data/brew_packages.yaml +10 -0
  35. data/lib/protk/data/default_config.yml +20 -0
  36. data/lib/protk/data/predefined_db.crap.yaml +19 -0
  37. data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
  38. data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
  39. data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
  40. data/lib/protk/data/tandem_params.xml +56 -0
  41. data/lib/protk/data/taxonomy_template.xml +9 -0
  42. data/lib/protk/data/unimod.xml +16780 -0
  43. data/lib/protk/eupathdb_gene_information_table.rb +158 -0
  44. data/lib/protk/galaxy_stager.rb +24 -0
  45. data/lib/protk/galaxy_util.rb +9 -0
  46. data/lib/protk/manage_db_rakefile.rake +484 -0
  47. data/lib/protk/manage_db_tool.rb +181 -0
  48. data/lib/protk/mascot_util.rb +63 -0
  49. data/lib/protk/omssa_util.rb +57 -0
  50. data/lib/protk/plasmodb.rb +50 -0
  51. data/lib/protk/prophet_tool.rb +85 -0
  52. data/lib/protk/protein_annotator.rb +646 -0
  53. data/lib/protk/protxml.rb +137 -0
  54. data/lib/protk/randomize.rb +7 -0
  55. data/lib/protk/search_tool.rb +182 -0
  56. data/lib/protk/setup_rakefile.rake +245 -0
  57. data/lib/protk/setup_tool.rb +19 -0
  58. data/lib/protk/spreadsheet_extensions.rb +78 -0
  59. data/lib/protk/swissprot_database.rb +38 -0
  60. data/lib/protk/tool.rb +182 -0
  61. data/lib/protk/xtandem_defaults.rb +11 -0
  62. data/lib/protk.rb +18 -0
  63. metadata +256 -0
@@ -0,0 +1,158 @@
+ # Code for interacting with EuPathDB gene information files e.g. http://cryptodb.org/common/downloads/release-4.3/Cmuris/txt/CmurisGene_CryptoDB-4.3.txt
+ # These gene information files contain a large amount of information about individual genes/proteins in EuPathDBs.
+
+ require 'tempfile'
+
+ # A class for extracting gene info from a particular gene from the information file
+ class EuPathDBGeneInformationFileExtractor
+ # A filename path to the gene information file
+ attr_accessor :filename
+
+ def initialize(filename = nil)
+ @filename = filename
+ end
+
+ # Returns a EuPathDBGeneInformation object corresponding to the wanted key. If
+ # there are multiple in the file, only the first is returned. If none are found, nil is returned.
+ #
+ # If grep_hack_lines is defined (as an integer), then a shortcut is applied to speed things up. Before parsing the gene info file, grep some lines after the "Gene Id: .." line. Then feed that into the parser.
+ def extract_gene_info(wanted_gene_id, grep_hack_lines = nil)
+ inside_iterator = lambda do |gene|
+ return gene if wanted_gene_id == gene.info['Gene Id']
+ end
+
+ filename = @filename
+ p @filename
+ if grep_hack_lines and grep_hack_lines.to_i != 0
+ tempfile=Tempfile.new('reubypathdb_grep_hack')
+ # grep however many lines from past the point. Rather dodgy, but faster.
+ raise Exception, "grep_hack_lines should be an integer" unless grep_hack_lines.is_a?(Integer)
+ `grep -A #{grep_hack_lines} 'Gene Id: #{wanted_gene_id}' '#{@filename}' >#{tempfile.path}`
+ EuPathDBGeneInformationTable.new(File.open(tempfile.path)).each do |gene|
+ return inside_iterator.call(gene)
+ end
+ else
+ # no grep hack. Parse the whole gene information file
+ EuPathDBGeneInformationTable.new(File.open(@filename)).each do |gene|
+ return inside_iterator.call(gene)
+ end
+ end
+ return nil
+ end
+ end
+
+ # A class for parsing the 'gene information table' files from EuPathDB, such
+ # as http://cryptodb.org/common/downloads/release-4.3/Cmuris/txt/CmurisGene_CryptoDB-4.3.txt
+ #
+ # The usual way of interacting with these is the use of the each method,
+ # which returns a EuPathDBGeneInformation object with all of the recorded
+ # information in it.
+ class EuPathDBGeneInformationTable
+ include Enumerable
+
+ # Initialise using an IO object, say File.open('/path/to/CmurisGene_CryptoDB-4.3.txt'). After opening, the #each method can be used to iterate over the genes that are present in the file
+ def initialize(io)
+ @io = io
+ end
+
+ # Return a EuPathDBGeneInformation object with
+ # the contained info in it, one at a time
+ def each
+ while g = next_gene
+ yield g
+ end
+ end
+
+ # Returns a EuPathDBGeneInformation object with all the data you could
+ # possibly want.
+ def next_gene
+ info = EuPathDBGeneInformation.new
+
+ # first, read the table, which should start with the ID column
+ line = @io.readline.strip
+ while line == ''
+ return nil if @io.eof?
+ line = @io.readline.strip
+ end
+
+ while line != ''
+ if matches = line.match(/^(.*?)\: (.*)$/)
+ info.add_information(matches[1], matches[2])
+ else
+ raise Exception, "EuPathDBGeneInformationTable Couldn't parse this line: #{line}"
+ end
+
+ line = @io.readline.strip
+ end
+
+ # now read each of the tables, which should start with the
+ # 'TABLE: <name>' entry
+ line = @io.readline.strip
+ table_name = nil
+ headers = nil
+ data = []
+ while line != '------------------------------------------------------------'
+ if line == ''
+ # add it to the stack unless we are just starting out
+ info.add_table(table_name, headers, data) unless table_name.nil?
+
+ # reset things
+ table_name = nil
+ headers = nil
+ data = []
+ elsif matches = line.match(/^TABLE\: (.*)$/)
+ # name of a table
+ table_name = matches[1]
+ elsif line.match(/^\[.*\]/)
+ # headings of the table
+ headers = line.split("\t").collect do |header|
+ header.gsub(/^\[/,'').gsub(/\]$/,'')
+ end
+ else
+ # a proper data row
+ data.push line.split("\t")
+ end
+ line = @io.readline.strip
+ end
+
+ # return the object that has been created
+ return info
+ end
+ end
+
+ # Each gene in the gene information table is represented
+ # by 2 types of information - info and tables.
+ # info are 1 line data, whereas tables are tables of
+ # data with possibly multiple rows
+ class EuPathDBGeneInformation
+ def info
+ @info
+ end
+
+ def get_info(key)
+ @info[key]
+ end
+ alias_method :[], :get_info
+
+ def get_table(table_name)
+ @tables[table_name]
+ end
+
+ def add_information(key, value)
+ @info ||= {}
+ @info[key] = value
+ "Added info #{key}, now is #{@info[key]}"
+ end
+
+ def add_table(name, headers, data)
+ @tables ||= {}
+ @tables[name] = []
+ data.each do |row|
+ final = {}
+ row.each_with_index do |cell, i|
+ final[headers[i]] = cell
+ end
+ @tables[name].push final
+ end
+ end
+ end
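The class comments above already describe the intended workflow, so a short usage sketch may help. The file path, gene id, info key and table name below are purely illustrative and not part of the gem:

    require 'protk/eupathdb_gene_information_table'

    # Hypothetical local copy of a EuPathDB gene information file and a gene id to look up
    info_file = '/data/CmurisGene_CryptoDB-4.3.txt'
    gene_id   = 'CMU_000001'

    extractor = EuPathDBGeneInformationFileExtractor.new(info_file)

    # Parse the whole file, or pass an Integer as the second argument to use the grep shortcut
    gene = extractor.extract_gene_info(gene_id)

    unless gene.nil?
      puts gene.info['Gene Id']                  # one-line "info" values
      puts gene['Product Description']           # [] is an alias for get_info; key names depend on the download
      puts gene.get_table('GO Terms').inspect    # "TABLE:" sections become arrays of row hashes
    end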
@@ -0,0 +1,24 @@
+ require 'pathname'
+
+ class GalaxyStager
+ attr_accessor :staged_path
+
+ def initialize(original_path, options = {})
+ options = { :name => nil, :extension => '' }.merge(options)
+ @original_path = Pathname.new(original_path)
+ @wd = Dir.pwd
+ staged_name = options[:name] || @original_path.basename
+ @staged_path = File.join(@wd, "#{staged_name}#{options[:extension]}")
+ File.symlink(@original_path, @staged_path)
+ end
+
+ def restore_references(in_file)
+ GalaxyStager.replace_references(in_file, @staged_path, @original_path)
+ end
+
+ def self.replace_references(in_file, from_path, to_path)
+ cmd="ruby -pi -e \"gsub('#{from_path}', '#{to_path}')\" #{in_file}"
+ %x[#{cmd}]
+ end
+
+ end
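GalaxyStager symlinks a Galaxy-managed input into the working directory under a friendlier name and can later rewrite references to the staged path back to the original. A minimal sketch of that flow, with illustrative paths and extensions:

    require 'protk/galaxy_stager'

    # Stage a Galaxy .dat file so downstream tools see a .mzML extension
    stager = GalaxyStager.new('/galaxy/files/dataset_001.dat', :name => 'input', :extension => '.mzML')
    puts stager.staged_path          # e.g. "<cwd>/input.mzML"

    # ... run a tool that writes the staged path into its output file ...

    # Rewrite occurrences of the staged path back to the original Galaxy path
    stager.restore_references('results.pepXML')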
@@ -0,0 +1,9 @@
+ class GalaxyUtil
+
+ def self.for_galaxy
+ for_galaxy = ARGV[0] == "--galaxy"
+ ARGV.shift if for_galaxy
+ return for_galaxy
+ end
+
+ end
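GalaxyUtil.for_galaxy simply checks whether the first command-line argument is --galaxy and consumes it if so. A wrapper script would typically call it before parsing its remaining options (illustrative only):

    require 'protk/galaxy_util'

    # Removes a leading "--galaxy" from ARGV and returns true when it was present
    for_galaxy = GalaxyUtil.for_galaxy
    # ... normal option parsing continues here; for_galaxy toggles Galaxy-specific staging ...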
@@ -0,0 +1,484 @@
+ require 'protk/constants'
+ require 'protk/randomize'
+ require 'uri'
+ require 'digest/md5'
+ require 'net/ftp'
+ require 'net/ftp/list'
+ require 'bio'
+ require 'tempfile'
+ require 'pp'
+ require 'set'
+
+ dbname=ARGV[0]
+
+ # Load database spec file
+ #
+ $genv=Constants.new()
+ dbdir="#{$genv.protein_database_root}/#{dbname}"
+
+ dbspec_file="#{dbdir}/.protkdb.yaml"
+ dbspec=YAML.load_file "#{dbspec_file}"
+
+ format = dbspec[:format]!=nil ? dbspec[:format] : "fasta"
+
+ # Output database filename
+ #
+ db_filename="#{dbdir}/current.#{format}"
+
+ #####################
+ # Utility Functions #
+ #####################
+
+
+ def check_ftp_release_notes(release_notes)
+ rn_uri = URI.parse(release_notes)
+
+ rn_path="#{$genv.database_downloads}/#{rn_uri.host}/#{rn_uri.path}"
+
+
+ host=rn_uri.host
+ Net::FTP.open(host) do |ftp|
+
+ ftp.login
+ rn_dir=Pathname.new(rn_uri.path).dirname.to_s
+ rn_file=Pathname.new(rn_uri.path).basename.to_s
+ ftp.chdir(rn_dir)
+
+ ftp.passive=true
+
+
+ p "Checking release notes"
+
+ # Is the last path component a wildcard expression (we only allow *)
+ # If so we need to find the file with the most recent modification time
+ #
+ if ( rn_file =~ /\*/)
+ entries=ftp.list(rn_file)
+ p entries
+ latest_file=nil
+ latest_file_mtime=nil
+ entries.each do |dir_entry|
+ info=Net::FTP::List.parse(dir_entry)
+ if ( info.file? )
+ # Keep track of the entry with the most recent modification time
+ if ( latest_file_mtime==nil || info.mtime > latest_file_mtime )
+ latest_file_mtime=info.mtime
+ latest_file=info.basename
+ end
+
+ end
+ end
+
+ throw "No release notes found" if ( latest_file ==nil)
+
+ rn_file=latest_file
+
+ # Adjust the rn_path to be the path of the latest file
+ #
+ rn_path="#{Pathname.new(rn_path).dirname}/#{latest_file}"
+
+ end
+
+ # Hash existing release notes data if it exists
+ #
+ existing_digest=nil
+ existing_digest=Digest::MD5.hexdigest(File.read(rn_path)) if Pathname.new(rn_path).exist?
+
+
+
+ rn_data=""
+ dl_file=Tempfile.new("rn_file")
+
+ ftp.getbinaryfile(rn_file,dl_file.path) { |data| rn_data << data }
+
+ rn_digest=Digest::MD5.hexdigest(rn_data)
+
+ p "Done Downloading release notes #{ftp} #{rn_file} to #{dl_file.path} #{ftp.pwd}"
+
+ throw "No release notes data at #{release_notes}" unless rn_digest!=nil
+
+ # Update release notes data
+ case
+ when ( existing_digest != rn_digest )
+ FileUtils.mkpath(Pathname.new(rn_path).dirname.to_s)
+ File.open(rn_path, "w") {|file| file.puts(rn_data) }
+ else
+ p "Release notes are up to date"
+ end
+ end
+ end
+
+ def download_ftp_file(ftp,file_name,dest_dir)
+ dest_path="#{dest_dir}/#{file_name}"
+
+ download_size=ftp.size(file_name)
+ mod_time=ftp.mtime(file_name,true)
+
+
+
+ percent_size=download_size/100
+ i=1
+ pc_complete=0
+ last_time=Time.new
+ p "Downloading #{file_name}"
+ ftp.passive=true
+
+ ftp.getbinaryfile(file_name,dest_path,1024) { |data|
+
+ progress=i*1024
+ if ( pc_complete < progress.divmod(percent_size)[0] && ( Time.new - last_time) > 10 )
+ pc_complete=progress.divmod(percent_size)[0]
+ p "Downloading #{file_name} #{pc_complete} percent complete"
+ last_time=Time.new
+ end
+ i=i+1
+ }
+
+ end
+
+ def download_ftp_source(source)
+
+ data_uri = URI.parse(source)
+
+ data_path="#{$genv.database_downloads}/#{data_uri.host}/#{data_uri.path}"
+ # Make sure our destination dir is available
+ #
+ FileUtils.mkpath(Pathname.new(data_path).dirname.to_s)
+
+
+
+ Net::FTP.open(data_uri.host) do |ftp|
+ p "Connected to #{data_uri.host}"
+ ftp.login
+
+ ftp.chdir(Pathname.new(data_uri.path).dirname.to_s)
+
+ last_path_component=Pathname.new(data_uri.path).basename.to_s
+
+ case
+ when last_path_component=~/\*/ # A wildcard match. Need to download them all
+ p "Getting directory listing for #{last_path_component}"
+ ftp.passive=true
+ matching_items=ftp.list(last_path_component)
+
+ PP.pp(matching_items)
+
+ matching_items.each do |dir_entry|
+ info=Net::FTP::List.parse(dir_entry)
+ download_ftp_file(ftp,info.basename,Pathname.new(data_path).dirname)
+ end
+
+ else # Just one file to download
+ download_ftp_file(ftp,last_path_component,Pathname.new(data_path).dirname)
+ end
+
+ end
+
+ end
+
+
+ def archive_fasta_file(filename)
+ if ( Pathname.new(filename).exist? )
+ mt=File.new(filename).mtime
+ timestamp="#{mt.year}_#{mt.month}_#{mt.day}"
+ archive_filename="#{filename.gsub(/.fasta$/,'')}_#{timestamp}.fasta"
+ p "Moving old database to #{archive_filename}"
+ FileUtils.mv(filename,archive_filename)
+ end
+ end
+
+ #####################
+ # Source Files #
+ #####################
+
+ def file_source(raw_source)
+ full_path=raw_source
+ full_path = "#{$genv.protein_database_root}/#{raw_source}" unless ( raw_source =~ /^\//) # relative paths should be relative to the databases dir
+ throw "File source #{full_path} does not exist" unless Pathname.new(full_path).exist?
+ full_path
+ end
+
+ def db_source(db_source)
+ current_release_path = "#{$genv.protein_database_root}/#{db_source}/current.fasta"
+ throw "Database source #{current_release_path} does not exist" unless Pathname.new(current_release_path).exist?
+ current_release_path
+ end
+
+
+ def ftp_source(ftpsource)
+
+
+ data_uri=URI.parse(ftpsource[0])
+ data_file_path="#{$genv.database_downloads}/#{data_uri.host}/#{data_uri.path}"
+ unpacked_data_path=data_file_path.gsub(/\.gz$/,'')
+
+ release_notes_url=ftpsource[1]
+ release_notes_exist=true
+ release_notes_exist=false if release_notes_url =~ /^\s*none\s*$/
+ if release_notes_exist
+ data_rn=URI.parse(release_notes_url)
+ release_notes_file_path="#{$genv.database_downloads}/#{data_rn.host}/#{data_rn.path}"
+
+ task :check_rn do
+ check_ftp_release_notes(release_notes_url)
+ end
+
+ file release_notes_file_path => :check_rn
+ else
+ task :check_date do
+
+ end
+ end
+
+
+
+ if ( data_file_path=~/\*/) # A wildcard
+ unpacked_data_path=data_file_path.gsub(/\*/,"_all_").gsub(/\.gz$/,'')
+ end
+
+ file unpacked_data_path do #Unpacking. Includes unzipping and/or concatenating
+ download_ftp_source(ftpsource[0])
+
+ case
+ when data_file_path=~/\*/ # Multiple files to unzip/concatenate and we don't know what they are yet
+ file_pattern = Pathname.new(data_file_path).basename.to_s
+ if file_pattern =~ /.gz$/
+ unzipcmd="gunzip -vdf #{file_pattern}"
+ p "Unzipping #{unzipcmd} ... this could take a while"
+ sh %{ cd #{Pathname.new(data_file_path).dirname}; #{unzipcmd} }
+ end
+
+ file_pattern.gsub!(/\.gz$/,'')
+ catcmd="cat #{file_pattern} > #{unpacked_data_path}"
+
+ p "Concatenating files #{catcmd} ... this could take a while"
+ sh %{ cd #{Pathname.new(data_file_path).dirname}; #{catcmd} }
+
+ else # Simple case. A single file
+ if data_file_path =~ /\.gz$/
+ p "Unzipping #{Pathname.new(data_file_path).basename} ... "
+ sh %{ cd #{Pathname.new(data_file_path).dirname}; gunzip -f #{Pathname.new(data_file_path).basename} }
+ end
+ end
+ end
+
+ task unpacked_data_path => release_notes_file_path if release_notes_exist
+
+ unpacked_data_path
+ end
+
+ source_files=dbspec[:sources].collect do |raw_source|
+ sf=""
+ case
+ when raw_source.class==Array
+ sf=ftp_source(raw_source)
+ when (raw_source =~ /\.fasta$/ || raw_source =~ /\.txt$/ || raw_source =~ /\.dat$/ )
+ sf=file_source(raw_source)
+ else
+ sf=db_source(raw_source)
+ end
+ sf
+ end
+
+ ########################
+ # Concat Filter Copy #
+ ########################
+
+ raw_db_filename = "#{dbdir}/raw.#{format}"
+
+ file raw_db_filename => [source_files,dbspec_file].flatten do
+
+ source_filters=dbspec[:include_filters]
+
+ if ( format == "fasta" && source_filters.length > 0 ) # We can perform concat and filter for fasta only
+
+ archive_fasta_file(raw_db_filename) if dbspec[:archive_old]
+
+ output_fh=File.open(raw_db_filename, "w")
+
+ id_regexes=dbspec[:id_regexes]
+ source_i=0
+ throw "The number of source files #{source_files.length} should equal the number of source filters #{source_filters.length}" unless source_filters.length == source_files.length
+ throw "The number of source files #{source_files.length} should equal the number of id regexes #{id_regexes.length}" unless source_filters.length == id_regexes.length
+
+ added_ids=Set.new
+
+ source_files.each do |source|
+ # Open source as Fasta
+ #
+ Bio::FlatFile.open(Bio::FastaFormat, source) do |ff|
+ p "Reading source file #{source}"
+
+ n_match=0
+
+ filters=source_filters[source_i] #An array of filters for this input file
+ id_regex=/#{id_regexes[source_i]}/
+
+ ff.each do |entry|
+ filters.each do |filter|
+ if ( entry.definition =~ /#{filter}/)
+ n_match=n_match+1
+ idmatch=id_regex.match(entry.definition)
+ case
+ when idmatch==nil || idmatch[1]==nil
+ p "No match to id regex #{id_regex} for #{entry.definition}. Skipping this entry"
+ else
+ new_def="#{idmatch[1]}"
+ if ( added_ids.include?(new_def) )
+ p "Warning: Skipping duplicate definition for #{new_def}"
+ else
+ entry.definition=new_def
+ output_fh.puts(entry.to_s)
+ added_ids.add new_def
+ end
+ # p entry.definition.to_s
+ end
+ break
+ end
+ end
+ end
+ p "Warning no match to any filter in #{filters} for source file #{source}" unless n_match > 0
+ end
+ source_i=source_i+1
+ end
+ output_fh.close
+ else # Other formats just copy a file across ... must be a single source
+
+ throw "Only a single source file is permitted for formats other than fasta" unless source_files.length == 1
+
+ sh "cp #{source_files[0]} #{raw_db_filename}" do |ok,res|
+ if ! ok
+ puts "Unable to copy #{source_files[0]} to #{raw_db_filename}"
+ end
+ end
+
+ end
+ end
+
+ #####################
+ # Decoys #
+ #####################
+
+ decoy_db_filename = "#{dbdir}/with_decoys.fasta"
+ file decoy_db_filename => raw_db_filename do
+
+ archive_fasta_file(decoy_db_filename) if dbspec[:archive_old]
+
+
+ decoys_filename = "#{dbdir}/decoys_only.fasta"
+ decoy_prefix=dbspec[:decoy_prefix]
+
+ # Count entries in the raw input file
+ #
+ ff=Bio::FlatFile.open(Bio::FastaFormat, raw_db_filename)
+ db_length=0
+ ff.each do |entry|
+ db_length=db_length+1
+ end
+
+ p "Generating decoy sequences ... this could take a while"
+ # Make decoys, concatenate and delete decoy only file
+ Randomize.make_decoys raw_db_filename, db_length, decoys_filename, decoy_prefix
+ cmd="cat #{raw_db_filename} #{decoys_filename} >> #{decoy_db_filename}; rm #{decoys_filename}"
+ sh %{ #{cmd} }
+ end
+
+ # Adjust dependencies depending on whether we're making decoys
+ #
+ case dbspec[:decoys]
+ when true
+ throw "Decoys are only supported for fasta formatted databases" unless format=="fasta"
+ file db_filename => decoy_db_filename
+ else
+ file db_filename => raw_db_filename
+ end
+
+
+ ###################
+ # Symlink Current #
+ ###################
+
+
+ # Current database file should symlink to raw or decoy
+ #
+ file db_filename do
+ if ( dbspec[:is_annotation_db])
+ db_filename=raw_db_filename # For annotation databases we don't use symlinks at all
+ else
+ # if we are an annotation db we can't symlink so do nothing
+
+ # source db filename is either decoy or raw
+ #
+ case dbspec[:decoys]
+ when true
+ source_db_filename = decoy_db_filename
+ when false
+ source_db_filename = raw_db_filename
+ end
+
+ p "Current db links to #{source_db_filename}"
+
+ # Symlink to the source file
+ #
+ File.symlink(source_db_filename,db_filename)
+ end
+ end
+
+
+
+ ###################
+ # Indexing #
+ ###################
+ if dbspec[:make_blast_index]
+ blast_index_files=FileList.new([".phr"].collect {|ext| "#{db_filename}#{ext}" })
+ # task :make_blast_index => blast_index_files do
+ blast_index_files.each do |indfile|
+ file indfile => db_filename do
+ cmd="cd #{dbdir}; #{$genv.makeblastdb} -in #{db_filename} -parse_seqids -dbtype prot"
+ p "Creating blast index"
+ sh %{ #{cmd} }
+ end
+ end
+
+ task dbname => blast_index_files
+
+ end
+
+
+ if dbspec[:make_msgf_index]
+ msgf_index_files=FileList.new([".canno"].collect {|ext| "#{db_filename}#{ext}" })
+ # task :make_blast_index => blast_index_files do
+ msgf_index_files.each do |indfile|
+ file indfile => db_filename do
+ cmd="cd #{dbdir}; java -Xmx3500M -cp #{$genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{db_filename} -tda 0"
+ p "Creating msgf index"
+ sh %{ #{cmd} }
+ end
+ end
+
+ task dbname => msgf_index_files
+ end
+
+ if format=="dat" && dbspec[:is_annotation_db]
+ dat_index_files=FileList.new(["config.dat","id_AC.index","key_ID.key"].collect {|file| "#{dbdir}/#{file}"} )
+
+ dat_index_files.each do |indexfile|
+ file indexfile => db_filename do
+ puts "Indexing annotation database"
+ dbclass=Bio::SPTR
+ parser = Bio::FlatFileIndex::Indexer::Parser.new(dbclass, nil, nil)
+ Bio::FlatFileIndex::Indexer::makeindexFlat(dbdir, parser, {}, db_filename)
+ end
+ end
+
+ task dbname => dat_index_files
+
+ end
+
+ #################
+ # Root task #
+ #################
+
+ task dbname => db_filename
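Everything in this rakefile is driven by the .protkdb.yaml spec loaded into dbspec at the top. Collecting the keys referenced above, a spec equivalent to the following Ruby hash would exercise the fasta path with decoys and a blast index. The values are illustrative only; the specs shipped with the gem (e.g. predefined_db.sphuman.yaml) are the authoritative examples:

    dbspec = {
      :format => "fasta",                         # defaults to "fasta" when omitted
      :sources => [                               # an Array entry is an ftp source: [data_url, release_notes_url]
        ["ftp://example.org/pub/current.fasta.gz", "ftp://example.org/pub/reldate.txt"]
      ],                                          # .fasta/.txt/.dat strings are local files; anything else names another protk db
      :include_filters => [["OS=Homo sapiens"]],  # one array of definition-line filters per source
      :id_regexes => ["sp\\|(.*?)\\|"],           # one id-capturing regex per source
      :archive_old => true,
      :decoys => true,
      :decoy_prefix => "decoy_",
      :make_blast_index => true,
      :make_msgf_index => false,
      :is_annotation_db => false
    }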