protk 1.1.0.pre
- data/README.md +85 -0
- data/bin/annotate_ids.rb +59 -0
- data/bin/big_search.rb +41 -0
- data/bin/correct_omssa_retention_times.rb +27 -0
- data/bin/feature_finder.rb +76 -0
- data/bin/file_convert.rb +157 -0
- data/bin/generate_omssa_loc.rb +42 -0
- data/bin/interprophet.rb +91 -0
- data/bin/make_decoy.rb +64 -0
- data/bin/manage_db.rb +123 -0
- data/bin/mascot_search.rb +187 -0
- data/bin/mascot_to_pepxml.rb +44 -0
- data/bin/msgfplus_search.rb +191 -0
- data/bin/omssa_search.rb +205 -0
- data/bin/peptide_prophet.rb +245 -0
- data/bin/pepxml_to_table.rb +78 -0
- data/bin/protein_prophet.rb +140 -0
- data/bin/protk_setup.rb +31 -0
- data/bin/repair_run_summary.rb +113 -0
- data/bin/tandem_search.rb +292 -0
- data/bin/template_search.rb +144 -0
- data/bin/unimod_to_loc.rb +118 -0
- data/bin/xls_to_table.rb +46 -0
- data/ext/protk/extconf.rb +3 -0
- data/ext/protk/protk.c +235 -0
- data/lib/protk/big_search_rakefile.rake +16 -0
- data/lib/protk/big_search_tool.rb +23 -0
- data/lib/protk/bio_sptr_extensions.rb +210 -0
- data/lib/protk/biotools_excel_converter.rb +60 -0
- data/lib/protk/command_runner.rb +84 -0
- data/lib/protk/constants.rb +296 -0
- data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
- data/lib/protk/data/apt-get_packages.yaml +47 -0
- data/lib/protk/data/brew_packages.yaml +10 -0
- data/lib/protk/data/default_config.yml +20 -0
- data/lib/protk/data/predefined_db.crap.yaml +19 -0
- data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
- data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
- data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
- data/lib/protk/data/tandem_params.xml +56 -0
- data/lib/protk/data/taxonomy_template.xml +9 -0
- data/lib/protk/data/unimod.xml +16780 -0
- data/lib/protk/eupathdb_gene_information_table.rb +158 -0
- data/lib/protk/galaxy_stager.rb +24 -0
- data/lib/protk/galaxy_util.rb +9 -0
- data/lib/protk/manage_db_rakefile.rake +484 -0
- data/lib/protk/manage_db_tool.rb +181 -0
- data/lib/protk/mascot_util.rb +63 -0
- data/lib/protk/omssa_util.rb +57 -0
- data/lib/protk/plasmodb.rb +50 -0
- data/lib/protk/prophet_tool.rb +85 -0
- data/lib/protk/protein_annotator.rb +646 -0
- data/lib/protk/protxml.rb +137 -0
- data/lib/protk/randomize.rb +7 -0
- data/lib/protk/search_tool.rb +182 -0
- data/lib/protk/setup_rakefile.rake +245 -0
- data/lib/protk/setup_tool.rb +19 -0
- data/lib/protk/spreadsheet_extensions.rb +78 -0
- data/lib/protk/swissprot_database.rb +38 -0
- data/lib/protk/tool.rb +182 -0
- data/lib/protk/xtandem_defaults.rb +11 -0
- data/lib/protk.rb +18 -0
- metadata +256 -0
data/lib/protk/eupathdb_gene_information_table.rb
@@ -0,0 +1,158 @@

# Code for interacting with EuPathDB gene information files e.g. http://cryptodb.org/common/downloads/release-4.3/Cmuris/txt/CmurisGene_CryptoDB-4.3.txt
# These gene information files contain a large amount of information about individual genes/proteins in EuPathDBs.

require 'tempfile'

# A class for extracting the info for a particular gene from the information file
class EuPathDBGeneInformationFileExtractor
  # A filename path to the gene information file
  attr_accessor :filename

  def initialize(filename = nil)
    @filename = filename
  end

  # Returns a EuPathDBGeneInformation object corresponding to the wanted gene id. If
  # there are multiple in the file, only the first is returned. If none are found, nil is returned.
  #
  # If grep_hack_lines is defined (as an integer), a shortcut is applied to speed things up:
  # before parsing the gene info file, grep that many lines after the "Gene Id: .." line,
  # then feed the result into the parser.
  def extract_gene_info(wanted_gene_id, grep_hack_lines = nil)
    inside_iterator = lambda do |gene|
      return gene if wanted_gene_id == gene.info['Gene Id']
    end

    if grep_hack_lines and grep_hack_lines.to_i != 0
      raise Exception, "grep_hack_lines should be an integer" unless grep_hack_lines.is_a?(Integer)
      # grep however many lines past the matching point. Rather dodgy, but faster.
      tempfile = Tempfile.new('reubypathdb_grep_hack')
      `grep -A #{grep_hack_lines} 'Gene Id: #{wanted_gene_id}' '#{@filename}' >#{tempfile.path}`
      EuPathDBGeneInformationTable.new(File.open(tempfile.path)).each do |gene|
        return inside_iterator.call(gene)
      end
    else
      # No grep hack. Parse the whole gene information file.
      EuPathDBGeneInformationTable.new(File.open(@filename)).each do |gene|
        return inside_iterator.call(gene)
      end
    end
    return nil
  end
end

# A class for parsing the 'gene information table' files from EuPathDB, such
# as http://cryptodb.org/common/downloads/release-4.3/Cmuris/txt/CmurisGene_CryptoDB-4.3.txt
#
# The usual way of interacting with these is the each method,
# which yields a EuPathDBGeneInformation object with all of the recorded
# information in it.
class EuPathDBGeneInformationTable
  include Enumerable

  # Initialise using an IO object, say File.open('/path/to/CmurisGene_CryptoDB-4.3.txt').
  # After opening, the #each method can be used to iterate over the genes present in the file.
  def initialize(io)
    @io = io
  end

  # Yield a EuPathDBGeneInformation object with the
  # contained info in it, one at a time
  def each
    while g = next_gene
      yield g
    end
  end

  # Returns a EuPathDBGeneInformation object with all the data you could
  # possibly want.
  def next_gene
    info = EuPathDBGeneInformation.new

    # first, read the info fields, which should start with the ID column
    return nil if @io.eof?
    line = @io.readline.strip
    while line == ''
      return nil if @io.eof?
      line = @io.readline.strip
    end

    while line != ''
      if matches = line.match(/^(.*?)\: (.*)$/)
        info.add_information(matches[1], matches[2])
      else
        raise Exception, "EuPathDBGeneInformationTable couldn't parse this line: #{line}"
      end

      line = @io.readline.strip
    end

    # now read each of the tables, which should start with the
    # 'TABLE: <name>' entry
    line = @io.readline.strip
    table_name = nil
    headers = nil
    data = []
    while line != '------------------------------------------------------------'
      if line == ''
        # add it to the stack unless we are just starting out
        info.add_table(table_name, headers, data) unless table_name.nil?

        # reset things
        table_name = nil
        headers = nil
        data = []
      elsif matches = line.match(/^TABLE\: (.*)$/)
        # name of a table
        table_name = matches[1]
      elsif line.match(/^\[.*\]/)
        # headings of the table
        headers = line.split("\t").collect do |header|
          header.gsub(/^\[/, '').gsub(/\]$/, '')
        end
      else
        # a proper data row
        data.push line.split("\t")
      end
      line = @io.readline.strip
    end

    # return the object that has been created
    return info
  end
end

# Each gene in the gene information table is represented
# by 2 types of information - info and tables.
# info entries are one-line data, whereas tables are tables of
# data with possibly multiple rows.
class EuPathDBGeneInformation
  def info
    @info
  end

  def get_info(key)
    @info[key]
  end
  alias_method :[], :get_info

  def get_table(table_name)
    @tables[table_name]
  end

  def add_information(key, value)
    @info ||= {}
    @info[key] = value
    "Added info #{key}, now is #{@info[key]}"
  end

  def add_table(name, headers, data)
    @tables ||= {}
    @tables[name] = []
    data.each do |row|
      final = {}
      row.each_with_index do |cell, i|
        final[headers[i]] = cell
      end
      @tables[name].push final
    end
  end
end
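The gem ships no usage example for these classes, so here is a minimal sketch; the file name follows the URL in the comments above, while the gene id ('cgd3_2180') and table name ('GO Terms') are hypothetical placeholders:

require 'protk/eupathdb_gene_information_table'

# Pull a single gene out of a downloaded gene information file.
extractor = EuPathDBGeneInformationFileExtractor.new('CmurisGene_CryptoDB-4.3.txt')

# Passing an integer as the second argument enables the grep shortcut described above.
gene = extractor.extract_gene_info('cgd3_2180', 500)  # hypothetical gene id

unless gene.nil?
  puts gene['Gene Id']                     # one-line fields via get_info / []
  puts gene.get_table('GO Terms').inspect  # multi-row tables ('GO Terms' is an assumed table name)
end

# Or stream every gene in the file:
EuPathDBGeneInformationTable.new(File.open('CmurisGene_CryptoDB-4.3.txt')).each do |g|
  puts g['Gene Id']
end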
data/lib/protk/galaxy_stager.rb
@@ -0,0 +1,24 @@

require 'pathname'

class GalaxyStager
  attr_accessor :staged_path

  def initialize(original_path, options = {})
    options = { :name => nil, :extension => '' }.merge(options)
    @original_path = Pathname.new(original_path)
    @wd = Dir.pwd
    staged_name = options[:name] || @original_path.basename
    @staged_path = File.join(@wd, "#{staged_name}#{options[:extension]}")
    File.symlink(@original_path, @staged_path)
  end

  def restore_references(in_file)
    GalaxyStager.replace_references(in_file, @staged_path, @original_path)
  end

  def self.replace_references(in_file, from_path, to_path)
    cmd = "ruby -pi -e \"gsub('#{from_path}', '#{to_path}')\" #{in_file}"
    %x[#{cmd}]
  end
end
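A minimal usage sketch for GalaxyStager (the dataset path, staged name, extension, and output file below are illustrative assumptions, not values from the gem):

require 'protk/galaxy_stager'

# Symlink a Galaxy dataset into the working directory under a tool-friendly name.
stager = GalaxyStager.new('/galaxy/database/files/dataset_001.dat',
                          :name => 'input', :extension => '.mzML')

# ... run a tool against stager.staged_path, producing results.pep.xml ...

# Rewrite references to the staged symlink back to the original path.
stager.restore_references('results.pep.xml')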
data/lib/protk/manage_db_rakefile.rake
@@ -0,0 +1,484 @@

require 'protk/constants'
require 'protk/randomize'
require 'yaml'
require 'pathname'
require 'uri'
require 'digest/md5'
require 'net/ftp'
require 'net/ftp/list'
require 'bio'
require 'tempfile'
require 'pp'
require 'set'

dbname = ARGV[0]

# Load database spec file
#
$genv = Constants.new()
dbdir = "#{$genv.protein_database_root}/#{dbname}"

dbspec_file = "#{dbdir}/.protkdb.yaml"
dbspec = YAML.load_file "#{dbspec_file}"

format = dbspec[:format] != nil ? dbspec[:format] : "fasta"

# Output database filename
#
db_filename = "#{dbdir}/current.#{format}"

#####################
# Utility Functions #
#####################

def check_ftp_release_notes(release_notes)
  rn_uri = URI.parse(release_notes)
  rn_path = "#{$genv.database_downloads}/#{rn_uri.host}/#{rn_uri.path}"

  host = rn_uri.host
  Net::FTP.open(host) do |ftp|
    ftp.login
    rn_dir = Pathname.new(rn_uri.path).dirname.to_s
    rn_file = Pathname.new(rn_uri.path).basename.to_s
    ftp.chdir(rn_dir)
    ftp.passive = true

    p "Checking release notes"

    # Is the last path component a wildcard expression (we only allow *)?
    # If so we need to find the file with the most recent modification time.
    #
    if rn_file =~ /\*/
      entries = ftp.list(rn_file)
      p entries
      latest_file = nil
      latest_file_mtime = nil
      entries.each do |dir_entry|
        info = Net::FTP::List.parse(dir_entry)
        # Keep this entry if it is the first file seen, or if its mtime is later
        if info.file? && (latest_file_mtime.nil? || info.mtime > latest_file_mtime)
          latest_file_mtime = info.mtime
          latest_file = info.basename
        end
      end

      throw "No release notes found" if latest_file.nil?

      rn_file = latest_file

      # Adjust the rn_path to be the path of the latest file
      #
      rn_path = "#{Pathname.new(rn_path).dirname}/#{latest_file}"
    end

    # Hash existing release notes data if it exists
    #
    existing_digest = nil
    existing_digest = Digest::MD5.hexdigest(File.read(rn_path)) if Pathname.new(rn_path).exist?

    rn_data = ""
    dl_file = Tempfile.new("rn_file")

    ftp.getbinaryfile(rn_file, dl_file.path) { |data| rn_data << data }

    rn_digest = Digest::MD5.hexdigest(rn_data)

    p "Done downloading release notes #{ftp} #{rn_file} to #{dl_file.path} #{ftp.pwd}"

    throw "No release notes data at #{release_notes}" unless rn_digest != nil

    # Update release notes data if it has changed
    if existing_digest != rn_digest
      FileUtils.mkpath(Pathname.new(rn_path).dirname.to_s)
      File.open(rn_path, "w") { |file| file.puts(rn_data) }
    else
      p "Release notes are up to date"
    end
  end
end

def download_ftp_file(ftp, file_name, dest_dir)
  dest_path = "#{dest_dir}/#{file_name}"

  download_size = ftp.size(file_name)
  mod_time = ftp.mtime(file_name, true)

  percent_size = download_size / 100
  i = 1
  pc_complete = 0
  last_time = Time.new
  p "Downloading #{file_name}"
  ftp.passive = true

  ftp.getbinaryfile(file_name, dest_path, 1024) { |data|
    progress = i * 1024
    # Report progress at most once every 10 seconds
    if pc_complete < progress.divmod(percent_size)[0] && (Time.new - last_time) > 10
      pc_complete = progress.divmod(percent_size)[0]
      p "Downloading #{file_name} #{pc_complete} percent complete"
      last_time = Time.new
    end
    i = i + 1
  }
end

def download_ftp_source(source)
  data_uri = URI.parse(source)
  data_path = "#{$genv.database_downloads}/#{data_uri.host}/#{data_uri.path}"

  # Make sure our destination dir is available
  #
  FileUtils.mkpath(Pathname.new(data_path).dirname.to_s)

  Net::FTP.open(data_uri.host) do |ftp|
    p "Connected to #{data_uri.host}"
    ftp.login

    ftp.chdir(Pathname.new(data_uri.path).dirname.to_s)

    last_path_component = Pathname.new(data_uri.path).basename.to_s

    if last_path_component =~ /\*/ # A wildcard match. Need to download them all.
      p "Getting directory listing for #{last_path_component}"
      ftp.passive = true
      matching_items = ftp.list(last_path_component)

      PP.pp(matching_items)

      matching_items.each do |dir_entry|
        info = Net::FTP::List.parse(dir_entry)
        download_ftp_file(ftp, info.basename, Pathname.new(data_path).dirname)
      end
    else # Just one file to download
      download_ftp_file(ftp, last_path_component, Pathname.new(data_path).dirname)
    end
  end
end

def archive_fasta_file(filename)
  if Pathname.new(filename).exist?
    mt = File.new(filename).mtime
    timestamp = "#{mt.year}_#{mt.month}_#{mt.day}"
    archive_filename = "#{filename.gsub(/\.fasta$/, '')}_#{timestamp}.fasta"
    p "Moving old database to #{archive_filename}"
    FileUtils.mv(filename, archive_filename)
  end
end

#####################
# Source Files      #
#####################

def file_source(raw_source)
  full_path = raw_source
  full_path = "#{$genv.protein_database_root}/#{raw_source}" unless raw_source =~ /^\// # relative paths should be relative to the databases dir
  throw "File source #{full_path} does not exist" unless Pathname.new(full_path).exist?
  full_path
end

def db_source(db_source)
  current_release_path = "#{$genv.protein_database_root}/#{db_source}/current.fasta"
  throw "Database source #{current_release_path} does not exist" unless Pathname.new(current_release_path).exist?
  current_release_path
end

def ftp_source(ftpsource)
  data_uri = URI.parse(ftpsource[0])
  data_file_path = "#{$genv.database_downloads}/#{data_uri.host}/#{data_uri.path}"
  unpacked_data_path = data_file_path.gsub(/\.gz$/, '')

  release_notes_url = ftpsource[1]
  release_notes_exist = true
  release_notes_exist = false if release_notes_url =~ /^\s*none\s*$/
  if release_notes_exist
    data_rn = URI.parse(release_notes_url)
    release_notes_file_path = "#{$genv.database_downloads}/#{data_rn.host}/#{data_rn.path}"

    task :check_rn do
      check_ftp_release_notes(release_notes_url)
    end

    file release_notes_file_path => :check_rn
  else
    task :check_date do
      # No release notes to check
    end
  end

  if data_file_path =~ /\*/ # A wildcard
    unpacked_data_path = data_file_path.gsub(/\*/, "_all_").gsub(/\.gz$/, '')
  end

  file unpacked_data_path do # Unpacking. Includes unzipping and/or concatenating.
    download_ftp_source(ftpsource[0])

    if data_file_path =~ /\*/ # Multiple files to unzip/concatenate and we don't know what they are yet
      file_pattern = Pathname.new(data_file_path).basename.to_s
      if file_pattern =~ /\.gz$/
        unzipcmd = "gunzip -vdf #{file_pattern}"
        p "Unzipping #{unzipcmd} ... this could take a while"
        sh %{ cd #{Pathname.new(data_file_path).dirname}; #{unzipcmd} }
      end

      file_pattern.gsub!(/\.gz$/, '')
      catcmd = "cat #{file_pattern} > #{unpacked_data_path}"

      p "Concatenating files #{catcmd} ... this could take a while"
      sh %{ cd #{Pathname.new(data_file_path).dirname}; #{catcmd} }
    else # Simple case. A single file.
      if data_file_path =~ /\.gz$/
        p "Unzipping #{Pathname.new(data_file_path).basename} ... "
        sh %{ cd #{Pathname.new(data_file_path).dirname}; gunzip -f #{Pathname.new(data_file_path).basename} }
      end
    end
  end

  # Re-unpack whenever the release notes change
  task unpacked_data_path => release_notes_file_path if release_notes_exist

  unpacked_data_path
end

source_files = dbspec[:sources].collect do |raw_source|
  case
  when raw_source.class == Array
    ftp_source(raw_source)
  when (raw_source =~ /\.fasta$/ || raw_source =~ /\.txt$/ || raw_source =~ /\.dat$/)
    file_source(raw_source)
  else
    db_source(raw_source)
  end
end

########################
# Concat Filter Copy   #
########################

raw_db_filename = "#{dbdir}/raw.#{format}"

file raw_db_filename => [source_files, dbspec_file].flatten do
  source_filters = dbspec[:include_filters]

  if format == "fasta" && source_filters.length > 0 # We can perform concat and filter for fasta only

    archive_fasta_file(raw_db_filename) if dbspec[:archive_old]

    output_fh = File.open(raw_db_filename, "w")

    id_regexes = dbspec[:id_regexes]
    source_i = 0
    throw "The number of source files #{source_files.length} should equal the number of source filters #{source_filters.length}" unless source_filters.length == source_files.length
    throw "The number of source files #{source_files.length} should equal the number of id regexes #{id_regexes.length}" unless source_filters.length == id_regexes.length

    added_ids = Set.new

    source_files.each do |source|
      # Open source as Fasta
      #
      Bio::FlatFile.open(Bio::FastaFormat, source) do |ff|
        p "Reading source file #{source}"

        n_match = 0

        filters = source_filters[source_i] # An array of filters for this input file
        id_regex = /#{id_regexes[source_i]}/

        ff.each do |entry|
          filters.each do |filter|
            if entry.definition =~ /#{filter}/
              n_match = n_match + 1
              idmatch = id_regex.match(entry.definition)
              if idmatch == nil || idmatch[1] == nil
                p "No match to id regex #{id_regex} for #{entry.definition}. Skipping this entry"
              else
                new_def = "#{idmatch[1]}"
                if added_ids.include?(new_def)
                  p "Warning: Skipping duplicate definition for #{new_def}"
                else
                  entry.definition = new_def
                  output_fh.puts(entry.to_s)
                  added_ids.add new_def
                end
              end
              break
            end
          end
        end
        p "Warning no match to any filter in #{filters} for source file #{source}" unless n_match > 0
      end
      source_i = source_i + 1
    end
    output_fh.close
  else # Other formats just copy a file across ... must be a single source

    throw "Only a single source file is permitted for formats other than fasta" unless source_files.length == 1

    sh "cp #{source_files[0]} #{raw_db_filename}" do |ok, res|
      if !ok
        puts "Unable to copy #{source_files[0]} to #{raw_db_filename}"
      end
    end
  end
end

#####################
# Decoys            #
#####################

decoy_db_filename = "#{dbdir}/with_decoys.fasta"
file decoy_db_filename => raw_db_filename do
  archive_fasta_file(decoy_db_filename) if dbspec[:archive_old]

  decoys_filename = "#{dbdir}/decoys_only.fasta"
  decoy_prefix = dbspec[:decoy_prefix]

  # Count entries in the raw input file
  #
  ff = Bio::FlatFile.open(Bio::FastaFormat, raw_db_filename)
  db_length = 0
  ff.each do |entry|
    db_length = db_length + 1
  end

  p "Generating decoy sequences ... this could take a while"
  # Make decoys, concatenate and delete the decoys-only file. This call was
  # garbled in the published diff; the reconstruction assumes Randomize.make_decoys
  # takes the input file, entry count, output file and decoy prefix.
  Randomize.make_decoys(raw_db_filename, db_length, decoys_filename, decoy_prefix)
  cmd = "cat #{raw_db_filename} #{decoys_filename} >> #{decoy_db_filename}; rm #{decoys_filename}"
  sh %{ #{cmd} }
end

# Adjust dependencies depending on whether we're making decoys
#
case dbspec[:decoys]
when true
  throw "Decoys are only supported for fasta formatted databases" unless format == "fasta"
  file db_filename => decoy_db_filename
else
  file db_filename => raw_db_filename
end

###################
# Symlink Current #
###################

# The current database file should symlink to either the raw or the decoy database
#
file db_filename do
  if dbspec[:is_annotation_db]
    # For annotation databases we don't use symlinks at all
    db_filename = raw_db_filename
  else
    # Source db filename is either decoy or raw
    #
    source_db_filename = dbspec[:decoys] ? decoy_db_filename : raw_db_filename

    p "Current db links to #{source_db_filename}"

    # Symlink to the source file
    #
    File.symlink(source_db_filename, db_filename)
  end
end

###################
# Indexing        #
###################

if dbspec[:make_blast_index]
  blast_index_files = FileList.new([".phr"].collect { |ext| "#{db_filename}#{ext}" })
  blast_index_files.each do |indfile|
    file indfile => db_filename do
      cmd = "cd #{dbdir}; #{$genv.makeblastdb} -in #{db_filename} -parse_seqids -dbtype prot"
      p "Creating blast index"
      sh %{ #{cmd} }
    end
  end

  task dbname => blast_index_files
end

if dbspec[:make_msgf_index]
  msgf_index_files = FileList.new([".canno"].collect { |ext| "#{db_filename}#{ext}" })
  msgf_index_files.each do |indfile|
    file indfile => db_filename do
      cmd = "cd #{dbdir}; java -Xmx3500M -cp #{$genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{db_filename} -tda 0"
      p "Creating msgf index"
      sh %{ #{cmd} }
    end
  end

  task dbname => msgf_index_files
end

if format == "dat" && dbspec[:is_annotation_db]
  dat_index_files = FileList.new(["config.dat", "id_AC.index", "key_ID.key"].collect { |file| "#{dbdir}/#{file}" })

  dat_index_files.each do |indexfile|
    file indexfile => db_filename do
      puts "Indexing annotation database"
      dbclass = Bio::SPTR
      parser = Bio::FlatFileIndex::Indexer::Parser.new(dbclass, nil, nil)
      Bio::FlatFileIndex::Indexer::makeindexFlat(dbdir, parser, {}, db_filename)
    end
  end

  task dbname => dat_index_files
end

#################
# Root task     #
#################

task dbname => db_filename
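For orientation, here is a sketch of the .protkdb.yaml spec this rakefile consumes, expressed as the Ruby hash that YAML.load_file would need to return. The keys are exactly those read above; every value is an illustrative assumption, not a spec shipped with the gem:

# Hypothetical <dbdir>/.protkdb.yaml contents, shown as a Ruby hash.
dbspec = {
  :format => "fasta",                          # defaults to "fasta" when absent
  :sources => [
    # An Array entry means ftp_source: [data URL (wildcards allowed), release-notes URL or "none"]
    ["ftp://ftp.example.org/pub/proteomes/sprot*.fasta.gz", "none"],
    # A *.fasta/*.txt/*.dat string means file_source (relative paths resolve under the databases dir);
    # any other string is treated as another protk database name (db_source).
    "contaminants.fasta"
  ],
  :include_filters => [["sapiens"], [".*"]],   # one array of definition-line filters per source
  :id_regexes => ["sp\\|(\\S+)", "^(\\S+)"],   # one regex per source; capture group 1 becomes the entry id
  :decoys => true,                             # fasta only; builds with_decoys.fasta
  :decoy_prefix => "decoy_",
  :archive_old => false,
  :is_annotation_db => false,
  :make_blast_index => true,
  :make_msgf_index => false
}

The root task at the bottom (task dbname => db_filename) means the whole pipeline is driven by invoking the task named after the database, normally through the manage_db.rb wrapper listed in the file summary above.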