protk 1.1.0.pre
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions exactly as they appear in the public registry.
- data/README.md +85 -0
- data/bin/annotate_ids.rb +59 -0
- data/bin/big_search.rb +41 -0
- data/bin/correct_omssa_retention_times.rb +27 -0
- data/bin/feature_finder.rb +76 -0
- data/bin/file_convert.rb +157 -0
- data/bin/generate_omssa_loc.rb +42 -0
- data/bin/interprophet.rb +91 -0
- data/bin/make_decoy.rb +64 -0
- data/bin/manage_db.rb +123 -0
- data/bin/mascot_search.rb +187 -0
- data/bin/mascot_to_pepxml.rb +44 -0
- data/bin/msgfplus_search.rb +191 -0
- data/bin/omssa_search.rb +205 -0
- data/bin/peptide_prophet.rb +245 -0
- data/bin/pepxml_to_table.rb +78 -0
- data/bin/protein_prophet.rb +140 -0
- data/bin/protk_setup.rb +31 -0
- data/bin/repair_run_summary.rb +113 -0
- data/bin/tandem_search.rb +292 -0
- data/bin/template_search.rb +144 -0
- data/bin/unimod_to_loc.rb +118 -0
- data/bin/xls_to_table.rb +46 -0
- data/ext/protk/extconf.rb +3 -0
- data/ext/protk/protk.c +235 -0
- data/lib/protk/big_search_rakefile.rake +16 -0
- data/lib/protk/big_search_tool.rb +23 -0
- data/lib/protk/bio_sptr_extensions.rb +210 -0
- data/lib/protk/biotools_excel_converter.rb +60 -0
- data/lib/protk/command_runner.rb +84 -0
- data/lib/protk/constants.rb +296 -0
- data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
- data/lib/protk/data/apt-get_packages.yaml +47 -0
- data/lib/protk/data/brew_packages.yaml +10 -0
- data/lib/protk/data/default_config.yml +20 -0
- data/lib/protk/data/predefined_db.crap.yaml +19 -0
- data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
- data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
- data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
- data/lib/protk/data/tandem_params.xml +56 -0
- data/lib/protk/data/taxonomy_template.xml +9 -0
- data/lib/protk/data/unimod.xml +16780 -0
- data/lib/protk/eupathdb_gene_information_table.rb +158 -0
- data/lib/protk/galaxy_stager.rb +24 -0
- data/lib/protk/galaxy_util.rb +9 -0
- data/lib/protk/manage_db_rakefile.rake +484 -0
- data/lib/protk/manage_db_tool.rb +181 -0
- data/lib/protk/mascot_util.rb +63 -0
- data/lib/protk/omssa_util.rb +57 -0
- data/lib/protk/plasmodb.rb +50 -0
- data/lib/protk/prophet_tool.rb +85 -0
- data/lib/protk/protein_annotator.rb +646 -0
- data/lib/protk/protxml.rb +137 -0
- data/lib/protk/randomize.rb +7 -0
- data/lib/protk/search_tool.rb +182 -0
- data/lib/protk/setup_rakefile.rake +245 -0
- data/lib/protk/setup_tool.rb +19 -0
- data/lib/protk/spreadsheet_extensions.rb +78 -0
- data/lib/protk/swissprot_database.rb +38 -0
- data/lib/protk/tool.rb +182 -0
- data/lib/protk/xtandem_defaults.rb +11 -0
- data/lib/protk.rb +18 -0
- metadata +256 -0
data/lib/protk/eupathdb_gene_information_table.rb
@@ -0,0 +1,158 @@
+# Code for interacting with EuPathDB gene information files e.g. http://cryptodb.org/common/downloads/release-4.3/Cmuris/txt/CmurisGene_CryptoDB-4.3.txt
+# These gene information files contain a large amount of information about individual genes/proteins in EuPathDBs.
+
+require 'tempfile'
+
+# A class for extracting gene info from a particular gene from the information file
+class EuPathDBGeneInformationFileExtractor
+  # A filename path to the gene information file
+  attr_accessor :filename
+
+  def initialize(filename = nil)
+    @filename = filename
+  end
+
+  # Returns a EuPathDBGeneInformation object corresponding to the wanted key. If
+  # there are multiple in the file, only the first is returned. If none are found, nil is returned.
+  #
+  # If grep_hack_lines is defined (as an integer), then a shortcut is applied to speed things up. Before parsing the gene info file, grep some lines after the "Gene Id: .." line. Then feed that into the parser.
+  def extract_gene_info(wanted_gene_id, grep_hack_lines = nil)
+    inside_iterator = lambda do |gene|
+      return gene if wanted_gene_id == gene.info['Gene Id']
+    end
+
+    filename = @filename
+    p @filename
+    if grep_hack_lines and grep_hack_lines.to_i != 0
+      tempfile=Tempfile.new('reubypathdb_grep_hack')
+      # grep however many lines from past the point. Rather dodgy, but faster.
+      raise Exception, "grep_hack_lines should be an integer" unless grep_hack_lines.is_a?(Integer)
+      `grep -A #{grep_hack_lines} 'Gene Id: #{wanted_gene_id}' '#{@filename}' >#{tempfile.path}`
+      EuPathDBGeneInformationTable.new(File.open(tempfile.path)).each do |gene|
+        return inside_iterator.call(gene)
+      end
+    else
+      # no grep hack. Parse the whole gene information file
+      EuPathDBGeneInformationTable.new(File.open(@filename)).each do |gene|
+        return inside_iterator.call(gene)
+      end
+    end
+    return nil
+  end
+end
+
+# A class for parsing the 'gene information table' files from EuPathDB, such
+# as http://cryptodb.org/common/downloads/release-4.3/Cmuris/txt/CmurisGene_CryptoDB-4.3.txt
+#
+# The usual way of interacting with these is the use of the each method,
+# which returns a EuPathDBGeneInformation object with all of the recorded
+# information in it.
+class EuPathDBGeneInformationTable
+  include Enumerable
+
+  # Initialise using an IO object, say File.open('/path/to/CmurisGene_CryptoDB-4.3.txt'). After opening, the #each method can be used to iterate over the genes that are present in the file
+  def initialize(io)
+    @io = io
+  end
+
+  # Return a EuPathDBGeneInformation object with
+  # the contained info in it, one at a time
+  def each
+    while g = next_gene
+      yield g
+    end
+  end
+
+  # Returns a EuPathDBGeneInformation object with all the data you could
+  # possibly want.
+  def next_gene
+    info = EuPathDBGeneInformation.new
+
+    # first, read the table, which should start with the ID column
+    line = @io.readline.strip
+    while line == ''
+      return nil if @io.eof?
+      line = @io.readline.strip
+    end
+
+    while line != ''
+      if matches = line.match(/^(.*?)\: (.*)$/)
+        info.add_information(matches[1], matches[2])
+      else
+        raise Exception, "EuPathDBGeneInformationTable Couldn't parse this line: #{line}"
+      end
+
+      line = @io.readline.strip
+    end
+
+    # now read each of the tables, which should start with the
+    # 'TABLE: <name>' entry
+    line = @io.readline.strip
+    table_name = nil
+    headers = nil
+    data = []
+    while line != '------------------------------------------------------------'
+      if line == ''
+        # add it to the stack unless we are just starting out
+        info.add_table(table_name, headers, data) unless table_name.nil?
+
+        # reset things
+        table_name = nil
+        headers = nil
+        data = []
+      elsif matches = line.match(/^TABLE\: (.*)$/)
+        # name of a table
+        table_name = matches[1]
+      elsif line.match(/^\[.*\]/)
+        # headings of the table
+        headers = line.split("\t").collect do |header|
+          header.gsub(/^\[/,'').gsub(/\]$/,'')
+        end
+      else
+        # a proper data row
+        data.push line.split("\t")
+      end
+      line = @io.readline.strip
+    end
+
+    # return the object that has been created
+    return info
+  end
+end
+
+# Each gene in the gene information table is represented
+# by 2 types of information - info and tables.
+# info are 1 line data, whereas tables are tables of
+# data with possibly multiple rows
+class EuPathDBGeneInformation
+  def info
+    @info
+  end
+
+  def get_info(key)
+    @info[key]
+  end
+  alias_method :[], :get_info
+
+  def get_table(table_name)
+    @tables[table_name]
+  end
+
+  def add_information(key, value)
+    @info ||= {}
+    @info[key] = value
+    "Added info #{key}, now is #{@info[key]}"
+  end
+
+  def add_table(name, headers, data)
+    @tables ||= {}
+    @tables[name] = []
+    data.each do |row|
+      final = {}
+      row.each_with_index do |cell, i|
+        final[headers[i]] = cell
+      end
+      @tables[name].push final
+    end
+  end
+end
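The classes above are typically driven through EuPathDBGeneInformationFileExtractor#extract_gene_info or EuPathDBGeneInformationTable#each. A minimal usage sketch follows; it is not part of the gem, and the download path, gene id and table name are placeholders.

    require 'protk/eupathdb_gene_information_table'

    # Placeholder path to a downloaded EuPathDB gene information file
    extractor = EuPathDBGeneInformationFileExtractor.new('/data/CmurisGene_CryptoDB-4.3.txt')

    # Parse the whole file; pass an Integer as the second argument to use the
    # grep shortcut described in the method's comments above.
    gene = extractor.extract_gene_info('cgd7_230')   # placeholder gene id

    unless gene.nil?
      puts gene.info['Gene Id']          # one-line "info" entries
      rows = gene.get_table('GO Terms')  # placeholder table name; tables come back as arrays of row hashes
    end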
data/lib/protk/galaxy_stager.rb
@@ -0,0 +1,24 @@
+require 'pathname'
+
+class GalaxyStager
+  attr_accessor :staged_path
+
+  def initialize(original_path, options = {})
+    options = { :name => nil, :extension => '' }.merge(options)
+    @original_path = Pathname.new(original_path)
+    @wd = Dir.pwd
+    staged_name = options[:name] || @original_path.basename
+    @staged_path = File.join(@wd, "#{staged_name}#{options[:extension]}")
+    File.symlink(@original_path, @staged_path)
+  end
+
+  def restore_references(in_file)
+    GalaxyStager.replace_references(in_file, @staged_path, @original_path)
+  end
+
+  def self.replace_references(in_file, from_path, to_path)
+    cmd="ruby -pi -e \"gsub('#{from_path}', '#{to_path}')\" #{in_file}"
+    %x[#{cmd}]
+  end
+
+end
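GalaxyStager symlinks a Galaxy dataset into the working directory under a tool-friendly name and can later rewrite the staged path back to the original path inside an output file. A minimal sketch, not part of the gem; the dataset path, name, extension and output filename are placeholders.

    require 'protk/galaxy_stager'

    # Stage /galaxy/files/dataset_001.dat as ./input.mzML (placeholder values)
    stager = GalaxyStager.new('/galaxy/files/dataset_001.dat',
                              :name => 'input', :extension => '.mzML')
    puts stager.staged_path

    # ... run a tool that records the staged path in its output ...

    # Rewrite occurrences of the staged path back to the original dataset path
    stager.restore_references('search_results.pep.xml')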
data/lib/protk/manage_db_rakefile.rake
@@ -0,0 +1,484 @@
+require 'protk/constants'
+require 'protk/randomize'
+require 'uri'
+require 'digest/md5'
+require 'net/ftp'
+require 'net/ftp/list'
+require 'bio'
+require 'tempfile'
+require 'pp'
+require 'set'
+
+dbname=ARGV[0]
+
+# Load database spec file
+#
+$genv=Constants.new()
+dbdir="#{$genv.protein_database_root}/#{dbname}"
+
+dbspec_file="#{dbdir}/.protkdb.yaml"
+dbspec=YAML.load_file "#{dbspec_file}"
+
+format = dbspec[:format]!=nil ? dbspec[:format] : "fasta"
+
+# Output database filename
+#
+db_filename="#{dbdir}/current.#{format}"
+
+#####################
+# Utility Functions #
+#####################
+
+
+def check_ftp_release_notes(release_notes)
+  rn_uri = URI.parse(release_notes)
+
+  rn_path="#{$genv.database_downloads}/#{rn_uri.host}/#{rn_uri.path}"
+
+
+  host=rn_uri.host
+  Net::FTP.open(host) do |ftp|
+
+    ftp.login
+    rn_dir=Pathname.new(rn_uri.path).dirname.to_s
+    rn_file=Pathname.new(rn_uri.path).basename.to_s
+    ftp.chdir(rn_dir)
+
+    ftp.passive=true
+
+
+    p "Checking release notes"
+
+    # Is the last path component a wildcard expression (we only allow *)
+    # If so we need to find the file with the most recent modification time
+    #
+    if ( rn_file =~ /\*/)
+      entries=ftp.list(rn_file)
+      p entries
+      latest_file=nil
+      latest_file_mtime=nil
+      entries.each do |dir_entry|
+        info=Net::FTP::List.parse(dir_entry)
+        if ( info.file? )
+          latest_file_mtime = info.mtime if ( latest_file_mtime ==nil )
+          latest_file = info.basename if ( latest_file_mtime ==nil )
+
+          if ( info.mtime <=> latest_file_mtime ) #entry's mtime is later
+            latest_file_mtime=info.mtime
+            latest_file=info.basename
+          end
+
+        end
+      end
+
+      throw "No release notes found" if ( latest_file ==nil)
+
+      rn_file=latest_file
+
+      # Adjust the rn_path to be the path of the latest file
+      #
+      rn_path="#{Pathname.new(rn_path).dirname}/#{latest_file}"
+
+    end
+
+    # Hash existing release notes data if it exists
+    #
+    existing_digest=nil
+    existing_digest=Digest::MD5.hexdigest(File.read(rn_path)) if Pathname.new(rn_path).exist?
+
+
+
+    rn_data=""
+    dl_file=Tempfile.new("rn_file")
+
+    ftp.getbinaryfile(rn_file,dl_file.path) { |data| rn_data << data }
+
+    rn_digest=Digest::MD5.hexdigest(rn_data)
+
+    p "Done Downloading release notes #{ftp} #{rn_file} to #{dl_file.path} #{ftp.pwd}"
+
+    throw "No release notes data at #{release_notes}" unless rn_digest!=nil
+
+    # Update release notes data
+    case
+    when ( existing_digest != rn_digest )
+      FileUtils.mkpath(Pathname.new(rn_path).dirname.to_s)
+      File.open(rn_path, "w") {|file| file.puts(rn_data) }
+    else
+      p "Release notes are up to date"
+    end
+  end
+end
+
+def download_ftp_file(ftp,file_name,dest_dir)
+  dest_path="#{dest_dir}/#{file_name}"
+
+  download_size=ftp.size(file_name)
+  mod_time=ftp.mtime(file_name,true)
+
+
+
+  percent_size=download_size/100
+  i=1
+  pc_complete=0
+  last_time=Time.new
+  p "Downloading #{file_name}"
+  ftp.passive=true
+
+  ftp.getbinaryfile(file_name,dest_path,1024) { |data|
+
+    progress=i*1024
+    if ( pc_complete < progress.divmod(percent_size)[0] && ( Time.new - last_time) > 10 )
+      pc_complete=progress.divmod(percent_size)[0]
+      p "Downloading #{file_name} #{pc_complete} percent complete"
+      last_time=Time.new
+    end
+    i=i+1
+  }
+
+end
+
+def download_ftp_source(source)
+
+  data_uri = URI.parse(source)
+
+  data_path="#{$genv.database_downloads}/#{data_uri.host}/#{data_uri.path}"
+  # Make sure our destination dir is available
+  #
+  FileUtils.mkpath(Pathname.new(data_path).dirname.to_s)
+
+
+
+  Net::FTP.open(data_uri.host) do |ftp|
+    p "Connected to #{data_uri.host}"
+    ftp.login
+
+    ftp.chdir(Pathname.new(data_uri.path).dirname.to_s)
+
+    last_path_component=Pathname.new(data_uri.path).basename.to_s
+
+    case
+    when last_path_component=~/\*/ # A wildcard match. Need to download them all
+      p "Getting directory listing for #{last_path_component}"
+      ftp.passive=true
+      matching_items=ftp.list(last_path_component)
+
+      PP.pp(matching_items)
+
+      matching_items.each do |dir_entry|
+        info=Net::FTP::List.parse(dir_entry)
+        download_ftp_file(ftp,info.basename,Pathname.new(data_path).dirname)
+      end
+
+    else # Just one file to download
+      download_ftp_file(ftp,last_path_component,Pathname.new(data_path).dirname)
+    end
+
+  end
+
+end
+
+
+def archive_fasta_file(filename)
+  if ( Pathname.new(filename).exist? )
+    mt=File.new(filename).mtime
+    timestamp="#{mt.year}_#{mt.month}_#{mt.day}"
+    archive_filename="#{filename.gsub(/.fasta$/,'')}_#{timestamp}.fasta"
+    p "Moving old database to #{archive_filename}"
+    FileUtils.mv(filename,archive_filename)
+  end
+end
+
+#####################
+# Source Files #
+#####################
+
+def file_source(raw_source)
+  full_path=raw_source
+  full_path = "#{$genv.protein_database_root}/#{raw_source}" unless ( raw_source =~ /^\//) # relative paths should be relative to datbases dir
+  throw "File source #{full_path} does not exist" unless Pathname.new(full_path).exist?
+  full_path
+end
+
+def db_source(db_source)
+  current_release_path = "#{$genv.protein_database_root}/#{db_source}/current.fasta"
+  throw "Database source #{current_release_path} does not exist" unless Pathname.new(current_release_path).exist?
+  current_release_path
+end
+
+
+def ftp_source(ftpsource)
+
+
+  data_uri=URI.parse(ftpsource[0])
+  data_file_path="#{$genv.database_downloads}/#{data_uri.host}/#{data_uri.path}"
+  unpacked_data_path=data_file_path.gsub(/\.gz$/,'')
+
+  release_notes_url=ftpsource[1]
+  release_notes_exist=true
+  release_notes_exist=false if release_notes_url =~ /^\s*none\s*$/
+  if release_notes_exist
+    data_rn=URI.parse(release_notes_url) unless
+    release_notes_file_path="#{$genv.database_downloads}/#{data_rn.host}/#{data_rn.path}"
+
+    task :check_rn do
+      check_ftp_release_notes(release_notes_url)
+    end
+
+    file release_notes_file_path => :check_rn
+  else
+    task :check_date do
+
+    end
+  end
+
+
+
+  if ( data_file_path=~/\*/) # A wildcard
+    unpacked_data_path=data_file_path.gsub(/\*/,"_all_").gsub(/\.gz$/,'')
+  end
+
+  file unpacked_data_path do #Unpacking. Includes unzipping and/or concatenating
+    download_ftp_source(ftpsource[0])
+
+    case
+    when data_file_path=~/\*/ # Multiple files to unzip/concatenate and we don't know what they are yet
+      file_pattern = Pathname.new(data_file_path).basename.to_s
+      if file_pattern =~ /.gz$/
+        unzipcmd="gunzip -vdf #{file_pattern}"
+        p "Unzipping #{unzipcmd} ... this could take a while"
+        sh %{ cd #{Pathname.new(data_file_path).dirname}; #{unzipcmd} }
+      end
+
+      file_pattern.gsub!(/\.gz$/,'')
+      catcmd="cat #{file_pattern} > #{unpacked_data_path}"
+
+      p "Concatenating files #{catcmd} ... this could take a while"
+      sh %{ cd #{Pathname.new(data_file_path).dirname}; #{catcmd} }
+
+    else # Simple case. A single file
+      if file_pattern =~ /.gz$/
+        p "Unzipping #{Pathname.new(data_file_path).basename} ... "
+        sh %{ cd #{Pathname.new(data_file_path).dirname}; gunzip -f #{Pathname.new(data_file_path).basename} }
+      end
+    end
+  end
+
+  task release_notes_file_path => release_notes_file_path if release_notes_exist
+
+  unpacked_data_path
+end
+
+source_files=dbspec[:sources].collect do |raw_source|
+  sf=""
+  case
+  when raw_source.class==Array
+    sf=ftp_source(raw_source)
+  when (raw_source =~ /\.fasta$/ || raw_source =~ /\.txt$/ || raw_source =~ /\.dat$/ )
+    sf=file_source(raw_source)
+  else
+    sf=db_source(raw_source)
+  end
+  sf
+end
+
+########################
+# Concat Filter Copy #
+########################
+
+raw_db_filename = "#{dbdir}/raw.#{format}"
+
+file raw_db_filename => [source_files,dbspec_file].flatten do
+
+  source_filters=dbspec[:include_filters]
+
+  if ( format == "fasta" && source_filters.length > 0 ) # We can perform concat and filter for fasta only
+
+    archive_fasta_file(raw_db_filename) if dbspec[:archive_old]
+
+    output_fh=File.open(raw_db_filename, "w")
+
+    id_regexes=dbspec[:id_regexes]
+    source_i=0
+    throw "The number of source files #{source_files.length} should equal the number of source filters #{source_filters.length}" unless source_filters.length == source_files.length
+    throw "The number of source files #{source_files.length} should equal the number of id regexes #{id_regexes.length}" unless source_filters.length == id_regexes.length
+
+    added_ids=Set.new
+
+    source_files.each do |source|
+      # Open source as Fasta
+      #
+      Bio::FlatFile.open(Bio::FastaFormat, source) do |ff|
+        p "Reading source file #{source}"
+
+        n_match=0
+
+        filters=source_filters[source_i] #An array of filters for this input file
+        id_regex=/#{id_regexes[source_i]}/
+
+        ff.each do |entry|
+          filters.each do |filter|
+            if ( entry.definition =~ /#{filter}/)
+              n_match=n_match+1
+              idmatch=id_regex.match(entry.definition)
+              case
+              when idmatch==nil || idmatch[1]==nil
+                p "No match to id regex #{id_regex} for #{entry.definition}. Skipping this entry"
+              else
+                new_def="#{idmatch[1]}"
+                if ( added_ids.include?(new_def) )
+                  p "Warning: Skipping duplicate definition for #{new_def}"
+                else
+                  entry.definition=new_def
+                  output_fh.puts(entry.to_s)
+                  added_ids.add new_def
+                end
+                # p entry.definition.to_s
+              end
+              break
+            end
+          end
+        end
+        p "Warning no match to any filter in #{filters} for source file #{source}" unless n_match > 0
+      end
+      source_i=source_i+1
+    end
+    output_fh.close
+  else # Other formats just copy a file across ... must be a single source
+
+    throw "Only a single source file is permitted for formats other than fasta" unless source_files.length == 1
+
+    sh "cp #{source_files[0]} #{raw_db_filename}" do |ok,res|
+      if ! ok
+        puts "Unable to copy #{source_files[0]} to #{raw_db_filename}"
+      end
+    end
+
+  end
+end
+
+#####################
+# Decoys #
+#####################
+
+decoy_db_filename = "#{dbdir}/with_decoys.fasta"
+file decoy_db_filename => raw_db_filename do
+
+  archive_fasta_file(decoy_db_filename) if dbspec[:archive_old]
+
+
+  decoys_filename = "#{dbdir}/decoys_only.fasta"
+  decoy_prefix=dbspec[:decoy_prefix]
+
+  # Count entries in the raw input file
+  #
+  ff=Bio::FlatFile.open(Bio::FastaFormat, raw_db_filename)
+  db_length=0
+  ff.each do |entry|
+    db_length=db_length+1
+  end
+
+  p "Generating decoy sequences ... this could take a while"
+  # Make decoys, concatenate and delete decoy only file
+  Randomize.make_decoys #{raw_db_filename} #{db_length} #{decoys_filename} #{decoy_prefix}"
+  cmd << "cat #{raw_db_filename} #{decoys_filename} >> #{decoy_db_filename}; rm #{decoys_filename}"
+  sh %{ #{cmd} }
+end
+
+# Adjust dependencies depending on whether we're making decoys
+#
+case dbspec[:decoys]
+when true
+  throw "Decoys are only supported for fasta formatted databases" unless format=="fasta"
+  file db_filename => decoy_db_filename
+else
+  file db_filename => raw_db_filename
+end
+
+
+###################
+# Symlink Current #
+###################
+
+
+# Current database file should symlink to raw or decoy
+#
+file db_filename do
+  if ( dbspec[:is_annotation_db])
+    db_filename=raw_db_filename # For annotation databases we don't use symlinks at all
+  else
+    # if we are an annotation db we can't symlink so do nothing
+
+    # source db filename is either decoy or raw
+    #
+    case dbspec[:decoys]
+    when true
+      source_db_filename = decoy_db_filename
+    when false
+      source_db_filename = raw_db_filename
+    end
+
+    p "Current db links to #{source_db_filename}"
+
+    # Symlink to the source file
+    #
+    File.symlink(source_db_filename,db_filename)
+  end
+end
+
+
+
+###################
+# Indexing #
+###################
+if dbspec[:make_blast_index]
+  blast_index_files=FileList.new([".phr"].collect {|ext| "#{db_filename}#{ext}" })
+  # task :make_blast_index => blast_index_files do
+  blast_index_files.each do |indfile|
+    file indfile => db_filename do
+      cmd="cd #{dbdir}; #{$genv.makeblastdb} -in #{db_filename} -parse_seqids -dbtype prot"
+      p "Creating blast index"
+      sh %{ #{cmd} }
+    end
+  end
+
+  task dbname => blast_index_files
+
+end
+
+
+if dbspec[:make_msgf_index]
+  msgf_index_files=FileList.new([".canno"].collect {|ext| "#{db_filename}#{ext}" })
+  # task :make_blast_index => blast_index_files do
+  msgf_index_files.each do |indfile|
+    file indfile => db_filename do
+      cmd="cd #{dbdir}; java -Xmx3500M -cp #{$genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{db_filename} -tda 0"
+      p "Creating msgf index"
+      sh %{ #{cmd} }
+    end
+  end
+
+  task dbname => msgf_index_files
+end
+
+if format=="dat" && dbspec[:is_annotation_db]
+  dat_index_files=FileList.new(["config.dat","id_AC.index","key_ID.key"].collect {|file| "#{dbdir}/#{file}"} )
+
+  dat_index_files.each do |indexfile|
+    file indexfile => db_filename do
+      puts "Indexing annotation database"
+      dbclass=Bio::SPTR
+      parser = Bio::FlatFileIndex::Indexer::Parser.new(dbclass, nil, nil)
+      Bio::FlatFileIndex::Indexer::makeindexFlat(dbdir, parser, {}, db_filename)
+    end
+  end
+
+  task dbname => dat_index_files

+end
+
+#################
+# Root task #
+#################
+
+task dbname => db_filename
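The rakefile above is driven entirely by the .protkdb.yaml spec it loads from the database directory. The sketch below writes such a spec from Ruby; it is an assumption based only on the keys the rakefile reads (:format, :sources, :include_filters, :id_regexes, :decoys, :decoy_prefix, :archive_old, :is_annotation_db, :make_blast_index, :make_msgf_index), every value is a placeholder, and in the released gem the spec is presumably generated by manage_db.rb rather than written by hand.

    require 'yaml'

    dbdir = '/path/to/protein_databases/mydb'   # placeholder database directory

    dbspec = {
      :format => 'fasta',
      # Each source is an [ftp_url, release_notes_url] pair, a path ending in
      # .fasta/.txt/.dat, or the name of another managed database.
      :sources => [['ftp://ftp.example.org/pub/proteomes/*.fasta.gz', 'none']],
      :include_filters => [['.*']],   # one array of definition-line regexes per source
      :id_regexes => ['^(\S+)'],      # one capture-group regex per source
      :decoys => true,
      :decoy_prefix => 'decoy_',
      :archive_old => false,
      :is_annotation_db => false,
      :make_blast_index => false,
      :make_msgf_index => false
    }

    File.open("#{dbdir}/.protkdb.yaml", 'w') { |f| f.puts dbspec.to_yaml }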