protk 1.2.4 → 1.2.5
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/add_retention_times.rb +89 -0
- data/bin/augustus_to_proteindb.rb +193 -0
- data/bin/blastxml_to_table.rb +72 -0
- data/bin/feature_finder.rb +7 -1
- data/bin/make_decoy.rb +10 -2
- data/bin/mascot_search.rb +14 -4
- data/bin/msgfplus_search.rb +14 -5
- data/bin/peptide_prophet.rb +14 -7
- data/bin/protxml_to_gff.rb +624 -0
- data/bin/protxml_to_table.rb +19 -2
- data/bin/sixframe.rb +3 -1
- data/bin/tandem_search.rb +51 -23
- data/bin/toppas_pipeline.rb +8 -3
- data/bin/uniprot_annotation.rb +6 -1
- data/ext/protk/{protk.c → decoymaker/decoymaker.c} +13 -15
- data/ext/protk/decoymaker/extconf.rb +3 -0
- data/ext/protk/simplealign/extconf.rb +3 -0
- data/lib/protk/data/FeatureFinderIsotopeWavelet.ini +6 -6
- data/lib/protk/gapped_aligner.rb +264 -0
- data/lib/protk/manage_db_rakefile.rake +2 -1
- data/lib/protk/mascot_util.rb +7 -2
- data/lib/protk/randomize.rb +2 -2
- data/lib/protk/search_tool.rb +1 -1
- data/lib/protk/setup_rakefile.rake +25 -2
- data/lib/protk/spreadsheet_extensions.rb +1 -0
- data/lib/protk/swissprot_database.rb +11 -1
- metadata +30 -8
- data/bin/mascot2xml.rb +0 -87
- data/ext/protk/extconf.rb +0 -3
- data/lib/protk/data/pepxml_mascot_template.xml +0 -29
- data/lib/protk/data/predefined_db.trembl_annotation.yaml +0 -20
@@ -0,0 +1,89 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 14/12/2010
|
5
|
+
#
|
6
|
+
# Attempts to add retention times to pepxml by looking up retention times in a raw file
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'protk/constants'
|
10
|
+
require 'protk/command_runner'
|
11
|
+
require 'protk/tool'
|
12
|
+
require 'libxml'
|
13
|
+
require 'protk/mascot_util'
|
14
|
+
include LibXML
|
15
|
+
|
16
|
+
# Environment with global constants
#
genv=Constants.new

# The :over_write option lets an existing retention_time_sec value be replaced on request.
tool=Tool.new([:over_write])
tool.option_parser.banner = "Look up retention times in a raw file and add them to a pepxml file.\n\nUsage: add_retention_times.rb [options] file1.pep.xml file2.mgf"

exit unless tool.check_options

# Both the pepXML file and the mgf file are required.
if ARGV[0].nil? || ARGV[1].nil?
  puts "You must supply an input file"
  puts tool.option_parser
  exit
end

pepxml_file=ARGV[0]
mgf_file=ARGV[1]

pepxml_parser=XML::Parser.file(pepxml_file)

# Build the lookup table from spectrum id to retention time.
begin
  puts "Creating mascot spectrum id table"
  rt_table=MascotUtil.index_mgf_times(mgf_file)
rescue
  puts "Unable to index retention times in mgf file"
  exit
end

# Query with the pepXML namespace when the document declares a default namespace,
# and without any namespace otherwise.
pepxml_ns_prefix="xmlns:"
pepxml_ns="xmlns:http://regis-web.systemsbiology.net/pepXML"

pepxml_doc=pepxml_parser.parse
unless pepxml_doc.root.namespaces.default
  pepxml_ns_prefix=""
  pepxml_ns=nil
end

queries=pepxml_doc.find("//#{pepxml_ns_prefix}spectrum_query", pepxml_ns)

queries.each do |query|
  spect=query.attributes["spectrum"]

  # raise (not throw) so the failure surfaces as an ordinary exception carrying this message
  raise "No spectrum found for spectrum_query #{query}" if spect.nil?

  # Look the spectrum up as written, then with one and with two trailing characters removed.
  retention_time=rt_table[spect]
  retention_time=rt_table[spect.chop] if retention_time.nil?
  retention_time=rt_table[spect.chop.chop] if retention_time.nil?

  if retention_time.nil?
    puts "No retention time found for spectrum #{spect}"
    next
  end

  # An existing value is only replaced when the user asked for it with the over_write option.
  if !query.attributes["retention_time_sec"].nil? && !tool.over_write
    puts "A retention time value is already present"
    exit
  end

  query.attributes["retention_time_sec"]=retention_time
end

# The annotated document is written back over the input pepXML file.
pepxml_doc.save(pepxml_file)
|
@@ -0,0 +1,193 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 4/9/2013
|
5
|
+
#
|
6
|
+
#
|
7
|
+
|
8
|
+
require 'protk/constants'
|
9
|
+
require 'protk/tool'
|
10
|
+
require 'bio'
|
11
|
+
|
12
|
+
# Command line handling: one positional argument (the Augustus gff3 file), an optional
# explicit output file and an optional --info flag.
tool = Tool.new([:explicit_output])
tool.option_parser.banner = "Create a protein database from Augustus gene prediction output that is suitable for later processing by proteogenomics tools.\n\nUsage: augustus_to_proteindb.rb [options] augustus.gff3"

# CDS coordinates are left out of the fasta headers unless --info is given.
tool.options.add_transcript_info = false
tool.option_parser.on('--info', 'Include CDS coordinates') { tool.options.add_transcript_info = true }

exit unless tool.check_options

if ARGV[0].nil?
  puts "You must supply an input file"
  puts tool.option_parser
  exit
end

inname = ARGV.shift

$add_transcript_info = tool.add_transcript_info

# Progress messages go to stdout, so they are switched off when the
# database itself is being written to stdout.
if tool.explicit_output.nil?
  outfile = $stdout
  $print_progress = false
else
  outfile = File.open(tool.explicit_output, 'w')
  $print_progress = true
end
|
41
|
+
|
42
|
+
|
43
|
+
# Pick out the lines of a gene block that describe transcripts.
#
# gene_lines - Array of String lines belonging to one gene block.
#
# Returns a new Array with, in their original order, the lines that contain
# "transcript" followed by a tab, an optional run of digits and another tab.
def get_transcript_lines(gene_lines)
  gene_lines.select { |line| line =~ /transcript\t(\d*?)\t/ }
end
|
52
|
+
|
53
|
+
# Pick out the lines of a gene block that describe coding sequences.
#
# gene_lines - Array of String lines belonging to one gene block.
#
# Returns a new Array with, in their original order, the lines that contain
# "CDS" followed by a tab, an optional run of digits and another tab.
def get_cds_lines(gene_lines)
  gene_lines.select { |line| line =~ /CDS\t(\d*?)\t/ }
end
|
62
|
+
|
63
|
+
# True while the lines being scanned belong to a "protein sequence = [...]" comment block.
$capturing_protein=false

# Switch protein capturing on when the line opens a protein sequence block.
#
# line - String line from the Augustus output.
#
# Sets $capturing_protein to true if the line contains "protein sequence = [";
# otherwise the flag is left untouched.
def capture_protein_start(line)
  $capturing_protein=true if line =~ /protein sequence = \[/
end
|
70
|
+
|
71
|
+
# Report whether the line closes the protein sequence block currently being captured.
#
# line - String line from the Augustus output.
#
# Returns true only when $capturing_protein is set and the line contains "# "
# followed later by a closing "]"; returns false in every other case.
def at_protein_end(line)
  ($capturing_protein && line =~ /# .*?\]/) ? true : false
end
|
77
|
+
|
78
|
+
# Group the protein sequence comment lines of a gene block, one group per protein.
#
# gene_lines - Array of String lines belonging to one gene block.
#
# Capturing starts on a line accepted by capture_protein_start and stops on the
# line accepted by at_protein_end (which is included in the group). A single
# line may both open and close a protein. $capturing_protein is reset on entry.
#
# Returns an Array of Arrays of String lines, in the order the proteins appear.
def get_protein_sequence_lines(gene_lines)
  $capturing_protein = false
  finished = []
  pending = []
  gene_lines.each do |line|
    capture_protein_start(line)
    if at_protein_end(line)
      finished << (pending << line)
      pending = []
      $capturing_protein = false
    elsif $capturing_protein
      pending << line
    end
  end
  finished
end
|
95
|
+
|
96
|
+
# Format the coordinates of a CDS line for inclusion in a fasta header.
#
# coding_sequence - String CDS line from the Augustus output.
# transcript_id   - String id of the transcript the CDS must belong to.
#
# The line only counts when it ends with "Parent=<transcript_id>". The id is
# escaped before being placed in the pattern so that characters such as "."
# are matched literally instead of acting as regex wildcards.
#
# Returns "start|end" for a CDS of the given transcript, or "" otherwise.
def cds_to_header_text(coding_sequence,transcript_id)
  pattern=/CDS\t(\d+)\t(\d+).*?([-\+]{1}.*?Parent=#{Regexp.escape(transcript_id)})$/
  imatch=coding_sequence.match(pattern)
  return "" if imatch.nil?

  "#{imatch[1]}|#{imatch[2]}"
end
|
106
|
+
|
107
|
+
# Build the fasta header line for one transcript.
#
# transcript_line  - String transcript line from the Augustus output.
# coding_sequences - Array of String CDS lines of the enclosing gene.
# scaffold         - String name of the scaffold the gene was predicted on.
#
# The header has the form ">lcl|<scaffold>_<fwd|rev>_<id> <start>|<end>".
# When $add_transcript_info is set, one space-prefixed entry is appended for
# every line in coding_sequences, using cds_to_header_text (which yields an
# empty entry for a CDS that belongs to another transcript).
#
# Returns the header String.
# Raises RuntimeError if transcript_line cannot be parsed.
def sequence_fasta_header(transcript_line,coding_sequences,scaffold)
  tmatch=transcript_line.match(/transcript\t(\d+)\t(\d+).*?([-\+]{1}).*?ID=(.*?);/)
  raise "Unable to parse transcript line: #{transcript_line}" if tmatch.nil?

  tstart, tend, strand_symbol, tid = tmatch.captures
  tstrand = strand_symbol=="-" ? "rev" : "fwd"

  header=">lcl|#{scaffold}_#{tstrand}_#{tid} #{tstart}|#{tend}"
  if $add_transcript_info
    coding_sequences.each { |coding_sequence| header << " #{cds_to_header_text(coding_sequence,tid)}" }
  end
  header
end
|
123
|
+
|
124
|
+
# Join the residues spread over the comment lines of one protein.
#
# protein_lines - Array of String lines that together hold one protein sequence.
#
# From each line the trailing run of word characters is taken, ignoring an
# optional closing "]". A line without such a run (for example a lone "# ]")
# contributes nothing instead of aborting the conversion.
#
# Returns the concatenated sequence as a String ("" for no lines).
def protein_sequence(protein_lines)
  protein_lines.map do |line|
    residues=line.match(/(\w+)\]?$/)
    residues ? residues[1] : ""
  end.join
end
|
132
|
+
|
133
|
+
# Convert the captured lines of one gene into fasta text.
#
# gene_lines - Array of String lines, beginning with the "start gene" line.
#
# Each transcript line is paired, by position, with the protein found at the
# same position in the block. Every pair produces a header line followed by a
# sequence line. As a side effect $capturing_gene is set to false so that the
# caller stops collecting lines until the next gene starts.
#
# Returns the fasta text as a String ("" when the gene has no transcripts).
# Raises RuntimeError if the block does not begin with a "start gene" line or
# if the numbers of transcripts and proteins differ.
def parse_gene(gene_lines)
  raise "gene block does not begin with a start gene line" unless gene_lines.first =~ /start gene (.*)/

  transcripts=get_transcript_lines(gene_lines)
  coding_sequences=get_cds_lines(gene_lines)
  proteins=get_protein_sequence_lines(gene_lines)

  # raise (not throw) so the mismatch surfaces as an ordinary exception with this message
  raise "transcripts/protein mismatch" unless transcripts.length == proteins.length

  fasta_string=""
  transcripts.zip(proteins).each do |transcript_line, protein_lines|
    fasta_string << "#{sequence_fasta_header(transcript_line,coding_sequences,$current_scaffold)}\n"
    fasta_string << "#{protein_sequence(protein_lines)}\n"
  end

  $capturing_gene=false
  fasta_string
end
|
152
|
+
|
153
|
+
# Remember the scaffold name announced by a "prediction on sequence number" line.
#
# line - String line from the Augustus output.
#
# When the line matches, $current_scaffold is set to the text between
# "name = " and the last ")" on the line, and the name is printed if
# $print_progress is set. Other lines leave $current_scaffold untouched.
#
# Returns nil.
def capture_scaffold(line)
  scaffold_match=line.match(/-- prediction on sequence number.*?name = (.*)\)/)
  return if scaffold_match.nil?

  $current_scaffold=scaffold_match[1]
  puts $current_scaffold if $print_progress
end
|
161
|
+
|
162
|
+
# Switch gene capturing on when the line opens a gene block.
#
# line - String line from the Augustus output.
#
# Sets $capturing_gene to true if the line contains "# start gene";
# otherwise the flag is left untouched.
def capture_gene_start(line)
  $capturing_gene=true if line =~ /# start gene/
end
|
167
|
+
|
168
|
+
# Report whether the line closes a gene block.
#
# line - String line from the Augustus output.
#
# Returns true if the line contains "# end gene", false otherwise.
def at_gene_end(line)
  (line =~ /# end gene/) ? true : false
end
|
174
|
+
|
175
|
+
# Scanner state: the scaffold most recently announced, the lines of the gene
# being collected, and whether a gene block is currently open.
$current_scaffold=""
gene_lines=[]
$capturing_gene=false

# Stream the Augustus output line by line. File.foreach closes the input when
# the block finishes, so no file handle is left open.
File.foreach(inname) do |line|
  line.chomp!
  capture_scaffold(line)
  capture_gene_start(line)

  if at_gene_end(line)
    # A complete gene has been collected: convert it and start afresh.
    outfile.write parse_gene(gene_lines)
    gene_lines=[]
  elsif $capturing_gene
    gene_lines << line
  end
end

# Close the output when it is a file opened by this script; stdout is left alone.
outfile.close unless outfile.equal?($stdout)
|
@@ -0,0 +1,72 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
#
|
5
|
+
#
|
6
|
+
|
7
|
+
require 'protk/constants'
|
8
|
+
require 'protk/tool'
|
9
|
+
require 'bio'
|
10
|
+
require 'protk/fastadb'
|
11
|
+
require 'bio-blastxmlparser'
|
12
|
+
|
13
|
+
|
14
|
+
# Command line handling: one positional argument (the BLAST xml file), an
# optional explicit output file and an optional fasta database.
tool=Tool.new([:explicit_output])
tool.option_parser.banner = "Dump BLAST xml to tabular format.\n\nUsage: blastxml_to_table.rb blast.xml"

tool.options.database=nil
tool.option_parser.on( '-d filename','--database filename', 'Database used for BLAST search. If provided, hit sequences will be looked up in this database' ) do |file|
  tool.options.database=file
end

exit unless tool.check_options

# Tell the user what went wrong instead of exiting without any output.
unless ARGV.length == 1
  puts "You must supply exactly one input file"
  puts tool.option_parser
  exit
end
input_file=ARGV[0]

# Rows go to stdout unless an explicit output file was requested.
out_file = tool.explicit_output.nil? ? $stdout : File.open(tool.explicit_output, "w")

# Database used to look up full hit sequences; nil when none was given.
$fastadb = tool.database ? FastaDB.new(tool.database) : nil
|
38
|
+
|
39
|
+
# Format one hsp as a tab separated, newline terminated table row.
#
# hsp     - object responding to hsp_num, bit_score, evalue, qseq and hseq.
# hit     - object responding to hit_id, hit_num and hit_def.
# query   - object responding to query_id.
# hit_seq - optional value appended as a final column when it is not nil/false.
#
# Column order: query_id, hit_id, hit_num, hit_def, hsp_num, bit_score,
# evalue, qseq, hseq and, if given, hit_seq.
#
# Returns the row as a String.
def generate_line(hsp,hit,query,hit_seq=nil)
  fields=[
    query.query_id, hit.hit_id, hit.hit_num, hit.hit_def,
    hsp.hsp_num, hsp.bit_score, hsp.evalue, hsp.qseq, hsp.hseq
  ]
  fields << hit_seq if hit_seq
  fields.map(&:to_s).join("\t") + "\n"
end
|
47
|
+
|
48
|
+
# Look up the sequence of a hit in the optional fasta database.
#
# hit - object responding to hit_id.
#
# Uses the first entry that $fastadb.fetch returns for the hit id and reads
# its aaseq. A hit with no entry in the database yields nil rather than
# failing with a NoMethodError on nil.
#
# Returns the sequence, or nil when no database is loaded or no entry exists.
def fetch_hit_seq(hit)
  return nil unless $fastadb

  matches=$fastadb.fetch(hit.hit_id)
  entry=matches && matches.first
  entry && entry.aaseq
end
|
55
|
+
|
56
|
+
# Iterate over the queries in the BLAST xml file.
blast = Bio::BlastXMLParser::XmlSplitterIterator.new(input_file).to_enum

# Write one row per hsp of every hit of every query. The hit sequence is
# fetched once per hit and repeated on each of its rows.
blast.each do |query|
  query.hits.each do |hit|
    hit_seq=fetch_hit_seq(hit)
    hit.hsps.each do |hsp|
      out_file.write generate_line(hsp,hit,query,hit_seq)
    end
  end
end

# Close the output when it is a file opened by this script; stdout is left alone.
out_file.close unless out_file.equal?($stdout)
|
69
|
+
|
70
|
+
#require 'debugger';debugger
|
71
|
+
|
72
|
+
#puts "Hi"
|
data/bin/feature_finder.rb
CHANGED
@@ -10,6 +10,7 @@ require 'protk/command_runner'
|
|
10
10
|
require 'protk/tool'
|
11
11
|
require 'protk/openms_defaults'
|
12
12
|
require 'libxml'
|
13
|
+
require 'tempfile'
|
13
14
|
|
14
15
|
include LibXML
|
15
16
|
|
@@ -62,7 +63,9 @@ end
|
|
62
63
|
|
63
64
|
throw "Cannot use explicit output in combination with multiple input files" if ( tool.explicit_output && ARGV.length>1)
|
64
65
|
|
65
|
-
|
66
|
+
input_basename=Pathname.new(ARGV[0]).basename.to_s
|
67
|
+
ini_file_name="#{Pathname.new(Tempfile.new(input_basename).path).basename.to_s}_feature_finder.ini"
|
68
|
+
ini_file="#{Pathname.new(ini_file_name).dirname.realpath.to_s}/#{ini_file_name}"
|
66
69
|
|
67
70
|
generate_ini(tool,ini_file)
|
68
71
|
|
@@ -74,6 +77,9 @@ ARGV.each do |filen|
|
|
74
77
|
output_dir=Pathname.new(input_basename).dirname.realpath.to_s
|
75
78
|
output_base=Pathname.new(input_basename).basename.to_s
|
76
79
|
output_file = "#{output_dir}/#{tool.output_prefix}#{output_base}#{tool.output_suffix}.featureXML"
|
80
|
+
if ( tool.explicit_output )
|
81
|
+
output_file = "#{output_dir}/#{tool.explicit_output}"
|
82
|
+
end
|
77
83
|
|
78
84
|
if ( tool.over_write || !Pathname.new(output_file).exist? )
|
79
85
|
output_base_filename=Pathname.new(output_file).basename.to_s
|
data/bin/make_decoy.rb
CHANGED
@@ -11,6 +11,8 @@ require 'libxml'
|
|
11
11
|
require 'protk/constants'
|
12
12
|
require 'protk/command_runner'
|
13
13
|
require 'protk/tool'
|
14
|
+
require 'protk/randomize'
|
15
|
+
require 'tempfile'
|
14
16
|
require 'bio'
|
15
17
|
|
16
18
|
include LibXML
|
@@ -60,8 +62,14 @@ output_file = tool.explicit_output if tool.explicit_output!=nil
|
|
60
62
|
|
61
63
|
genv=Constants.new()
|
62
64
|
|
63
|
-
|
64
|
-
|
65
|
+
decoys_tmp_file = Pathname.new(Tempfile.new("random").path).basename.to_s;
|
66
|
+
|
67
|
+
Randomize.make_decoys input_file, db_length, decoys_tmp_file, tool.prefix_string
|
68
|
+
cmd = "cat #{input_file} #{decoys_tmp_file} >> #{output_file}; rm #{decoys_tmp_file}" if ( tool.append )
|
69
|
+
|
70
|
+
# Randomize.make_decoys raw_db_filename, db_length, decoys_filename, decoy_prefix
|
71
|
+
# cmd = "cat #{raw_db_filename} #{decoys_filename} >> #{decoy_db_filename}; rm #{decoys_filename}"
|
72
|
+
|
65
73
|
p cmd
|
66
74
|
# Run the conversion
|
67
75
|
#
|
data/bin/mascot_search.rb
CHANGED
@@ -124,7 +124,7 @@ search_tool.option_parser.on('--username un', 'Username.') do |un|
|
|
124
124
|
search_tool.options.username = un
|
125
125
|
end
|
126
126
|
|
127
|
-
search_tool.options.httpproxy=
|
127
|
+
search_tool.options.httpproxy=nil
|
128
128
|
search_tool.option_parser.on( '--proxy url', 'The url to a proxy server' ) do |urll|
|
129
129
|
search_tool.options.httpproxy=urll
|
130
130
|
end
|
@@ -144,6 +144,11 @@ search_tool.option_parser.on( '--export format', 'Save results in a specified fo
|
|
144
144
|
search_tool.options.export_format=format
|
145
145
|
end
|
146
146
|
|
147
|
+
search_tool.options.timeout=200
|
148
|
+
search_tool.option_parser.on( '--timeout seconds', 'Timeout for sending data file to mascot in seconds' ) do |seconds|
|
149
|
+
search_tool.options.timeout=seconds.to_i
|
150
|
+
end
|
151
|
+
|
147
152
|
exit unless search_tool.check_options
|
148
153
|
|
149
154
|
if ( ARGV[0].nil? )
|
@@ -161,8 +166,7 @@ unless ( mascot_cgi =~ /^http[s]?:\/\//)
|
|
161
166
|
mascot_cgi = "http://#{mascot_cgi}"
|
162
167
|
end
|
163
168
|
|
164
|
-
RestClient.proxy=search_tool.httpproxy
|
165
|
-
|
169
|
+
RestClient.proxy=search_tool.httpproxy if search_tool.httpproxy
|
166
170
|
$genv.log("Var mods #{search_tool.var_mods} and fixed #{search_tool.fix_mods}",:info)
|
167
171
|
|
168
172
|
cookie=""
|
@@ -177,7 +181,13 @@ end
|
|
177
181
|
postdict = search_params_dictionary search_tool, ARGV[0]
|
178
182
|
$genv.log("Sending #{postdict}",:info)
|
179
183
|
|
180
|
-
|
184
|
+
#site = RestClient::Resource.new(mascot_cgi, timeout=300)
|
185
|
+
#search_response=site['/nph-mascot.exe?1'].post , postdict, {:cookies=>cookie}
|
186
|
+
|
187
|
+
search_response=RestClient::Request.execute(:method => :post, :url => "#{mascot_cgi}/nph-mascot.exe?1", :payload => postdict,:headers=>{:cookies=>cookie},:timeout => search_tool.options.timeout, :open_timeout => 10)
|
188
|
+
|
189
|
+
|
190
|
+
#search_response=RestClient.post "#{mascot_cgi}/nph-mascot.exe?1", postdict, {:cookies=>cookie}
|
181
191
|
|
182
192
|
$genv.log("Mascot search response was #{search_response}",:info)
|
183
193
|
|