protk 1.2.4 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/add_retention_times.rb +89 -0
- data/bin/augustus_to_proteindb.rb +193 -0
- data/bin/blastxml_to_table.rb +72 -0
- data/bin/feature_finder.rb +7 -1
- data/bin/make_decoy.rb +10 -2
- data/bin/mascot_search.rb +14 -4
- data/bin/msgfplus_search.rb +14 -5
- data/bin/peptide_prophet.rb +14 -7
- data/bin/protxml_to_gff.rb +624 -0
- data/bin/protxml_to_table.rb +19 -2
- data/bin/sixframe.rb +3 -1
- data/bin/tandem_search.rb +51 -23
- data/bin/toppas_pipeline.rb +8 -3
- data/bin/uniprot_annotation.rb +6 -1
- data/ext/protk/{protk.c → decoymaker/decoymaker.c} +13 -15
- data/ext/protk/decoymaker/extconf.rb +3 -0
- data/ext/protk/simplealign/extconf.rb +3 -0
- data/lib/protk/data/FeatureFinderIsotopeWavelet.ini +6 -6
- data/lib/protk/gapped_aligner.rb +264 -0
- data/lib/protk/manage_db_rakefile.rake +2 -1
- data/lib/protk/mascot_util.rb +7 -2
- data/lib/protk/randomize.rb +2 -2
- data/lib/protk/search_tool.rb +1 -1
- data/lib/protk/setup_rakefile.rake +25 -2
- data/lib/protk/spreadsheet_extensions.rb +1 -0
- data/lib/protk/swissprot_database.rb +11 -1
- metadata +30 -8
- data/bin/mascot2xml.rb +0 -87
- data/ext/protk/extconf.rb +0 -3
- data/lib/protk/data/pepxml_mascot_template.xml +0 -29
- data/lib/protk/data/predefined_db.trembl_annotation.yaml +0 -20
@@ -0,0 +1,89 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 14/12/2010
|
5
|
+
#
|
6
|
+
# Attempts to add retention times to pepxml by looking up retention times in a raw file
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'protk/constants'
|
10
|
+
require 'protk/command_runner'
|
11
|
+
require 'protk/tool'
|
12
|
+
require 'libxml'
|
13
|
+
require 'protk/mascot_util'
|
14
|
+
include LibXML
|
15
|
+
|
16
|
+
# Environment with global constants
|
17
|
+
#
|
18
|
+
genv=Constants.new
|
19
|
+
|
20
|
+
tool=Tool.new([:over_write])
|
21
|
+
tool.option_parser.banner = "Look up retention times in a raw file and \
|
22
|
+
add them to a pepxml file.\n\nUsage: add_retention_times.rb [options] file1.pep.xml file2.mgf"
|
23
|
+
|
24
|
+
exit unless tool.check_options
|
25
|
+
|
26
|
+
if ( ARGV[0].nil? || ARGV[1].nil? )
|
27
|
+
puts "You must supply an input file"
|
28
|
+
puts tool.option_parser
|
29
|
+
exit
|
30
|
+
end
|
31
|
+
|
32
|
+
pepxml_file=ARGV[0]
|
33
|
+
mgf_file=ARGV[1]
|
34
|
+
|
35
|
+
pepxml_parser=XML::Parser.file(pepxml_file)
|
36
|
+
|
37
|
+
# Build the spectrum -> retention time lookup table from the mgf file.
# If indexing fails for any reason, report the failure (including the
# underlying error message) and stop, since nothing can be annotated.
begin
  # The original statement was a bare string literal and printed nothing.
  puts "Creating mascot spectrum id table"
  rt_table=MascotUtil.index_mgf_times(mgf_file)
rescue => e
  puts "Unable to index retention times in mgf file: #{e.message}"
  exit
end
|
44
|
+
|
45
|
+
pepxml_ns_prefix="xmlns:"
|
46
|
+
pepxml_ns="xmlns:http://regis-web.systemsbiology.net/pepXML"
|
47
|
+
|
48
|
+
pepxml_doc=pepxml_parser.parse
|
49
|
+
if not pepxml_doc.root.namespaces.default
|
50
|
+
pepxml_ns_prefix=""
|
51
|
+
pepxml_ns=nil
|
52
|
+
end
|
53
|
+
|
54
|
+
|
55
|
+
|
56
|
+
queries=pepxml_doc.find("//#{pepxml_ns_prefix}spectrum_query", pepxml_ns)
|
57
|
+
|
58
|
+
# Walk every spectrum_query element and write a retention_time_sec attribute
# taken from rt_table.
#
# Lookup is attempted with the full spectrum name first, then with the last
# character removed, then with the last two characters removed.
#
# An existing retention_time_sec value stops the script unless the tool's
# over_write option is set, in which case the value is replaced. (The
# original code exited unconditionally, so the over_write test that followed
# could never take effect, and it referred to an undefined bare name.)
queries.each do |query|
  spect = query.attributes["spectrum"]

  # throw is for non-local jumps; an error condition should be raised.
  # ArgumentError is what an uncaught throw surfaced as, so rescuers still match.
  raise ArgumentError, "No spectrum found for spectrum_query #{query}" if spect.nil?

  retention_time = rt_table[spect]
  retention_time = rt_table[spect.chop] if retention_time.nil?
  retention_time = rt_table[spect.chop.chop] if retention_time.nil?

  if retention_time.nil?
    puts "No retention time found for spectrum #{spect}"
    next
  end

  already_present = !query.attributes["retention_time_sec"].nil?
  if already_present && !tool.over_write
    puts "A retention time value is already present"
    exit
  end

  query.attributes["retention_time_sec"] = retention_time
end
|
88
|
+
|
89
|
+
pepxml_doc.save(pepxml_file)
|
@@ -0,0 +1,193 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 4/9/2013
|
5
|
+
#
|
6
|
+
#
|
7
|
+
|
8
|
+
require 'protk/constants'
|
9
|
+
require 'protk/tool'
|
10
|
+
require 'bio'
|
11
|
+
|
12
|
+
tool=Tool.new([:explicit_output])
|
13
|
+
tool.option_parser.banner = "Create a protein database from Augustus gene prediction output that is suitable for later processing by proteogenomics tools.\n\nUsage: augustus_to_proteindb.rb [options] augustus.gff3"
|
14
|
+
|
15
|
+
tool.options.add_transcript_info=false
|
16
|
+
tool.option_parser.on( '--info', 'Include CDS coordinates' ) do
|
17
|
+
tool.options.add_transcript_info=true
|
18
|
+
end
|
19
|
+
|
20
|
+
exit unless tool.check_options
|
21
|
+
|
22
|
+
if ( ARGV[0].nil? )
|
23
|
+
puts "You must supply an input file"
|
24
|
+
puts tool.option_parser
|
25
|
+
exit
|
26
|
+
end
|
27
|
+
|
28
|
+
inname=ARGV.shift
|
29
|
+
|
30
|
+
$add_transcript_info=tool.add_transcript_info
|
31
|
+
|
32
|
+
$print_progress=true
|
33
|
+
|
34
|
+
outfile=nil
|
35
|
+
if ( tool.explicit_output != nil)
|
36
|
+
outfile=File.open(tool.explicit_output,'w')
|
37
|
+
else
|
38
|
+
outfile=$stdout
|
39
|
+
$print_progress=false
|
40
|
+
end
|
41
|
+
|
42
|
+
|
43
|
+
# Picks the transcript feature rows out of the lines collected for one gene.
#
# A line qualifies when it contains the text "transcript", a tab, an optional
# run of digits and another tab.
#
# gene_lines - Array of Strings.
#
# Returns a new Array with the qualifying lines in their original order.
def get_transcript_lines(gene_lines)
  gene_lines.select { |line| line =~ /transcript\t\d*\t/ }
end
|
52
|
+
|
53
|
+
# Picks the CDS feature rows out of the lines collected for one gene.
#
# A line qualifies when it contains the text "CDS", a tab, an optional run of
# digits and another tab.
#
# gene_lines - Array of Strings.
#
# Returns a new Array with the qualifying lines in their original order.
def get_cds_lines(gene_lines)
  gene_lines.select { |line| line =~ /CDS\t\d*\t/ }
end
|
62
|
+
|
63
|
+
$capturing_protein=false
|
64
|
+
|
65
|
+
# Switches the $capturing_protein flag on when the line opens a
# "protein sequence = [" section. Lines that do not open one leave the
# flag as it was.
def capture_protein_start(line)
  $capturing_protein = true if line =~ /protein sequence = \[/
end
|
70
|
+
|
71
|
+
# Tells whether the line closes the protein section currently being captured.
#
# Returns true only while $capturing_protein is set and the line contains
# "# " followed later by a closing "]"; otherwise false.
def at_protein_end(line)
  return false unless $capturing_protein
  (line =~ /# .*?\]/) ? true : false
end
|
77
|
+
|
78
|
+
# Groups the lines of each protein sequence section found in a gene block.
#
# Capture state is tracked in the global $capturing_protein, which is reset
# on entry and again after every completed section. A section runs from the
# line recognised by capture_protein_start up to and including the line
# recognised by at_protein_end. Lines of a section that is never closed are
# not returned.
#
# gene_lines - Array of Strings.
#
# Returns an Array of Arrays of Strings, one inner Array per protein.
def get_protein_sequence_lines(gene_lines)
  $capturing_protein = false
  completed = []
  pending = []

  gene_lines.each do |line|
    capture_protein_start(line)
    next unless $capturing_protein

    pending << line
    if at_protein_end(line)
      completed << pending
      pending = []
      $capturing_protein = false
    end
  end

  completed
end
|
95
|
+
|
96
|
+
# Formats the coordinates of a CDS row as "start|end" when the row belongs to
# the given transcript.
#
# The row must contain "CDS", a tab, the start digits, a tab, the end digits,
# later a "+" or "-" and must end with "Parent=<transcript_id>".
#
# coding_sequence - String holding one CDS row.
# transcript_id   - Identifier expected after "Parent=". It is matched
#                   literally: it is escaped before being placed in the
#                   pattern so characters such as "." are not treated as
#                   regular expression wildcards.
#
# Returns "start|end", or "" when the row does not match.
def cds_to_header_text(coding_sequence,transcript_id)
  parent = Regexp.escape(transcript_id.to_s)
  found = coding_sequence.match(/CDS\t(\d+)\t(\d+).*?([-\+]{1}.*?Parent=#{parent})$/)
  return "" if found.nil?

  "#{found[1]}|#{found[2]}"
end
|
106
|
+
|
107
|
+
# Builds the FASTA header line for one transcript.
#
# The header has the form
#   >lcl|<scaffold>_<fwd|rev>_<transcript id> <start>|<end>
# where the strand is "rev" when the first "+"/"-" found after the
# coordinates is "-", and "fwd" otherwise. When the global
# $add_transcript_info is set, the result of cds_to_header_text for every
# entry of coding_sequences is appended, each preceded by a single space.
#
# transcript_line  - String holding the transcript row.
# coding_sequences - Array of CDS row Strings for the gene.
# scaffold         - Name of the sequence the transcript lies on.
#
# Returns the header String (without a trailing newline).
# Raises ArgumentError when transcript_line cannot be parsed, instead of
# failing later with a NoMethodError on a nil match.
def sequence_fasta_header(transcript_line,coding_sequences,scaffold)
  tmatch = transcript_line.match(/transcript\t(\d+)\t(\d+).*?([-\+]{1}).*?ID=(.*?);/)
  raise ArgumentError, "Unable to parse transcript line: #{transcript_line}" if tmatch.nil?

  tstart, tend, strand_symbol, tid = tmatch.captures
  tstrand = strand_symbol == "-" ? "rev" : "fwd"

  header = ">lcl|#{scaffold}_#{tstrand}_#{tid} #{tstart}|#{tend}"
  if $add_transcript_info
    coding_sequences.each { |coding_sequence| header << " #{cds_to_header_text(coding_sequence,tid)}" }
  end
  header
end
|
123
|
+
|
124
|
+
# Joins the residues spread over the lines of one protein section.
#
# From every line the trailing run of word characters is taken, ignoring an
# optional closing "]" at the very end. A line that has no such trailing run
# contributes nothing (previously it caused a NoMethodError on a nil match).
#
# protein_lines - Array of Strings.
#
# Returns the concatenated sequence String ("" for an empty Array).
def protein_sequence(protein_lines)
  pieces = protein_lines.map do |line|
    found = line.match(/(\w+)\]?$/)
    found ? found[1] : ""
  end
  pieces.join
end
|
132
|
+
|
133
|
+
# Converts the lines collected for one gene into FASTA text.
#
# For every transcript row a header (see sequence_fasta_header, which is
# given the global $current_scaffold) and the matching protein sequence are
# emitted, each followed by a newline. Transcripts and proteins are paired by
# position. As a side effect the global $capturing_gene is cleared.
#
# gene_lines - Array of Strings; the first entry must be the
#              "start gene <id>" line.
#
# Returns the FASTA String for the gene.
# Raises ArgumentError when the block does not begin with a start gene line
# or when the number of transcripts differs from the number of proteins.
# (The original used throw with a String, which surfaces as an uncaught-throw
# ArgumentError; raising ArgumentError directly keeps rescuers working.)
def parse_gene(gene_lines)
  unless gene_lines.first.to_s =~ /start gene (.*)/
    raise ArgumentError, "gene block does not begin with a start gene line"
  end

  transcripts = get_transcript_lines(gene_lines)
  coding_sequences = get_cds_lines(gene_lines)
  proteins = get_protein_sequence_lines(gene_lines)
  raise ArgumentError, "transcripts/protein mismatch" unless transcripts.length == proteins.length

  fasta_string = ""
  transcripts.zip(proteins).each do |transcript_line, protein_lines|
    fasta_string << "#{sequence_fasta_header(transcript_line,coding_sequences,$current_scaffold)}\n"
    fasta_string << "#{protein_sequence(protein_lines)}\n"
  end

  $capturing_gene = false
  fasta_string
end
|
152
|
+
|
153
|
+
# Records the name of the sequence announced by a
# "-- prediction on sequence number ... name = <name>)" line.
#
# On a match the name is stored in the global $current_scaffold and, when the
# global $print_progress is set, also printed. Other lines are ignored.
# The pattern is evaluated once; the original matched it twice.
#
# Returns nil.
def capture_scaffold(line)
  found = line.match(/-- prediction on sequence number.*?name = (.*)\)/)
  return nil if found.nil?

  $current_scaffold = found[1]
  puts $current_scaffold if $print_progress
  nil
end
|
161
|
+
|
162
|
+
# Switches the $capturing_gene flag on when the line contains
# "# start gene". Other lines leave the flag as it was.
def capture_gene_start(line)
  $capturing_gene = true if line =~ /# start gene/
end
|
167
|
+
|
168
|
+
# Tells whether the line contains the "# end gene" marker.
#
# Returns true or false.
def at_gene_end(line)
  (line =~ /# end gene/) ? true : false
end
|
174
|
+
|
175
|
+
$current_scaffold=""
|
176
|
+
gene_lines=[]
|
177
|
+
$capturing_gene=false
|
178
|
+
|
179
|
+
|
180
|
+
# Stream the Augustus output line by line. Scaffold announcements update
# $current_scaffold, lines between "# start gene" and "# end gene" are
# buffered, and each completed gene is converted to FASTA and written out.
#
# File.foreach closes the input when the block finishes; the original
# File.open(...) handle was never closed.
File.foreach(inname) do |raw_line|
  line = raw_line.chomp
  capture_scaffold(line)
  capture_gene_start(line)

  if at_gene_end(line)
    outfile.write parse_gene(gene_lines)
    gene_lines = []
  elsif $capturing_gene
    gene_lines << line
  end
end

# Release the output file when one was opened; leave $stdout alone.
outfile.close unless outfile.equal?($stdout)
|
@@ -0,0 +1,72 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
#
|
5
|
+
#
|
6
|
+
|
7
|
+
require 'protk/constants'
|
8
|
+
require 'protk/tool'
|
9
|
+
require 'bio'
|
10
|
+
require 'protk/fastadb'
|
11
|
+
require 'bio-blastxmlparser'
|
12
|
+
|
13
|
+
|
14
|
+
tool=Tool.new([:explicit_output])
|
15
|
+
tool.option_parser.banner = "Dump BLAST xml to tabular format.\n\nUsage: blastxml_to_table.rb blast.xml"
|
16
|
+
|
17
|
+
tool.options.database=nil
|
18
|
+
tool.option_parser.on( '-d filename','--database filename', 'Database used for BLAST search. If provided, hit sequences will be looked up in this database' ) do |file|
|
19
|
+
tool.options.database=file
|
20
|
+
end
|
21
|
+
|
22
|
+
exit unless tool.check_options
|
23
|
+
|
24
|
+
#require 'debugger';debugger
|
25
|
+
|
26
|
+
exit unless ARGV.length == 1
|
27
|
+
input_file=ARGV[0]
|
28
|
+
|
29
|
+
out_file=$stdout
|
30
|
+
if ( tool.explicit_output != nil)
|
31
|
+
out_file=File.open(tool.explicit_output, "w")
|
32
|
+
end
|
33
|
+
|
34
|
+
$fastadb = nil
|
35
|
+
if tool.database
|
36
|
+
$fastadb=FastaDB.new(tool.database)
|
37
|
+
end
|
38
|
+
|
39
|
+
# Renders one HSP as a tab separated, newline terminated row.
#
# Columns, in order: query id, hit id, hit number, hit definition, HSP number,
# bit score, e-value, query sequence, hit sequence of the alignment and, only
# when hit_seq is given, the supplied hit sequence.
#
# hsp, hit, query - objects answering the accessors read below.
# hit_seq         - optional extra column; omitted when nil or false.
#
# Returns the row as a String.
def generate_line(hsp,hit,query,hit_seq=nil)
  columns = [
    query.query_id,
    hit.hit_id,
    hit.hit_num,
    hit.hit_def,
    hsp.hsp_num,
    hsp.bit_score,
    hsp.evalue,
    hsp.qseq,
    hsp.hseq
  ]
  columns.push(hit_seq) if hit_seq
  columns.map { |column| column.to_s }.join("\t") + "\n"
end
|
47
|
+
|
48
|
+
# Looks up the sequence of a hit in the database held in the global $fastadb.
#
# hit - object answering hit_id.
#
# Returns nil when no database was configured. Otherwise the first entry
# returned by $fastadb.fetch(hit.hit_id) is asked for its aaseq; when the
# lookup yields no first entry, nil is returned instead of failing with a
# NoMethodError.
# NOTE(review): assumes fetch returns a collection for unknown ids rather
# than raising - confirm against FastaDB.
def fetch_hit_seq(hit)
  return nil unless $fastadb

  entry = $fastadb.fetch(hit.hit_id).first
  entry ? entry.aaseq : nil
end
|
55
|
+
|
56
|
+
blast = Bio::BlastXMLParser::XmlSplitterIterator.new(input_file).to_enum
|
57
|
+
|
58
|
+
blast.each do |query|
|
59
|
+
query.hits.each do |hit|
|
60
|
+
# hit=query.hits.first
|
61
|
+
# if hit
|
62
|
+
hit_seq=fetch_hit_seq(hit)
|
63
|
+
hit.hsps.each do |hsp|
|
64
|
+
out_file.write generate_line(hsp,hit,query,hit_seq)
|
65
|
+
end
|
66
|
+
# end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
#require 'debugger';debugger
|
71
|
+
|
72
|
+
#puts "Hi"
|
data/bin/feature_finder.rb
CHANGED
@@ -10,6 +10,7 @@ require 'protk/command_runner'
|
|
10
10
|
require 'protk/tool'
|
11
11
|
require 'protk/openms_defaults'
|
12
12
|
require 'libxml'
|
13
|
+
require 'tempfile'
|
13
14
|
|
14
15
|
include LibXML
|
15
16
|
|
@@ -62,7 +63,9 @@ end
|
|
62
63
|
|
63
64
|
throw "Cannot use explicit output in combination with multiple input files" if ( tool.explicit_output && ARGV.length>1)
|
64
65
|
|
65
|
-
|
66
|
+
input_basename=Pathname.new(ARGV[0]).basename.to_s
|
67
|
+
ini_file_name="#{Pathname.new(Tempfile.new(input_basename).path).basename.to_s}_feature_finder.ini"
|
68
|
+
ini_file="#{Pathname.new(ini_file_name).dirname.realpath.to_s}/#{ini_file_name}"
|
66
69
|
|
67
70
|
generate_ini(tool,ini_file)
|
68
71
|
|
@@ -74,6 +77,9 @@ ARGV.each do |filen|
|
|
74
77
|
output_dir=Pathname.new(input_basename).dirname.realpath.to_s
|
75
78
|
output_base=Pathname.new(input_basename).basename.to_s
|
76
79
|
output_file = "#{output_dir}/#{tool.output_prefix}#{output_base}#{tool.output_suffix}.featureXML"
|
80
|
+
if ( tool.explicit_output )
|
81
|
+
output_file = "#{output_dir}/#{tool.explicit_output}"
|
82
|
+
end
|
77
83
|
|
78
84
|
if ( tool.over_write || !Pathname.new(output_file).exist? )
|
79
85
|
output_base_filename=Pathname.new(output_file).basename.to_s
|
data/bin/make_decoy.rb
CHANGED
@@ -11,6 +11,8 @@ require 'libxml'
|
|
11
11
|
require 'protk/constants'
|
12
12
|
require 'protk/command_runner'
|
13
13
|
require 'protk/tool'
|
14
|
+
require 'protk/randomize'
|
15
|
+
require 'tempfile'
|
14
16
|
require 'bio'
|
15
17
|
|
16
18
|
include LibXML
|
@@ -60,8 +62,14 @@ output_file = tool.explicit_output if tool.explicit_output!=nil
|
|
60
62
|
|
61
63
|
genv=Constants.new()
|
62
64
|
|
63
|
-
|
64
|
-
|
65
|
+
decoys_tmp_file = Pathname.new(Tempfile.new("random").path).basename.to_s;
|
66
|
+
|
67
|
+
Randomize.make_decoys input_file, db_length, decoys_tmp_file, tool.prefix_string
|
68
|
+
cmd = "cat #{input_file} #{decoys_tmp_file} >> #{output_file}; rm #{decoys_tmp_file}" if ( tool.append )
|
69
|
+
|
70
|
+
# Randomize.make_decoys raw_db_filename, db_length, decoys_filename, decoy_prefix
|
71
|
+
# cmd = "cat #{raw_db_filename} #{decoys_filename} >> #{decoy_db_filename}; rm #{decoys_filename}"
|
72
|
+
|
65
73
|
p cmd
|
66
74
|
# Run the conversion
|
67
75
|
#
|
data/bin/mascot_search.rb
CHANGED
@@ -124,7 +124,7 @@ search_tool.option_parser.on('--username un', 'Username.') do |un|
|
|
124
124
|
search_tool.options.username = un
|
125
125
|
end
|
126
126
|
|
127
|
-
search_tool.options.httpproxy=
|
127
|
+
search_tool.options.httpproxy=nil
|
128
128
|
search_tool.option_parser.on( '--proxy url', 'The url to a proxy server' ) do |urll|
|
129
129
|
search_tool.options.httpproxy=urll
|
130
130
|
end
|
@@ -144,6 +144,11 @@ search_tool.option_parser.on( '--export format', 'Save results in a specified fo
|
|
144
144
|
search_tool.options.export_format=format
|
145
145
|
end
|
146
146
|
|
147
|
+
search_tool.options.timeout=200
|
148
|
+
search_tool.option_parser.on( '--timeout seconds', 'Timeout for sending data file to mascot in seconds' ) do |seconds|
|
149
|
+
search_tool.options.timeout=seconds.to_i
|
150
|
+
end
|
151
|
+
|
147
152
|
exit unless search_tool.check_options
|
148
153
|
|
149
154
|
if ( ARGV[0].nil? )
|
@@ -161,8 +166,7 @@ unless ( mascot_cgi =~ /^http[s]?:\/\//)
|
|
161
166
|
mascot_cgi = "http://#{mascot_cgi}"
|
162
167
|
end
|
163
168
|
|
164
|
-
RestClient.proxy=search_tool.httpproxy
|
165
|
-
|
169
|
+
RestClient.proxy=search_tool.httpproxy if search_tool.httpproxy
|
166
170
|
$genv.log("Var mods #{search_tool.var_mods} and fixed #{search_tool.fix_mods}",:info)
|
167
171
|
|
168
172
|
cookie=""
|
@@ -177,7 +181,13 @@ end
|
|
177
181
|
postdict = search_params_dictionary search_tool, ARGV[0]
|
178
182
|
$genv.log("Sending #{postdict}",:info)
|
179
183
|
|
180
|
-
|
184
|
+
#site = RestClient::Resource.new(mascot_cgi, timeout=300)
|
185
|
+
#search_response=site['/nph-mascot.exe?1'].post , postdict, {:cookies=>cookie}
|
186
|
+
|
187
|
+
search_response=RestClient::Request.execute(:method => :post, :url => "#{mascot_cgi}/nph-mascot.exe?1", :payload => postdict,:headers=>{:cookies=>cookie},:timeout => search_tool.options.timeout, :open_timeout => 10)
|
188
|
+
|
189
|
+
|
190
|
+
#search_response=RestClient.post "#{mascot_cgi}/nph-mascot.exe?1", postdict, {:cookies=>cookie}
|
181
191
|
|
182
192
|
$genv.log("Mascot search response was #{search_response}",:info)
|
183
193
|
|