protk 1.2.4 → 1.2.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,89 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 14/12/2010
5
+ #
6
+ # Attempts to add retention times to pepxml by looking up retention times in a raw file
7
+ #
8
+
9
+ require 'protk/constants'
10
+ require 'protk/command_runner'
11
+ require 'protk/tool'
12
+ require 'libxml'
13
+ require 'protk/mascot_util'
14
+ include LibXML
15
+
16
+ # Environment with global constants
17
+ #
18
# Script body: read a pepXML file (first argument) and an MGF file (second
# argument), look up a retention time for every spectrum_query element and
# store it in that element's "retention_time_sec" attribute. The pepXML file
# is rewritten in place.
genv = Constants.new

tool = Tool.new([:over_write])
tool.option_parser.banner = "Look up retention times in a raw file and \
add them to a pepxml file.\n\nUsage: add_retention_times.rb [options] file1.pep.xml file2.mgf"

exit unless tool.check_options

if ARGV[0].nil? || ARGV[1].nil?
  puts "You must supply an input file"
  puts tool.option_parser
  exit
end

pepxml_file = ARGV[0]
mgf_file = ARGV[1]

pepxml_parser = XML::Parser.file(pepxml_file)

begin
  # The original had this message as a bare string literal, which is a no-op.
  puts "Creating mascot spectrum id table"
  rt_table = MascotUtil.index_mgf_times(mgf_file)
rescue
  puts "Unable to index retention times in mgf file"
  exit
end

pepxml_ns_prefix = "xmlns:"
pepxml_ns = "xmlns:http://regis-web.systemsbiology.net/pepXML"

pepxml_doc = pepxml_parser.parse
# Documents without a default namespace are queried without a prefix.
unless pepxml_doc.root.namespaces.default
  pepxml_ns_prefix = ""
  pepxml_ns = nil
end

queries = pepxml_doc.find("//#{pepxml_ns_prefix}spectrum_query", pepxml_ns)

queries.each do |query|
  spect = query.attributes["spectrum"]

  # raise (not throw) is the mechanism for signalling an error.
  raise "No spectrum found for spectrum_query #{query}" if spect.nil?

  # Try the spectrum name as given, then with one and two trailing
  # characters removed.
  retention_time = rt_table[spect]
  retention_time = rt_table[spect.chop] if retention_time.nil?
  retention_time = rt_table[spect.chop.chop] if retention_time.nil?

  if retention_time.nil?
    puts "No retention time found for spectrum #{spect}"
    next
  end

  # An existing value is only replaced when the over_write option was given.
  # Previously the script always exited here, so the option had no effect and
  # the later reference to an undefined local `over_write` was unreachable.
  if !query.attributes["retention_time_sec"].nil? && !tool.over_write
    puts "A retention time value is already present"
    exit
  end

  query.attributes["retention_time_sec"] = retention_time
end

pepxml_doc.save(pepxml_file)
@@ -0,0 +1,193 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 4/9/2013
5
+ #
6
+ #
7
+
8
+ require 'protk/constants'
9
+ require 'protk/tool'
10
+ require 'bio'
11
+
12
# Command line handling for augustus_to_proteindb: one positional input file,
# an optional --info switch and an optional explicit output file.
tool = Tool.new([:explicit_output])
tool.option_parser.banner = "Create a protein database from Augustus gene prediction output that is suitable for later processing by proteogenomics tools.\n\nUsage: augustus_to_proteindb.rb [options] augustus.gff3"

tool.options.add_transcript_info = false
tool.option_parser.on('--info', 'Include CDS coordinates') { tool.options.add_transcript_info = true }

exit unless tool.check_options

if ARGV.first.nil?
  puts "You must supply an input file"
  puts tool.option_parser
  exit
end

inname = ARGV.shift

$add_transcript_info = tool.add_transcript_info

# Progress messages are only printed when results go to a file; when results
# go to stdout they are switched off.
$print_progress = !tool.explicit_output.nil?
outfile = $print_progress ? File.open(tool.explicit_output, 'w') : $stdout
41
+
42
+
43
# Select the lines of a gene block that describe a transcript feature.
#
# @param gene_lines [Array<String>] lines captured for one gene
# @return [Array<String>] the lines containing "transcript<TAB>digits<TAB>",
#   in their original order
def get_transcript_lines(gene_lines)
  gene_lines.grep(/transcript\t(\d*?)\t/)
end
52
+
53
# Select the lines of a gene block that describe a CDS feature.
#
# @param gene_lines [Array<String>] lines captured for one gene
# @return [Array<String>] the lines containing "CDS<TAB>digits<TAB>",
#   in their original order
def get_cds_lines(gene_lines)
  gene_lines.grep(/CDS\t(\d*?)\t/)
end
62
+
63
# True while the lines being read belong to a protein sequence comment block.
$capturing_protein = false

# Switch protein capturing on when the line opens a protein sequence block.
# Returns true when the line matched, nil otherwise.
def capture_protein_start(line)
  $capturing_protein = true if line =~ /protein sequence = \[/
end
70
+
71
# Report whether the line closes the protein block currently being captured:
# capturing must be on and the line must contain "# " followed later by "]".
# Always returns exactly true or false.
def at_protein_end(line)
  closes_block = $capturing_protein && line =~ /# .*?\]/
  closes_block ? true : false
end
77
+
78
# Group the lines of a gene block into one array of lines per protein
# sequence. A group starts on the line recognised by capture_protein_start and
# ends (inclusive) on the line recognised by at_protein_end.
#
# Resets and updates the $capturing_protein global while scanning.
#
# @param gene_lines [Array<String>] lines captured for one gene
# @return [Array<Array<String>>] one array of lines per protein found
def get_protein_sequence_lines(gene_lines)
  $capturing_protein = false
  pending = []
  gene_lines.each_with_object([]) do |line, finished|
    capture_protein_start(line)
    if at_protein_end(line)
      finished << (pending << line)
      pending = []
      $capturing_protein = false
    elsif $capturing_protein
      pending << line
    end
  end
end
95
+
96
# Format the coordinates of a CDS line as "start|end", but only when the line
# ends with "Parent=<transcript_id>".
#
# The transcript id is escaped before being placed in the pattern so that
# regex metacharacters in ids (for example the "." in "g1.t1") are matched
# literally rather than as wildcards.
#
# @param coding_sequence [String] a CDS feature line
# @param transcript_id [String] id of the transcript the CDS must belong to
# @return [String] "start|end", or "" when the line does not match
def cds_to_header_text(coding_sequence, transcript_id)
  parent = Regexp.escape(transcript_id.to_s)
  imatch = coding_sequence.match(/CDS\t(\d+)\t(\d+).*?([-\+]{1}.*?Parent=#{parent})$/)
  return "" if imatch.nil?

  "#{imatch[1]}|#{imatch[2]}"
end
106
+
107
# Build the FASTA header line for one transcript:
#   ">lcl|<scaffold>_<fwd|rev>_<transcript id> <start>|<end>"
# When the $add_transcript_info global is set, the "start|end" text of every
# CDS belonging to this transcript is appended, separated by single spaces.
# CDS lines of other transcripts (for which cds_to_header_text returns "")
# are skipped, so they no longer leave stray spaces in the header.
#
# @param transcript_line [String] a transcript feature line
# @param coding_sequences [Array<String>] CDS lines of the enclosing gene
# @param scaffold [String] name of the sequence the gene was predicted on
# @return [String] the header, without a trailing newline
# @raise [ArgumentError] when transcript_line cannot be parsed
def sequence_fasta_header(transcript_line, coding_sequences, scaffold)
  tmatch = transcript_line.match(/transcript\t(\d+)\t(\d+).*?([-\+]{1}).*?ID=(.*?);/)
  raise ArgumentError, "Unrecognised transcript line: #{transcript_line}" if tmatch.nil?

  tstart, tend, strand_symbol, tid = tmatch.captures
  tstrand = strand_symbol == "-" ? "rev" : "fwd"

  header = ">lcl|#{scaffold}_#{tstrand}_#{tid} #{tstart}|#{tend}"
  if $add_transcript_info
    coding_sequences.each do |coding_sequence|
      cds_text = cds_to_header_text(coding_sequence, tid)
      header << " #{cds_text}" unless cds_text.empty?
    end
  end
  header
end
123
+
124
# Concatenate the residues found at the end of each protein comment line.
# The residues are the final run of word characters on the line, optionally
# followed by a closing "]". Lines that carry no residues are skipped instead
# of raising NoMethodError on a failed match.
#
# @param protein_lines [Array<String>] lines of one protein block
# @return [String] the concatenated sequence ("" when there are no residues)
def protein_sequence(protein_lines)
  seq = ""
  protein_lines.each do |line|
    residues = line[/(\w+)\]?$/, 1]
    seq << residues if residues
  end
  seq
end
132
+
133
# Convert the lines captured for one gene into FASTA text: one header line
# and one sequence line per transcript. Transcripts and proteins are paired
# by position. Sets the $capturing_gene global to false when done, and reads
# $current_scaffold for the header.
#
# @param gene_lines [Array<String>] lines from "start gene" up to, but not
#   including, the "end gene" line
# @return [String] FASTA records, each line terminated by "\n"
# @raise [RuntimeError] when the block does not begin with a "start gene"
#   line, or the number of transcripts differs from the number of proteins
def parse_gene(gene_lines)
  unless gene_lines.first =~ /start gene (.*)/
    raise "gene block does not begin with a 'start gene' line"
  end

  transcripts = get_transcript_lines(gene_lines)
  coding_sequences = get_cds_lines(gene_lines)
  proteins = get_protein_sequence_lines(gene_lines)
  # raise (not throw) is the mechanism for signalling an error.
  raise "transcripts/protein mismatch" unless transcripts.length == proteins.length

  fasta_string = ""
  transcripts.zip(proteins).each do |transcript, protein_lines|
    fasta_string << "#{sequence_fasta_header(transcript, coding_sequences, $current_scaffold)}\n"
    fasta_string << "#{protein_sequence(protein_lines)}\n"
  end

  $capturing_gene = false
  fasta_string
end
152
+
153
# Remember the sequence name announced by a "-- prediction on sequence
# number ... name = <name>)" line in $current_scaffold, echoing it when
# $print_progress is set. Lines that do not match are ignored.
# Always returns nil.
def capture_scaffold(line)
  return unless line =~ /-- prediction on sequence number.*?name = (.*)\)/

  $current_scaffold = Regexp.last_match(1)
  puts $current_scaffold if $print_progress
end
161
+
162
# Switch gene capturing on when the line contains "# start gene".
# Returns true when the line matched, nil otherwise.
def capture_gene_start(line)
  $capturing_gene = true if line =~ /# start gene/
end
167
+
168
# Report whether the line contains "# end gene".
# Always returns exactly true or false.
def at_gene_end(line)
  (line =~ /# end gene/) ? true : false
end
174
+
175
# Main loop: stream the input line by line, collect the lines of each gene
# (from its "start gene" line up to its "end gene" line) and write the FASTA
# text produced by parse_gene to the output.
$current_scaffold = ""
gene_lines = []
$capturing_gene = false

# File.foreach closes the input when the block finishes; the previous
# File.open(...).each_with_index left the handle open and never used the index.
File.foreach(inname) do |line|
  line.chomp!
  capture_scaffold(line)
  capture_gene_start(line)

  if at_gene_end(line)
    outfile.write parse_gene(gene_lines)
    gene_lines = []
  elsif $capturing_gene
    gene_lines << line
  end
end

# Close an explicitly opened output file so that buffered data is written out.
outfile.close unless outfile.equal?($stdout)
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ #
5
+ #
6
+
7
+ require 'protk/constants'
8
+ require 'protk/tool'
9
+ require 'bio'
10
+ require 'protk/fastadb'
11
+ require 'bio-blastxmlparser'
12
+
13
+
14
# Command line handling for blastxml_to_table: exactly one positional BLAST
# XML file, an optional FASTA database and an optional explicit output file.
tool = Tool.new([:explicit_output])
tool.option_parser.banner = "Dump BLAST xml to tabular format.\n\nUsage: blastxml_to_table.rb blast.xml"

tool.options.database = nil
tool.option_parser.on('-d filename', '--database filename', 'Database used for BLAST search. If provided, hit sequences will be looked up in this database') do |file|
  tool.options.database = file
end

exit unless tool.check_options

# Explain why the script stops instead of exiting silently, in the same way
# as the other command line tools in this package.
unless ARGV.length == 1
  puts "You must supply an input file"
  puts tool.option_parser
  exit
end
input_file = ARGV[0]

# Results go to stdout unless an explicit output file was requested.
out_file = $stdout
out_file = File.open(tool.explicit_output, "w") unless tool.explicit_output.nil?

# Database used to look up full hit sequences; nil when none was given.
$fastadb = nil
$fastadb = FastaDB.new(tool.database) if tool.database
38
+
39
# Format one HSP as a tab separated, newline terminated line:
# query id, hit id, hit number, hit definition, HSP number, bit score,
# e-value, query sequence, hit sequence and, when hit_seq is given, the
# full sequence of the hit.
def generate_line(hsp, hit, query, hit_seq = nil)
  columns = [
    query.query_id,
    hit.hit_id, hit.hit_num, hit.hit_def,
    hsp.hsp_num, hsp.bit_score, hsp.evalue, hsp.qseq, hsp.hseq
  ]
  columns << hit_seq if hit_seq
  columns.map(&:to_s).join("\t") << "\n"
end
47
+
48
# Look up the sequence of a hit in the $fastadb database: the aaseq of the
# first entry returned by fetching the hit's id.
# Returns nil when no database was configured.
def fetch_hit_seq(hit)
  return nil unless $fastadb

  $fastadb.fetch(hit.hit_id).first.aaseq
end
55
+
56
# Main loop: write one table line per HSP of every hit of every query in the
# BLAST XML input. The hit sequence is looked up once per hit and reused for
# all of that hit's HSPs.
blast = Bio::BlastXMLParser::XmlSplitterIterator.new(input_file).to_enum

blast.each do |query|
  query.hits.each do |hit|
    hit_seq = fetch_hit_seq(hit)
    hit.hsps.each { |hsp| out_file.write(generate_line(hsp, hit, query, hit_seq)) }
  end
end
69
+
70
+ #require 'debugger';debugger
71
+
72
+ #puts "Hi"
@@ -10,6 +10,7 @@ require 'protk/command_runner'
10
10
  require 'protk/tool'
11
11
  require 'protk/openms_defaults'
12
12
  require 'libxml'
13
+ require 'tempfile'
13
14
 
14
15
  include LibXML
15
16
 
@@ -62,7 +63,9 @@ end
62
63
 
63
64
  throw "Cannot use explicit output in combination with multiple input files" if ( tool.explicit_output && ARGV.length>1)
64
65
 
65
- ini_file="#{Pathname.new(ARGV[0]).dirname.realpath.to_s}/feature_finder.ini"
66
+ input_basename=Pathname.new(ARGV[0]).basename.to_s
67
+ ini_file_name="#{Pathname.new(Tempfile.new(input_basename).path).basename.to_s}_feature_finder.ini"
68
+ ini_file="#{Pathname.new(ini_file_name).dirname.realpath.to_s}/#{ini_file_name}"
66
69
 
67
70
  generate_ini(tool,ini_file)
68
71
 
@@ -74,6 +77,9 @@ ARGV.each do |filen|
74
77
  output_dir=Pathname.new(input_basename).dirname.realpath.to_s
75
78
  output_base=Pathname.new(input_basename).basename.to_s
76
79
  output_file = "#{output_dir}/#{tool.output_prefix}#{output_base}#{tool.output_suffix}.featureXML"
80
+ if ( tool.explicit_output )
81
+ output_file = "#{output_dir}/#{tool.explicit_output}"
82
+ end
77
83
 
78
84
  if ( tool.over_write || !Pathname.new(output_file).exist? )
79
85
  output_base_filename=Pathname.new(output_file).basename.to_s
data/bin/make_decoy.rb CHANGED
@@ -11,6 +11,8 @@ require 'libxml'
11
11
  require 'protk/constants'
12
12
  require 'protk/command_runner'
13
13
  require 'protk/tool'
14
+ require 'protk/randomize'
15
+ require 'tempfile'
14
16
  require 'bio'
15
17
 
16
18
  include LibXML
@@ -60,8 +62,14 @@ output_file = tool.explicit_output if tool.explicit_output!=nil
60
62
 
61
63
  genv=Constants.new()
62
64
 
63
- Randomize.make_decoys #{input_file} #{db_length} #{output_file} #{tool.prefix_string}"
64
- cmd << "cat #{input_file} >> #{output_file}" if ( tool.append )
65
+ decoys_tmp_file = Pathname.new(Tempfile.new("random").path).basename.to_s;
66
+
67
+ Randomize.make_decoys input_file, db_length, decoys_tmp_file, tool.prefix_string
68
+ cmd = "cat #{input_file} #{decoys_tmp_file} >> #{output_file}; rm #{decoys_tmp_file}" if ( tool.append )
69
+
70
+ # Randomize.make_decoys raw_db_filename, db_length, decoys_filename, decoy_prefix
71
+ # cmd = "cat #{raw_db_filename} #{decoys_filename} >> #{decoy_db_filename}; rm #{decoys_filename}"
72
+
65
73
  p cmd
66
74
  # Run the conversion
67
75
  #
data/bin/mascot_search.rb CHANGED
@@ -124,7 +124,7 @@ search_tool.option_parser.on('--username un', 'Username.') do |un|
124
124
  search_tool.options.username = un
125
125
  end
126
126
 
127
- search_tool.options.httpproxy=""
127
+ search_tool.options.httpproxy=nil
128
128
  search_tool.option_parser.on( '--proxy url', 'The url to a proxy server' ) do |urll|
129
129
  search_tool.options.httpproxy=urll
130
130
  end
@@ -144,6 +144,11 @@ search_tool.option_parser.on( '--export format', 'Save results in a specified fo
144
144
  search_tool.options.export_format=format
145
145
  end
146
146
 
147
+ search_tool.options.timeout=200
148
+ search_tool.option_parser.on( '--timeout seconds', 'Timeout for sending data file to mascot in seconds' ) do |seconds|
149
+ search_tool.options.timeout=seconds.to_i
150
+ end
151
+
147
152
  exit unless search_tool.check_options
148
153
 
149
154
  if ( ARGV[0].nil? )
@@ -161,8 +166,7 @@ unless ( mascot_cgi =~ /^http[s]?:\/\//)
161
166
  mascot_cgi = "http://#{mascot_cgi}"
162
167
  end
163
168
 
164
- RestClient.proxy=search_tool.httpproxy
165
-
169
+ RestClient.proxy=search_tool.httpproxy if search_tool.httpproxy
166
170
  $genv.log("Var mods #{search_tool.var_mods} and fixed #{search_tool.fix_mods}",:info)
167
171
 
168
172
  cookie=""
@@ -177,7 +181,13 @@ end
177
181
  postdict = search_params_dictionary search_tool, ARGV[0]
178
182
  $genv.log("Sending #{postdict}",:info)
179
183
 
180
- search_response=RestClient.post "#{mascot_cgi}/nph-mascot.exe?1", postdict, {:cookies=>cookie}
184
+ #site = RestClient::Resource.new(mascot_cgi, timeout=300)
185
+ #search_response=site['/nph-mascot.exe?1'].post , postdict, {:cookies=>cookie}
186
+
187
+ search_response=RestClient::Request.execute(:method => :post, :url => "#{mascot_cgi}/nph-mascot.exe?1", :payload => postdict,:headers=>{:cookies=>cookie},:timeout => search_tool.options.timeout, :open_timeout => 10)
188
+
189
+
190
+ #search_response=RestClient.post "#{mascot_cgi}/nph-mascot.exe?1", postdict, {:cookies=>cookie}
181
191
 
182
192
  $genv.log("Mascot search response was #{search_response}",:info)
183
193