protk 1.2.4 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,89 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 14/12/2010
5
+ #
6
+ # Attempts to add retention times to pepxml by looking up retention times in a raw file
7
+ #
8
+
9
+ require 'protk/constants'
10
+ require 'protk/command_runner'
11
+ require 'protk/tool'
12
+ require 'libxml'
13
+ require 'protk/mascot_util'
14
+ include LibXML
15
+
16
# Script body: adds a retention_time_sec attribute to every spectrum_query
# element of a pepXML file, using a lookup table built from an MGF file by
# MascotUtil.index_mgf_times. The pepXML file is rewritten in place.

# Environment with global constants
#
genv = Constants.new

tool = Tool.new([:over_write])
tool.option_parser.banner = "Look up retention times in a raw file and add them to a pepxml file.\n\nUsage: add_retention_times.rb [options] file1.pep.xml file2.mgf"

exit unless tool.check_options

if ARGV[0].nil? || ARGV[1].nil?
  puts "You must supply an input file"
  puts tool.option_parser
  exit
end

pepxml_file = ARGV[0]
mgf_file = ARGV[1]

pepxml_parser = XML::Parser.file(pepxml_file)

begin
  # The original had this message as a bare string literal, so it was never shown.
  puts "Creating mascot spectrum id table"
  rt_table = MascotUtil.index_mgf_times(mgf_file)
rescue
  puts "Unable to index retention times in mgf file"
  exit
end

# Query with the pepXML namespace by default; fall back to un-namespaced
# lookups when the document root declares no default namespace.
pepxml_ns_prefix = "xmlns:"
pepxml_ns = "xmlns:http://regis-web.systemsbiology.net/pepXML"

pepxml_doc = pepxml_parser.parse
unless pepxml_doc.root.namespaces.default
  pepxml_ns_prefix = ""
  pepxml_ns = nil
end

queries = pepxml_doc.find("//#{pepxml_ns_prefix}spectrum_query", pepxml_ns)

queries.each do |query|
  spect = query.attributes["spectrum"]
  raise "No spectrum found for spectrum_query #{query}" if spect.nil?

  # Try the spectrum name as written, then with its last one and last two
  # characters removed (presumably a trailing suffix absent from the mgf
  # titles -- TODO confirm against MascotUtil.index_mgf_times).
  retention_time = rt_table[spect] || rt_table[spect.chop] || rt_table[spect.chop.chop]

  if retention_time.nil?
    puts "No retention time found for spectrum #{spect}"
    next
  end

  # An existing value is only replaced when the over_write option was given.
  # Previously the script always exited here and then consulted an undefined
  # local `over_write`, so the option could never take effect.
  if query.attributes["retention_time_sec"] && !tool.over_write
    puts "A retention time value is already present"
    exit
  end

  query.attributes["retention_time_sec"] = retention_time
end

pepxml_doc.save(pepxml_file)
@@ -0,0 +1,193 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 4/9/2013
5
+ #
6
+ #
7
+
8
+ require 'protk/constants'
9
+ require 'protk/tool'
10
+ require 'bio'
11
+
12
# Command-line handling for augustus_to_proteindb.rb: one positional input
# file, an optional explicit output file and an optional --info flag.
tool = Tool.new([:explicit_output])
tool.option_parser.banner = "Create a protein database from Augustus gene prediction output that is suitable for later processing by proteogenomics tools.\n\nUsage: augustus_to_proteindb.rb [options] augustus.gff3"

tool.options.add_transcript_info = false
tool.option_parser.on('--info', 'Include CDS coordinates') { tool.options.add_transcript_info = true }

exit unless tool.check_options

if ARGV[0].nil?
  puts "You must supply an input file"
  puts tool.option_parser
  exit
end

inname = ARGV.shift

$add_transcript_info = tool.add_transcript_info

# Scaffold names are echoed as progress only when the FASTA goes to a file;
# when the FASTA itself is written to stdout, progress printing is disabled.
if tool.explicit_output.nil?
  outfile = $stdout
  $print_progress = false
else
  outfile = File.open(tool.explicit_output, 'w')
  $print_progress = true
end
41
+
42
+
43
# Returns, in order, the entries of +gene_lines+ that contain the text
# "transcript" followed by a tab, an optional run of digits and another tab.
def get_transcript_lines(gene_lines)
  gene_lines.select { |entry| entry =~ /transcript\t(\d*?)\t/ }
end
52
+
53
# Returns, in order, the entries of +gene_lines+ that contain the text
# "CDS" followed by a tab, an optional run of digits and another tab.
def get_cds_lines(gene_lines)
  gene_lines.select { |entry| entry =~ /CDS\t(\d*?)\t/ }
end
62
+
63
+ $capturing_protein=false
64
+
65
# Sets the global $capturing_protein flag when +line+ contains the text
# "protein sequence = [". The flag is left untouched otherwise.
def capture_protein_start(line)
  $capturing_protein = true if line =~ /protein sequence = \[/
end
70
+
71
# Returns true when a protein is currently being captured ($capturing_protein)
# and +line+ contains "# " followed later by a closing "]"; false otherwise.
def at_protein_end(line)
  ($capturing_protein && line =~ /# .*?\]/) ? true : false
end
77
+
78
# Groups the protein-sequence lines of one gene.
#
# Walks +gene_lines+ in order, using capture_protein_start / at_protein_end
# to detect where each protein begins and ends, and returns an array holding
# one array of lines per protein. Resets the global $capturing_protein flag
# on entry and after each completed protein.
def get_protein_sequence_lines(gene_lines)
  $capturing_protein = false
  finished = []
  pending = []
  gene_lines.each do |line|
    capture_protein_start(line)
    unless at_protein_end(line)
      pending << line if $capturing_protein
      next
    end
    # The closing line belongs to the protein it terminates.
    pending << line
    finished << pending
    pending = []
    $capturing_protein = false
  end
  finished
end
95
+
96
# Formats the coordinates of one CDS line for inclusion in a FASTA header.
#
# coding_sequence - a line containing "CDS<TAB>start<TAB>end", a strand sign
#                   and ending in "Parent=<transcript_id>".
# transcript_id   - id of the transcript the CDS must belong to.
#
# Returns "start|end" when the line belongs to +transcript_id+, otherwise "".
def cds_to_header_text(coding_sequence,transcript_id)
  # The id is escaped so that characters such as "." in "g1.t1" are matched
  # literally instead of acting as regex wildcards.
  parent = Regexp.escape(transcript_id)
  imatch = coding_sequence.match(/CDS\t(\d+)\t(\d+).*?[-+].*?Parent=#{parent}$/)
  return "" if imatch.nil?

  "#{imatch[1]}|#{imatch[2]}"
end
106
+
107
# Builds the FASTA header line for one transcript.
#
# transcript_line  - line containing "transcript<TAB>start<TAB>end", a strand
#                    sign and "ID=<id>;".
# coding_sequences - CDS lines of the gene; only consulted when the global
#                    $add_transcript_info is set.
# scaffold         - scaffold name placed in the header.
#
# Returns ">lcl|<scaffold>_<fwd|rev>_<id> <start>|<end>", followed by one
# " <start>|<end>" token per CDS belonging to this transcript when
# $add_transcript_info is set.
# Raises RuntimeError when +transcript_line+ cannot be parsed.
def sequence_fasta_header(transcript_line,coding_sequences,scaffold)
  tmatch = transcript_line.match(/transcript\t(\d+)\t(\d+).*?([-\+]{1}).*?ID=(.*?);/)
  raise "Unable to parse transcript line: #{transcript_line}" if tmatch.nil?

  tstart, tend, strand, tid = tmatch.captures
  tstrand = strand == "-" ? "rev" : "fwd"

  header = ">lcl|#{scaffold}_#{tstrand}_#{tid} #{tstart}|#{tend}"
  if $add_transcript_info
    coding_sequences.each do |coding_sequence|
      cds_text = cds_to_header_text(coding_sequence, tid)
      # CDS lines of other transcripts yield "", which used to leave stray
      # blanks in the header; skip them.
      header << " #{cds_text}" unless cds_text.empty?
    end
  end
  header
end
123
+
124
# Concatenates the residues found on each protein-sequence line.
#
# From every line the trailing run of word characters (optionally followed
# by a closing "]") is taken. Lines that end in no such run -- for example an
# empty "[]" sequence -- contribute nothing instead of raising.
#
# protein_lines - array of lines making up one protein.
#
# Returns the joined sequence as a String ("" when nothing was found).
def protein_sequence(protein_lines)
  protein_lines.map { |line| line[/(\w+)\]?$/, 1] }.compact.join
end
132
+
133
# Converts the captured lines of one gene into FASTA text.
#
# Extracts the transcript, CDS and protein-sequence lines from +gene_lines+
# and emits, per transcript, a header line followed by a sequence line.
# Transcripts and proteins are paired by position. Reads the global
# $current_scaffold for the header and clears the global $capturing_gene.
#
# gene_lines - lines collected for one gene.
#
# Returns the FASTA text for all transcripts of the gene.
# Raises RuntimeError when the number of transcripts and proteins differ.
def parse_gene(gene_lines)
  transcripts = get_transcript_lines(gene_lines)
  coding_sequences = get_cds_lines(gene_lines)
  proteins = get_protein_sequence_lines(gene_lines)

  # `raise` reports a real error; `throw` without a matching `catch` only
  # produced an "uncaught throw" failure.
  unless transcripts.length == proteins.length
    raise "transcripts/protein mismatch"
  end

  fasta_string = ""
  transcripts.each_with_index do |ts, i|
    fasta_string << "#{sequence_fasta_header(ts, coding_sequences, $current_scaffold)}\n"
    fasta_string << "#{protein_sequence(proteins[i])}\n"
  end

  $capturing_gene = false
  fasta_string
end
152
+
153
# Records the scaffold name announced by a "-- prediction on sequence
# number ... name = <name>)" line in the global $current_scaffold, and
# prints it when the global $print_progress is set. Other lines are ignored.
def capture_scaffold(line)
  found = line.match(/-- prediction on sequence number.*?name = (.*)\)/)
  return unless found

  $current_scaffold = found[1]
  puts $current_scaffold if $print_progress
end
161
+
162
# Sets the global $capturing_gene flag when +line+ contains "# start gene".
# The flag is left untouched otherwise.
def capture_gene_start(line)
  $capturing_gene = true if line =~ /# start gene/
end
167
+
168
# Returns true when +line+ contains "# end gene", false otherwise.
def at_gene_end(line)
  line =~ /# end gene/ ? true : false
end
174
+
175
# Main loop: stream the Augustus output line by line, collect the lines of
# each gene between its "# start gene" and "# end gene" markers, and write
# the FASTA text for the gene as soon as its end marker is seen.
$current_scaffold = ""
gene_lines = []
$capturing_gene = false

# File.foreach closes the input when iteration finishes; the previous
# File.open(...).each_with_index left the handle open and ignored the index.
File.foreach(inname) do |line|
  line.chomp!
  capture_scaffold(line)
  capture_gene_start(line)

  if at_gene_end(line)
    outfile.write parse_gene(gene_lines)
    gene_lines = []
  else
    gene_lines << line if $capturing_gene
  end
end

# Flush and release an explicit output file; stdout is left open.
outfile.close unless outfile.equal?($stdout)
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ #
5
+ #
6
+
7
+ require 'protk/constants'
8
+ require 'protk/tool'
9
+ require 'bio'
10
+ require 'protk/fastadb'
11
+ require 'bio-blastxmlparser'
12
+
13
+
14
# Command-line handling for blastxml_to_table.rb: exactly one positional
# BLAST xml file, an optional explicit output file and an optional FASTA
# database used to look up hit sequences.
tool = Tool.new([:explicit_output])
tool.option_parser.banner = "Dump BLAST xml to tabular format.\n\nUsage: blastxml_to_table.rb blast.xml"

tool.options.database = nil
tool.option_parser.on('-d filename', '--database filename', 'Database used for BLAST search. If provided, hit sequences will be looked up in this database') { |file| tool.options.database = file }

exit unless tool.check_options

exit unless ARGV.length == 1
input_file = ARGV[0]

# Table rows go to stdout unless an explicit output file was requested.
out_file = tool.explicit_output.nil? ? $stdout : File.open(tool.explicit_output, "w")

# Global handle to the optional sequence database; nil when none was given.
$fastadb = tool.database ? FastaDB.new(tool.database) : nil
38
+
39
# Formats one HSP as a tab-separated, newline-terminated table row.
#
# Columns: query id, hit id, hit number, hit definition, HSP number,
# bit score, e-value, query sequence, hit sequence and, when +hit_seq+ is
# given, the full hit sequence as a final column.
def generate_line(hsp,hit,query,hit_seq=nil)
  columns = [
    query.query_id, hit.hit_id, hit.hit_num, hit.hit_def,
    hsp.hsp_num, hsp.bit_score, hsp.evalue, hsp.qseq, hsp.hseq
  ]
  columns << hit_seq if hit_seq
  # to_s per column mirrors what string interpolation would produce.
  columns.map(&:to_s).join("\t") << "\n"
end
47
+
48
# Looks up the sequence of +hit+ in the global $fastadb.
#
# Returns nil when no database is configured; otherwise the +aaseq+ of the
# first entry that $fastadb.fetch returns for the hit's id.
def fetch_hit_seq(hit)
  return nil unless $fastadb

  $fastadb.fetch(hit.hit_id).first.aaseq
end
55
+
56
# Main loop: write one table row per HSP of every hit of every query in the
# BLAST xml file. The hit sequence is fetched once per hit and reused for
# all of its HSPs.
blast = Bio::BlastXMLParser::XmlSplitterIterator.new(input_file).to_enum

blast.each do |query|
  query.hits.each do |hit|
    hit_seq = fetch_hit_seq(hit)
    hit.hsps.each do |hsp|
      out_file.write(generate_line(hsp, hit, query, hit_seq))
    end
  end
end

# Flush and release an explicit output file; stdout is left open.
out_file.close unless out_file.equal?($stdout)
69
+
70
+ #require 'debugger';debugger
71
+
72
+ #puts "Hi"
@@ -10,6 +10,7 @@ require 'protk/command_runner'
10
10
  require 'protk/tool'
11
11
  require 'protk/openms_defaults'
12
12
  require 'libxml'
13
+ require 'tempfile'
13
14
 
14
15
  include LibXML
15
16
 
@@ -62,7 +63,9 @@ end
62
63
 
63
64
  throw "Cannot use explicit output in combination with multiple input files" if ( tool.explicit_output && ARGV.length>1)
64
65
 
65
- ini_file="#{Pathname.new(ARGV[0]).dirname.realpath.to_s}/feature_finder.ini"
66
+ input_basename=Pathname.new(ARGV[0]).basename.to_s
67
+ ini_file_name="#{Pathname.new(Tempfile.new(input_basename).path).basename.to_s}_feature_finder.ini"
68
+ ini_file="#{Pathname.new(ini_file_name).dirname.realpath.to_s}/#{ini_file_name}"
66
69
 
67
70
  generate_ini(tool,ini_file)
68
71
 
@@ -74,6 +77,9 @@ ARGV.each do |filen|
74
77
  output_dir=Pathname.new(input_basename).dirname.realpath.to_s
75
78
  output_base=Pathname.new(input_basename).basename.to_s
76
79
  output_file = "#{output_dir}/#{tool.output_prefix}#{output_base}#{tool.output_suffix}.featureXML"
80
+ if ( tool.explicit_output )
81
+ output_file = "#{output_dir}/#{tool.explicit_output}"
82
+ end
77
83
 
78
84
  if ( tool.over_write || !Pathname.new(output_file).exist? )
79
85
  output_base_filename=Pathname.new(output_file).basename.to_s
data/bin/make_decoy.rb CHANGED
@@ -11,6 +11,8 @@ require 'libxml'
11
11
  require 'protk/constants'
12
12
  require 'protk/command_runner'
13
13
  require 'protk/tool'
14
+ require 'protk/randomize'
15
+ require 'tempfile'
14
16
  require 'bio'
15
17
 
16
18
  include LibXML
@@ -60,8 +62,14 @@ output_file = tool.explicit_output if tool.explicit_output!=nil
60
62
 
61
63
  genv=Constants.new()
62
64
 
63
- Randomize.make_decoys #{input_file} #{db_length} #{output_file} #{tool.prefix_string}"
64
- cmd << "cat #{input_file} >> #{output_file}" if ( tool.append )
65
+ decoys_tmp_file = Pathname.new(Tempfile.new("random").path).basename.to_s;
66
+
67
+ Randomize.make_decoys input_file, db_length, decoys_tmp_file, tool.prefix_string
68
+ cmd = "cat #{input_file} #{decoys_tmp_file} >> #{output_file}; rm #{decoys_tmp_file}" if ( tool.append )
69
+
70
+ # Randomize.make_decoys raw_db_filename, db_length, decoys_filename, decoy_prefix
71
+ # cmd = "cat #{raw_db_filename} #{decoys_filename} >> #{decoy_db_filename}; rm #{decoys_filename}"
72
+
65
73
  p cmd
66
74
  # Run the conversion
67
75
  #
data/bin/mascot_search.rb CHANGED
@@ -124,7 +124,7 @@ search_tool.option_parser.on('--username un', 'Username.') do |un|
124
124
  search_tool.options.username = un
125
125
  end
126
126
 
127
- search_tool.options.httpproxy=""
127
+ search_tool.options.httpproxy=nil
128
128
  search_tool.option_parser.on( '--proxy url', 'The url to a proxy server' ) do |urll|
129
129
  search_tool.options.httpproxy=urll
130
130
  end
@@ -144,6 +144,11 @@ search_tool.option_parser.on( '--export format', 'Save results in a specified fo
144
144
  search_tool.options.export_format=format
145
145
  end
146
146
 
147
+ search_tool.options.timeout=200
148
+ search_tool.option_parser.on( '--timeout seconds', 'Timeout for sending data file to mascot in seconds' ) do |seconds|
149
+ search_tool.options.timeout=seconds.to_i
150
+ end
151
+
147
152
  exit unless search_tool.check_options
148
153
 
149
154
  if ( ARGV[0].nil? )
@@ -161,8 +166,7 @@ unless ( mascot_cgi =~ /^http[s]?:\/\//)
161
166
  mascot_cgi = "http://#{mascot_cgi}"
162
167
  end
163
168
 
164
- RestClient.proxy=search_tool.httpproxy
165
-
169
+ RestClient.proxy=search_tool.httpproxy if search_tool.httpproxy
166
170
  $genv.log("Var mods #{search_tool.var_mods} and fixed #{search_tool.fix_mods}",:info)
167
171
 
168
172
  cookie=""
@@ -177,7 +181,13 @@ end
177
181
  postdict = search_params_dictionary search_tool, ARGV[0]
178
182
  $genv.log("Sending #{postdict}",:info)
179
183
 
180
- search_response=RestClient.post "#{mascot_cgi}/nph-mascot.exe?1", postdict, {:cookies=>cookie}
184
+ #site = RestClient::Resource.new(mascot_cgi, timeout=300)
185
+ #search_response=site['/nph-mascot.exe?1'].post , postdict, {:cookies=>cookie}
186
+
187
+ search_response=RestClient::Request.execute(:method => :post, :url => "#{mascot_cgi}/nph-mascot.exe?1", :payload => postdict,:headers=>{:cookies=>cookie},:timeout => search_tool.options.timeout, :open_timeout => 10)
188
+
189
+
190
+ #search_response=RestClient.post "#{mascot_cgi}/nph-mascot.exe?1", postdict, {:cookies=>cookie}
181
191
 
182
192
  $genv.log("Mascot search response was #{search_response}",:info)
183
193