protk 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,10 +15,36 @@ require 'protk/mascot_util'
15
15
  #
16
16
  genv=Constants.new
17
17
 
18
- tool=SearchTool.new({:database=>true,:explicit_output=>true,:over_write=>true})
18
+ tool=SearchTool.new([:database,:explicit_output,:over_write,:enzyme])
19
19
  tool.option_parser.banner = "Convert mascot dat files to pep.xml files.\n\nUsage: mascot_to_pepxml.rb [options] file1.dat file2.dat ... "
20
+
21
+ tool.options.enzyme="trypsin"
22
+
23
+ tool.options.shortid=false
24
+ tool.option_parser.on( '--shortid', 'Use short protein id as per Mascot result (default uses full protein ids in fasta file)' ) do
25
+ tool.options.shortid=true
26
+ end
27
+
20
28
  tool.option_parser.parse!
21
29
 
30
+ exit unless tool.check_options
31
+
32
+ if ( ARGV[0].nil? )
33
+ puts "You must supply an input file"
34
+ puts tool.option_parser
35
+ exit
36
+ end
37
+
38
+ current_db=""
39
+
40
+ case
41
+ when Pathname.new(tool.database).exist? # It's an explicitly named db
42
+ current_db=Pathname.new(tool.database).realpath.to_s
43
+ else
44
+ current_db=tool.current_database :fasta
45
+ end
46
+
47
+
22
48
 
23
49
  ARGV.each do |file_name|
24
50
  name=file_name.chomp
@@ -28,12 +54,15 @@ ARGV.each do |file_name|
28
54
  if ( tool.explicit_output==nil )
29
55
  new_basename="#{this_dir}/#{MascotUtil.input_basename(name)}_mascot2xml"
30
56
  cmd="cp #{name} #{new_basename}.dat"
31
- cmd << "; #{genv.mascot2xml} #{new_basename}.dat -D#{tool.current_database :fasta}"
57
+ cmd << "; #{genv.mascot2xml} #{new_basename}.dat -D#{current_db} -E#{tool.enzyme}"
32
58
 
59
+ cmd << " -shortid" if tool.shortid
60
+
33
61
  else #Mascot2XML doesn't support explicitly named output files so we move the file to an appropriate output filename after finishing
34
62
  new_basename="#{this_dir}/#{MascotUtil.input_basename(name)}_mascot2xml"
35
63
  cmd="cp #{name} #{new_basename}.dat"
36
- cmd << "; #{genv.mascot2xml} #{new_basename}.dat -D#{tool.current_database :fasta}"
64
+ cmd << "; #{genv.mascot2xml} #{new_basename}.dat -D#{current_db} -E#{tool.enzyme}"
65
+ cmd << " -shortid" if tool.shortid
37
66
  cmd << "; mv #{new_basename}.pep.xml #{tool.explicit_output}; rm #{new_basename}.dat"
38
67
  repair_script="#{File.dirname(__FILE__)}/repair_run_summary.rb"
39
68
  cmd << "; #{repair_script} #{tool.explicit_output}"
@@ -17,17 +17,32 @@ input_stager = nil
17
17
 
18
18
  # Setup specific command-line options for this tool. Other options are inherited from SearchTool
19
19
  #
20
- search_tool=SearchTool.new({:msms_search=>true,:background=>false,:glyco=>false,:database=>true,:explicit_output=>true,:over_write=>true,:msms_search_detailed_options=>true})
20
+ search_tool=SearchTool.new([:database,:explicit_output,:over_write,:enzyme,
21
+ :modifications,:instrument,:mass_tolerance_units,:mass_tolerance,:missed_cleavages])
22
+
21
23
  search_tool.option_parser.banner = "Run an MSGFPlus msms search on a set of msms spectrum input files.\n\nUsage: msgfplus_search.rb [options] file1.mzML file2.mzML ..."
22
24
  search_tool.options.output_suffix="_msgfplus"
23
25
 
26
+ search_tool.options.enzyme=1
27
+ search_tool.options.instrument=0
28
+
29
+ search_tool.options.no_pepxml=false
30
+ search_tool.option_parser.on( '--no-pepxml', 'Dont convert results to pepxml. Keep native mzidentml format' ) do
31
+ search_tool.options.no_pepxml=true
32
+ end
33
+
34
+ search_tool.options.isotope_error_range="0,1"
35
+ search_tool.option_parser.on( '--isotope-error-range range', 'Takes into account of the error introduced by chooosing a non-monoisotopic peak for fragmentation.(Default 0,1)' ) do |range|
36
+ search_tool.options.isotope_error_range=range
37
+ end
38
+
24
39
  search_tool.options.fragment_method=0
25
40
  search_tool.option_parser.on( '--fragment-method method', 'Fragment method 0: As written in the spectrum or CID if no info (Default), 1: CID, 2: ETD, 3: HCD, 4: Merge spectra from the same precursor' ) do |method|
26
41
  search_tool.options.fragment_method=method
27
42
  end
28
43
 
29
44
  search_tool.options.protocol=0
30
- search_tool.option_parser.on( '--protocol p', '0: NoProtocol (Default), 1: Phosphorylation' ) do |p|
45
+ search_tool.option_parser.on( '--protocol p', '0: NoProtocol (Default), 1: Phosphorylation, 2: iTRAQ, 3: iTRAQPhospho' ) do |p|
31
46
  search_tool.options.protocol=p
32
47
  end
33
48
 
@@ -61,12 +76,23 @@ search_tool.option_parser.on( '--add-features', 'output additional features' )
61
76
  search_tool.options.add_features=true
62
77
  end
63
78
 
79
+ search_tool.options.num_threads=nil
80
+ search_tool.option_parser.on('--threads NumThreads','Number of processing threads to use') do |nt|
81
+ search_tool.options.num_threads=nt
82
+ end
83
+
64
84
  search_tool.options.java_mem="3500M"
65
85
  search_tool.option_parser.on('--java-mem mem','Java memory limit when running the search (Default 3.5Gb)') do |mem|
66
86
  search_tool.options.java_mem=mem
67
87
  end
68
88
 
69
- search_tool.option_parser.parse!
89
+ exit unless search_tool.check_options
90
+
91
+ if ( ARGV[0].nil? )
92
+ puts "You must supply an input file"
93
+ puts search_tool.option_parser
94
+ exit
95
+ end
70
96
 
71
97
  # Environment with global constants
72
98
  #
@@ -149,17 +175,33 @@ ARGV.each do |filename|
149
175
  # Instrument type
150
176
  cmd << " -inst #{search_tool.instrument}"
151
177
 
152
- # cmd << " -m 4"
178
+ cmd << " -m #{search_tool.fragment_method}"
153
179
 
154
180
  cmd << " -addFeatures 1"
155
181
 
182
+ cmd << " -protocol #{search_tool.protocol}"
183
+
184
+ cmd << " -minLength #{search_tool.min_pep_length}"
185
+
186
+ cmd << " -maxLength #{search_tool.max_pep_length}"
187
+
188
+ cmd << " -minCharge #{search_tool.min_pep_charge}"
189
+
190
+ cmd << " -maxCharge #{search_tool.max_pep_charge}"
191
+
192
+ cmd << " -ti #{search_tool.isotope_error_range}"
193
+
194
+ cmd << " -n #{search_tool.num_reported_matches}"
195
+
156
196
  # Enzyme
157
197
  #
158
- # if ( search_tool.enzyme!="Trypsin")
159
- # cmd << " -e #{search_tool.enzyme}"
160
- # end
198
+ cmd << " -e #{search_tool.enzyme}"
199
+
200
+ # Num Threads
201
+ #
202
+ cmd << " -thread #{search_tool.num_threads}" if search_tool.num_threads
161
203
 
162
- mods_file_content = ""
204
+ mods_file_content = ""
163
205
 
164
206
  # Variable Modifications
165
207
  #
@@ -188,10 +230,14 @@ ARGV.each do |filename|
188
230
  end
189
231
 
190
232
  # As a final part of the command we convert to pepxml
191
- cmd << "; #{genv.idconvert} #{mzid_output_path} --pepXML -o #{Pathname.new(mzid_output_path).dirname}"
192
-
193
- #Then copy the pepxml to the final output path
194
- cmd << "; cp #{mzid_output_path.chomp('.mzid')}.pepXML #{output_path}"
233
+ if search_tool.no_pepxml
234
+ cmd << "; #{genv.idconvert} #{mzid_output_path} --pepXML -o #{Pathname.new(mzid_output_path).dirname}"
235
+ #Then copy the pepxml to the final output path
236
+ cmd << "; cp #{mzid_output_path.chomp('.mzid')}.pepXML #{output_path}"
237
+ elsif search_tool.explicit_output
238
+ cmd << "; cp #{mzid_output_path} #{output_path}"
239
+ end
240
+
195
241
 
196
242
  # Up to here we've formulated the command. The rest is cleanup
197
243
  p "Running:#{cmd}"
@@ -16,7 +16,12 @@ for_galaxy = GalaxyUtil.for_galaxy?
16
16
 
17
17
  # Setup specific command-line options for this tool. Other options are inherited from SearchTool
18
18
  #
19
- search_tool=SearchTool.new({:msms_search=>true,:background=>false,:glyco=>true,:database=>true,:explicit_output=>true,:over_write=>true,:msms_search_detailed_options=>true})
19
+ search_tool=SearchTool.new([:database,:explicit_output,:over_write,:enzyme,
20
+ :modifications,:instrument,:mass_tolerance_units,:mass_tolerance,:missed_cleavages,
21
+ :precursor_search_type,:respect_precursor_charges,:num_peaks_for_multi_isotope_search,:searched_ions
22
+ ])
23
+
24
+
20
25
  search_tool.option_parser.banner = "Run an OMSSA msms search on a set of mgf input files.\n\nUsage: omssa_search.rb [options] file1.mgf file2.mgf ..."
21
26
  search_tool.options.output_suffix="_omssa"
22
27
 
@@ -54,7 +59,13 @@ search_tool.option_parser.on( '--nthreads num', 'Number of search threads to use
54
59
  search_tool.options.nthreads=num
55
60
  end
56
61
 
57
- search_tool.option_parser.parse!
62
+ exit unless search_tool.check_options
63
+
64
+ if ( ARGV[0].nil? )
65
+ puts "You must supply an input file"
66
+ puts search_tool.option_parser
67
+ exit
68
+ end
58
69
 
59
70
  # Environment with global constants
60
71
  #
@@ -13,7 +13,7 @@ require 'protk/prophet_tool'
13
13
 
14
14
  # Setup specific command-line options for this tool. Other options are inherited from ProphetTool
15
15
  #
16
- prophet_tool=ProphetTool.new({:glyco=>true,:explicit_output=>true,:maldi=>true})
16
+ prophet_tool=ProphetTool.new([:glyco,:explicit_output,:maldi])
17
17
  prophet_tool.option_parser.banner = "Run PeptideProphet on a set of pep.xml input files.\n\nUsage: peptide_prophet.rb [options] file1.pep.xml file2.pep.xml ..."
18
18
  prophet_tool.options.output_suffix="_pproph"
19
19
 
@@ -92,7 +92,13 @@ prophet_tool.option_parser.on( '--override-database database', 'Manually specify
92
92
  prophet_tool.options.override_database = database
93
93
  end
94
94
 
95
- prophet_tool.option_parser.parse!
95
+ exit unless prophet_tool.check_options
96
+
97
+ if ( ARGV[0].nil? )
98
+ puts "You must supply an input file"
99
+ puts prophet_tool.option_parser
100
+ exit
101
+ end
96
102
 
97
103
  throw "When --output and -F options are set only one file at a time can be run" if ( ARGV.length> 1 ) && ( prophet_tool.explicit_output!=nil ) && (prophet_tool.one_ata_time!=nil)
98
104
 
@@ -16,10 +16,16 @@ include LibXML
16
16
 
17
17
  # Setup specific command-line options for this tool. Other options are inherited from ProphetTool
18
18
  #
19
- tool=Tool.new({:explicit_output=>true})
19
+ tool=Tool.new([:explicit_output])
20
20
  tool.option_parser.banner = "Convert a pepXML file to a tab delimited table.\n\nUsage: pepxml_to_table.rb [options] file1.pep.xml"
21
21
 
22
- tool.option_parser.parse!
22
+ exit unless tool.check_options
23
+
24
+ if ( ARGV[0].nil? )
25
+ puts "You must supply an input file"
26
+ puts tool.option_parser
27
+ exit
28
+ end
23
29
 
24
30
  # Obtain a global environment object
25
31
  #genv=Constants.new
@@ -26,7 +26,7 @@ end
26
26
 
27
27
  # Setup specific command-line options for this tool. Other options are inherited from ProphetTool
28
28
  #
29
- prophet_tool=ProphetTool.new({:glyco=>true,:explicit_output=>true})
29
+ prophet_tool=ProphetTool.new([:glyco,:explicit_output])
30
30
  prophet_tool.option_parser.banner = "Run ProteinProphet on a set of pep.xml input files.\n\nUsage: protein_prophet.rb [options] file1.pep.xml file2.pep.xml ..."
31
31
  prophet_tool.options.output_suffix="_protproph"
32
32
 
@@ -90,7 +90,13 @@ prophet_tool.option_parser.on( '--minindep mp',"Minimum percentage of independen
90
90
  prophet_tool.options.minindep = mp
91
91
  end
92
92
 
93
- prophet_tool.option_parser.parse!
93
+ exit unless prophet_tool.check_options
94
+
95
+ if ( ARGV[0].nil? )
96
+ puts "You must supply an input file"
97
+ puts prophet_tool.option_parser
98
+ exit
99
+ end
94
100
 
95
101
 
96
102
  # Obtain a global environment object
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 18/1/2011
5
+ #
6
+ # Convert a protXML file to a tab delimited table
7
+ #
8
+ #
9
+
10
+ require 'libxml'
11
+ require 'protk/constants'
12
+ require 'protk/command_runner'
13
+ require 'protk/tool'
14
+
15
+ include LibXML
16
+
17
+ # Setup specific command-line options for this tool. Other options are inherited from Tool
18
+ #
19
+ tool=Tool.new([:explicit_output])
20
+ tool.option_parser.banner = "Convert a protXML file to a tab delimited table.\n\nUsage: protxml_to_table.rb [options] file1.protXML"
21
+
22
+ exit unless tool.check_options
23
+
24
+ if ( ARGV[0].nil? )
25
+ puts "You must supply an input file"
26
+ puts tool.option_parser
27
+ exit
28
+ end
29
+
30
+ input_file=ARGV[0]
31
+
32
+ output_file = tool.explicit_output!=nil ? tool.explicit_output : nil
33
+
34
+ output_fh = output_file!=nil ? File.new("#{output_file}",'w') : $stdout
35
+
36
+
37
+ XML::Error.set_handler(&XML::Error::QUIET_HANDLER)
38
+
39
+ protxml_parser=XML::Parser.file("#{input_file}")
40
+
41
+ protxml_ns_prefix="xmlns:"
42
+ protxml_ns="xmlns:http://regis-web.systemsbiology.net/protXML"
43
+ protxml_doc=protxml_parser.parse
44
+ if not protxml_doc.root.namespaces.default
45
+ protxml_ns_prefix=""
46
+ protxml_ns=nil
47
+ end
48
+
49
+
50
+ column_headers=[
51
+ "group_number","group_probability","protein_name",
52
+ "protein_probability","coverage","peptides",
53
+ "num_peptides","confidence"
54
+ ]
55
+
56
+ output_fh.write "#{column_headers.join("\t")}\n"
57
+
58
+
59
+ protein_groups=protxml_doc.find("//#{protxml_ns_prefix}protein_group", protxml_ns)
60
+
61
+ protein_groups.each do |protein_group|
62
+
63
+ proteins=protein_group.find("./#{protxml_ns_prefix}protein", protxml_ns)
64
+
65
+ proteins.each do |protein|
66
+ column_values=[]
67
+
68
+ column_values << protein_group.attributes['group_number']
69
+ column_values << protein_group.attributes['probability']
70
+
71
+ column_values << protein.attributes['protein_name']
72
+ column_values << protein.attributes['probability']
73
+ column_values << protein.attributes['percent_coverage']
74
+ column_values << protein.attributes['unique_stripped_peptides']
75
+ column_values << protein.attributes['total_number_peptides']
76
+ column_values << protein.attributes['confidence']
77
+ output_fh.write(column_values.join("\t"))
78
+ output_fh.write("\n")
79
+
80
+ end
81
+ end
82
+
@@ -40,7 +40,13 @@ tool.option_parser.on('--omssa-itol fitol','Add a fragment ion tolerance paramet
40
40
  tool.options.omssa_ion_tolerance=fitol
41
41
  end
42
42
 
43
- tool.option_parser.parse!
43
+ exit unless tool.check_options
44
+
45
+ if ( ARGV[0].nil? )
46
+ puts "You must supply an input file"
47
+ puts tool.option_parser
48
+ exit
49
+ end
44
50
 
45
51
  pepxml_file=ARGV[0]
46
52
 
@@ -10,10 +10,34 @@ require 'protk/constants'
10
10
  require 'protk/tool'
11
11
  require 'bio'
12
12
 
13
- tool=Tool.new(:explicit_output=>true)
13
+ def check_coords(naseq,aaseq,frame,pstart,pend) # Debug aid: checks that the slice pstart..pend of naseq (indexed as 1-based, inclusive) translates to aaseq
14
+ orf_from_coords=""
15
+ if ( frame<=3) # frames up to 3: translate the slice as-is
16
+ orf_from_coords=naseq[pstart-1..pend-1].translate(1)
17
+ else # any other frame: reverse-complement the slice before translating
18
+ orf_from_coords=naseq[pstart-1..pend-1].reverse_complement.translate(1)
19
+ # current coords give
20
+ # naseq.reverse_complement[pstart-1..pend-1].translate(1)
21
+ # naseq[350368-pend..(350367-pstart+1)].reverse_complement.translate(1)
22
+ # orf_from_coords=naseq[naseq.length-pend..naseq.length-pstart].reverse_complement.translate(1)
23
+ end
24
+ if ( orf_from_coords!=aaseq) # translation from the coordinates does not match the expected sequence
25
+ require 'debugger'; debugger
26
+ end
27
+ # p "#{aaseq} #{frame}"
28
+ end
29
+
30
+
31
+ tool=Tool.new([:explicit_output])
14
32
  tool.option_parser.banner = "Create a sixframe translation of a genome.\n\nUsage: sixframe.rb [options] genome.fasta"
15
33
 
16
- tool.option_parser.parse!
34
+ exit unless tool.check_options
35
+
36
+ if ( ARGV[0].nil? )
37
+ puts "You must supply an input file"
38
+ puts tool.option_parser
39
+ exit
40
+ end
17
41
 
18
42
  inname=ARGV.shift
19
43
 
@@ -26,7 +50,11 @@ end
26
50
  file = Bio::FastaFormat.open(inname)
27
51
 
28
52
  file.each do |entry|
53
+
54
+ puts entry.entry_id
55
+
29
56
  length = entry.naseq.length
57
+
30
58
  (1...7).each do |frame|
31
59
  translated_seq= entry.naseq.translate(frame)
32
60
  orfs=translated_seq.split("*")
@@ -37,15 +65,30 @@ file.each do |entry|
37
65
  orfs.each do |orf|
38
66
  oi+=1
39
67
  if ( orf.length > 20 )
68
+
40
69
  position_start = position
41
70
  position_end = position_start + orf.length*3 -1
42
71
 
72
+ if ( frame>3) #On reverse strand. Coordinates need translating to forward strand
73
+ forward_position_start=length-position_end+1
74
+ forward_position_end = length-position_start+1
75
+ position_start=forward_position_start
76
+ position_end=forward_position_end
77
+ end
78
+
79
+
80
+
81
+
43
82
  # Create accession compliant with NCBI naming standard
44
83
  # See http://www.ncbi.nlm.nih.gov/books/NBK7183/?rendertype=table&id=ch_demo.T5
45
84
  ncbi_scaffold_id = entry.entry_id.gsub('|','_').gsub(' ','_')
46
85
  ncbi_accession = "lcl|#{ncbi_scaffold_id}_frame_#{frame}_orf_#{oi}"
47
86
 
87
+ # check_coords(entry.naseq,orf,frame,position_start,position_end)
88
+
48
89
  # Output in fasta format
90
+ # start and end positions are always relative to the forward strand
91
+
49
92
  outfile.write(">#{ncbi_accession} #{position_start}|#{position_end}\n#{orf}\n")
50
93
 
51
94
  end
@@ -54,3 +97,6 @@ file.each do |entry|
54
97
 
55
98
  end
56
99
  end
100
+
101
+
102
+