protk 1.2.6.pre5 → 1.3.0.pre1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +84 -45
- data/bin/add_retention_times.rb +9 -5
- data/bin/augustus_to_proteindb.rb +7 -11
- data/bin/interprophet.rb +28 -46
- data/bin/make_decoy.rb +16 -48
- data/bin/mascot_search.rb +57 -71
- data/bin/mascot_to_pepxml.rb +13 -26
- data/bin/msgfplus_search.rb +70 -107
- data/bin/omssa_search.rb +52 -109
- data/bin/peptide_prophet.rb +44 -119
- data/bin/pepxml_to_table.rb +24 -27
- data/bin/protein_prophet.rb +22 -82
- data/bin/protxml_to_gff.rb +22 -519
- data/bin/protxml_to_table.rb +2 -16
- data/bin/sixframe.rb +10 -32
- data/bin/tandem_search.rb +30 -403
- data/bin/tandem_to_pepxml.rb +43 -0
- data/bin/unimod_to_loc.rb +1 -1
- data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
- data/ext/decoymaker/extconf.rb +3 -0
- data/lib/protk/constants.rb +16 -2
- data/lib/protk/data/default_config.yml +2 -1
- data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
- data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
- data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
- data/lib/protk/data/tandem_params.xml +17 -54
- data/lib/protk/fastadb.rb +2 -2
- data/lib/protk/prophet_tool.rb +1 -1
- data/lib/protk/protxml_to_gff_tool.rb +474 -0
- data/lib/protk/search_tool.rb +58 -103
- data/lib/protk/setup_rakefile.rake +9 -5
- data/lib/protk/tandem_search_tool.rb +256 -0
- data/lib/protk/tool.rb +85 -104
- data/lib/protk.rb +1 -6
- metadata +24 -103
- data/bin/annotate_ids.rb +0 -59
- data/bin/asapratio.rb +0 -27
- data/bin/blastxml_to_table.rb +0 -119
- data/bin/correct_omssa_retention_times.rb +0 -27
- data/bin/feature_finder.rb +0 -95
- data/bin/file_convert.rb +0 -164
- data/bin/generate_omssa_loc.rb +0 -42
- data/bin/gffmerge.rb +0 -208
- data/bin/libra.rb +0 -70
- data/bin/toppas_pipeline.rb +0 -84
- data/bin/uniprot_annotation.rb +0 -141
- data/bin/xls_to_table.rb +0 -52
- data/bin/xpress.rb +0 -27
- data/ext/protk/decoymaker/extconf.rb +0 -3
- data/ext/protk/simplealign/extconf.rb +0 -3
- data/lib/protk/biotools_excel_converter.rb +0 -60
- data/lib/protk/eupathdb_gene_information_table.rb +0 -158
- data/lib/protk/gapped_aligner.rb +0 -264
- data/lib/protk/protein_annotator.rb +0 -646
- data/lib/protk/spreadsheet_extensions.rb +0 -79
- data/lib/protk/xtandem_defaults.rb +0 -11
data/bin/blastxml_to_table.rb
DELETED
@@ -1,119 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# This file is part of protk
|
4
|
-
#
|
5
|
-
#
|
6
|
-
|
7
|
-
require 'protk/constants'
|
8
|
-
require 'protk/tool'
|
9
|
-
require 'bio'
|
10
|
-
require 'protk/fastadb'
|
11
|
-
require 'bio-blastxmlparser'
|
12
|
-
|
13
|
-
tool=Tool.new([:explicit_output])
|
14
|
-
tool.option_parser.banner = "Dump BLAST xml to tabular format.\n\nUsage: blastxml_to_table.rb blast.xml"
|
15
|
-
|
16
|
-
tool.options.database=nil
|
17
|
-
tool.option_parser.on( '-d filename','--database filename', 'Database used for BLAST search. If provided, hit sequences will be looked up in this database' ) do |file|
|
18
|
-
tool.options.database=file
|
19
|
-
end
|
20
|
-
|
21
|
-
tool.options.gene2go=nil
|
22
|
-
tool.option_parser.on('--gene2go pathtogene2go','Path to gene2go database. If provided GO terms will be looked up') do |gene2go|
|
23
|
-
tool.options.gene2go=gene2go
|
24
|
-
end
|
25
|
-
|
26
|
-
tool.options.gitogeneid=nil
|
27
|
-
tool.option_parser.on('--gitogeneid gitogeneid.db','Path to GDBM formatted gi to geneid mapping database. If provided gene ids will be looked up') do |gitogeneid|
|
28
|
-
tool.options.gitogeneid=gitogeneid
|
29
|
-
end
|
30
|
-
|
31
|
-
exit unless tool.check_options
|
32
|
-
|
33
|
-
#require 'debugger';debugger
|
34
|
-
|
35
|
-
exit unless ARGV.length == 1
|
36
|
-
input_file=ARGV[0]
|
37
|
-
|
38
|
-
out_file=$stdout
|
39
|
-
if ( tool.explicit_output != nil)
|
40
|
-
out_file=File.open(tool.explicit_output, "w")
|
41
|
-
end
|
42
|
-
|
43
|
-
$fastadb = nil
|
44
|
-
if tool.database
|
45
|
-
$fastadb=FastaDB.new(tool.database)
|
46
|
-
end
|
47
|
-
|
48
|
-
$gitogeneid = nil
|
49
|
-
if (tool.gitogeneid!=nil) && (File.exist? tool.gitogeneid)
|
50
|
-
require 'gdbm'
|
51
|
-
$gitogeneid = GDBM.new(tool.gitogeneid,flags=GDBM::READER)
|
52
|
-
end
|
53
|
-
|
54
|
-
|
55
|
-
$gene2go = nil
|
56
|
-
if (tool.gene2go!=nil) && (File.exist? tool.gene2go)
|
57
|
-
require 'gdbm'
|
58
|
-
$gene2go = GDBM.new(tool.gene2go,flags=GDBM::READER)
|
59
|
-
end
|
60
|
-
|
61
|
-
def gi_from_hit_id(hit_id)
|
62
|
-
gi_scan=hit_id.scan(/gi\|(\d+)/)
|
63
|
-
gi_scan.join("")
|
64
|
-
end
|
65
|
-
|
66
|
-
def generate_line(hsp,hit,query,hit_seq=nil)
|
67
|
-
|
68
|
-
line="#{query.query_id}\t#{query.query_def}\t#{hit.hit_id}\t#{hit.hit_num}\t#{hit.hit_def}\t#{hit.accession}\t#{hsp.hsp_num}\t#{hsp.bit_score}\t#{hsp.evalue}\t#{hsp.qseq}\t#{hsp.hseq}"
|
69
|
-
if hit_seq
|
70
|
-
line << "\t#{hit_seq}"
|
71
|
-
end
|
72
|
-
geneid=""
|
73
|
-
goterm=""
|
74
|
-
if $gitogeneid
|
75
|
-
geneid=$gitogeneid[gi_from_hit_id(hit.hit_id)]
|
76
|
-
goterm=$gene2go[geneid] if geneid!=nil && $gene2go
|
77
|
-
end
|
78
|
-
|
79
|
-
|
80
|
-
# throw "No geneid" if geneid==nil
|
81
|
-
line << "\t#{geneid}\t#{goterm}"
|
82
|
-
# require 'debugger';debugger
|
83
|
-
# puts gi_from_hit_id(hit.hit_id)
|
84
|
-
# puts $gene2go[gi_from_hit_id(hit.hit_id)]
|
85
|
-
line<<"\n"
|
86
|
-
line
|
87
|
-
end
|
88
|
-
|
89
|
-
def fetch_hit_seq(hit)
|
90
|
-
hit_seq=nil
|
91
|
-
if $fastadb
|
92
|
-
hit_seq=$fastadb.fetch(hit.hit_id).first.aaseq
|
93
|
-
end
|
94
|
-
hit_seq
|
95
|
-
end
|
96
|
-
|
97
|
-
blast = Bio::BlastXMLParser::XmlSplitterIterator.new(input_file).to_enum
|
98
|
-
|
99
|
-
blast.each do |query|
|
100
|
-
query.hits.each do |hit|
|
101
|
-
# hit=query.hits.first
|
102
|
-
# if hit
|
103
|
-
hit_seq=fetch_hit_seq(hit)
|
104
|
-
hit.hsps.each do |hsp|
|
105
|
-
out_line=generate_line(hsp,hit,query,hit_seq)
|
106
|
-
|
107
|
-
out_file.write out_line
|
108
|
-
end
|
109
|
-
# end
|
110
|
-
end
|
111
|
-
end
|
112
|
-
|
113
|
-
|
114
|
-
$gitogeneid.close if $gitogeneid!=nil
|
115
|
-
$gene2go.close if $gene2go!=nil
|
116
|
-
|
117
|
-
#require 'debugger';debugger
|
118
|
-
|
119
|
-
#puts "Hi"
|
@@ -1,27 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# This file is part of protk
|
4
|
-
# Created by Ira Cooke 14/12/2010
|
5
|
-
#
|
6
|
-
# Corrects retention times in omssa output
|
7
|
-
#
|
8
|
-
|
9
|
-
$VERBOSE=nil
|
10
|
-
|
11
|
-
require 'protk/constants'
|
12
|
-
require 'protk/command_runner'
|
13
|
-
require 'protk/tool'
|
14
|
-
require 'protk/omssa_util'
|
15
|
-
|
16
|
-
# Environment with global constants
|
17
|
-
#
|
18
|
-
genv=Constants.new
|
19
|
-
|
20
|
-
tool=Tool.new([:over_write])
|
21
|
-
tool.option_parser.banner = "Correct retention times on a pepxml file produced by omssa using information from an mgf file.\n\nUsage: correct_omssa_retention_times.rb [options] file1.pep.xml file2.mgf"
|
22
|
-
tool.option_parser.parse!
|
23
|
-
|
24
|
-
|
25
|
-
OMSSAUtil.add_retention_times(ARGV[1],ARGV[0],tool.over_write,true)
|
26
|
-
|
27
|
-
|
data/bin/feature_finder.rb
DELETED
@@ -1,95 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# This file is part of protk
|
4
|
-
# Created by Ira Cooke 21/3/2012
|
5
|
-
#
|
6
|
-
# A wrapper for the OpenMS FeatureFinder tools (FeatureFinderCentroided and FeatureFinderIsotopeWavelet)
|
7
|
-
|
8
|
-
require 'protk/constants'
|
9
|
-
require 'protk/command_runner'
|
10
|
-
require 'protk/tool'
|
11
|
-
require 'protk/openms_defaults'
|
12
|
-
require 'libxml'
|
13
|
-
require 'tempfile'
|
14
|
-
|
15
|
-
include LibXML
|
16
|
-
|
17
|
-
tool=Tool.new([:explicit_output, :background,:over_write,:prefix_suffix])
|
18
|
-
tool.option_parser.banner = "Find molecular features on a set of input files.\n\nUsage: feature_finder.rb [options] file1.mzML file2.mzML ..."
|
19
|
-
|
20
|
-
tool.options.intensity_type = "ref"
|
21
|
-
tool.option_parser.on( '--intensity-type type',"method used to calculate intensities (ref,trans,corrected). Default = ref. See OpenMS documentation for details" ) do |type|
|
22
|
-
tool.options.intensity_type = type
|
23
|
-
end
|
24
|
-
|
25
|
-
tool.options.intensity_threshold = "3"
|
26
|
-
tool.option_parser.on( '--intensity-threshold thresh',"discard features below this intensity (Default=3). Set to -1 to retain all detected features" ) do |thresh|
|
27
|
-
tool.options.intensity_threshold = thresh
|
28
|
-
end
|
29
|
-
|
30
|
-
|
31
|
-
exit unless tool.check_options
|
32
|
-
|
33
|
-
if ( ARGV[0].nil? )
|
34
|
-
puts "You must supply an input file"
|
35
|
-
puts tool.option_parser
|
36
|
-
exit
|
37
|
-
end
|
38
|
-
|
39
|
-
# Obtain a global environment object
|
40
|
-
genv=Constants.new
|
41
|
-
|
42
|
-
def run_ff(genv,tool,cmd,output_path,jobid)
|
43
|
-
if ( !tool.over_write && Pathname.new(output_path).exist? )
|
44
|
-
genv.log("Skipping analysis on existing file #{output_path}",:warn)
|
45
|
-
else
|
46
|
-
jobscript_path="#{output_path}.pbs.sh"
|
47
|
-
job_params={:jobid=>jobid, :vmem=>"14Gb", :queue => "sixteen"}
|
48
|
-
code=tool.run(cmd,genv,job_params,jobscript_path)
|
49
|
-
throw "Command failed with exit code #{code}" unless code==0
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
def generate_ini(tool,out_path)
|
54
|
-
base_ini_file=OpenMSDefaults.new.featurefinderisotopewavelet
|
55
|
-
parser = XML::Parser.file(base_ini_file)
|
56
|
-
doc = parser.parse
|
57
|
-
intensity_threshold_node = doc.find('//ITEM[@name="intensity_threshold"]')[0]
|
58
|
-
intensity_type_node = doc.find('//ITEM[@name="intensity_type"]')[0]
|
59
|
-
intensity_threshold_node['value']=tool.intensity_threshold
|
60
|
-
intensity_type_node['value']=tool.intensity_type
|
61
|
-
doc.save(out_path)
|
62
|
-
end
|
63
|
-
|
64
|
-
throw "Cannot use explicit output in combination with multiple input files" if ( tool.explicit_output && ARGV.length>1)
|
65
|
-
|
66
|
-
input_basename=Pathname.new(ARGV[0]).basename.to_s
|
67
|
-
ini_file_name="#{Pathname.new(Tempfile.new(input_basename).path).basename.to_s}_feature_finder.ini"
|
68
|
-
ini_file="#{Pathname.new(ini_file_name).dirname.realpath.to_s}/#{ini_file_name}"
|
69
|
-
|
70
|
-
generate_ini(tool,ini_file)
|
71
|
-
|
72
|
-
ARGV.each do |filen|
|
73
|
-
input_file=filen.chomp
|
74
|
-
throw "Input must be an mzML file" unless input_file=~/\.mzML$/
|
75
|
-
|
76
|
-
input_basename=input_file.gsub(/\.mzML$/,'')
|
77
|
-
output_dir=Pathname.new(input_basename).dirname.realpath.to_s
|
78
|
-
output_base=Pathname.new(input_basename).basename.to_s
|
79
|
-
output_file = "#{output_dir}/#{tool.output_prefix}#{output_base}#{tool.output_suffix}.featureXML"
|
80
|
-
if ( tool.explicit_output )
|
81
|
-
output_file = "#{output_dir}/#{tool.explicit_output}"
|
82
|
-
end
|
83
|
-
|
84
|
-
if ( tool.over_write || !Pathname.new(output_file).exist? )
|
85
|
-
output_base_filename=Pathname.new(output_file).basename.to_s
|
86
|
-
cmd=""
|
87
|
-
cmd<<"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:#{genv.openms_root}/lib;
|
88
|
-
#{genv.featurefinderisotopewavelet} -in #{Pathname.new(input_file).realpath.to_s} -out #{output_dir}/#{output_base_filename} -ini #{ini_file}"
|
89
|
-
|
90
|
-
run_ff(genv,tool,cmd,output_file,tool.jobid_from_filename(input_basename))
|
91
|
-
|
92
|
-
else
|
93
|
-
genv.log("Skipping search on existing file #{output_file}",:warn)
|
94
|
-
end
|
95
|
-
end
|
data/bin/file_convert.rb
DELETED
@@ -1,164 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# This file is part of protk
|
4
|
-
# Created by Ira Cooke 14/12/2010
|
5
|
-
#
|
6
|
-
# Wrapper for msconvert
|
7
|
-
#
|
8
|
-
|
9
|
-
require 'protk/constants'
|
10
|
-
require 'protk/command_runner'
|
11
|
-
require 'protk/tool'
|
12
|
-
require 'tempfile'
|
13
|
-
require 'libxml'
|
14
|
-
|
15
|
-
include LibXML
|
16
|
-
|
17
|
-
# Regex for cleaning mgf sed -i.bak 's/\(PEPMASS=[0-9]*.[0-9]*\)[ \t]*[0-9]*/\1/g'
|
18
|
-
|
19
|
-
# Read the input file and search for an instance of the charge state cvParam inside a precursor tag. Return true if one is found. False otherwise
|
20
|
-
#
|
21
|
-
def has_charge_information(input_filename)
|
22
|
-
#<precursorList count="1">
|
23
|
-
# <precursor spectrumRef="controllerType=0 controllerNumber=1 scan=59">
|
24
|
-
# <isolationWindow>
|
25
|
-
# <cvParam cvRef="MS" accession="MS:1000827" name="isolation window target m/z" value="939.43" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
|
26
|
-
# <cvParam cvRef="MS" accession="MS:1000828" name="isolation window lower offset" value="2.0" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
|
27
|
-
# <cvParam cvRef="MS" accession="MS:1000829" name="isolation window upper offset" value="2.0" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
|
28
|
-
# </isolationWindow>
|
29
|
-
# <selectedIonList count="1">
|
30
|
-
# <selectedIon>
|
31
|
-
# <cvParam cvRef="MS" accession="MS:1000744" name="selected ion m/z" value="939.432189941406" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
|
32
|
-
# <cvParam cvRef="MS" accession="MS:1000041" name="charge state" value="2"/>
|
33
|
-
# <cvParam cvRef="MS" accession="MS:1000042" name="peak intensity" value="1321.692016601563" unitCvRef="MS" unitAccession="MS:1000131" unitName="number of counts"/>
|
34
|
-
# </selectedIon>
|
35
|
-
# </selectedIonList>
|
36
|
-
|
37
|
-
reader=XML::Reader.file(input_filename)
|
38
|
-
|
39
|
-
while(reader.read)
|
40
|
-
|
41
|
-
if ( reader.local_name=="precursor")
|
42
|
-
|
43
|
-
subdoc=reader.read_inner_xml
|
44
|
-
|
45
|
-
if ( subdoc =~ /MS:1000041/ )
|
46
|
-
return true
|
47
|
-
end
|
48
|
-
|
49
|
-
end
|
50
|
-
|
51
|
-
end
|
52
|
-
|
53
|
-
return false
|
54
|
-
|
55
|
-
end
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
# Setup specific command-line options for this tool. Other options are inherited from Tool
|
60
|
-
#
|
61
|
-
convert_tool=Tool.new([:explicit_output,:over_write,:maldi])
|
62
|
-
convert_tool.option_parser.banner = "Convert files between different formats.\n\nUsage: file_convert.rb [options] input_file output_file"
|
63
|
-
|
64
|
-
# Special case (usually tool specific options use capitals). Use lowercase l here to mimick maldi option in the search_tool class
|
65
|
-
#
|
66
|
-
convert_tool.options.maldi=false
|
67
|
-
convert_tool.option_parser.on( '-l', '--maldi', 'Input Files are MALDI Spectra' ) do
|
68
|
-
convert_tool.options.maldi=true
|
69
|
-
end
|
70
|
-
|
71
|
-
convert_tool.options.output_format="mgf"
|
72
|
-
convert_tool.option_parser.on( '-F', '--format fmt', 'Convert to a specified format' ) do |fmt|
|
73
|
-
convert_tool.options.output_format=fmt
|
74
|
-
end
|
75
|
-
|
76
|
-
#convert_tool.options.missing_charge_state="false"
|
77
|
-
#convert_tool.option_parser.on( '-C', '--missing-charges', 'No attempt will be made to write charge states. Leads to better looking spectrum names' ) do |fmt|
|
78
|
-
# convert_tool.options.output_format=fmt
|
79
|
-
#end
|
80
|
-
#end
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
exit unless convert_tool.check_options
|
85
|
-
|
86
|
-
if ( ARGV[0].nil? )
|
87
|
-
puts "You must supply an input file"
|
88
|
-
puts convert_tool.option_parser
|
89
|
-
exit
|
90
|
-
end
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
# Environment with global constants
|
95
|
-
#
|
96
|
-
genv=Constants.new
|
97
|
-
|
98
|
-
filename=ARGV[0]
|
99
|
-
|
100
|
-
|
101
|
-
input_ext=Pathname.new(filename).extname
|
102
|
-
input_relative_filename=Pathname.new(filename).basename.to_s
|
103
|
-
|
104
|
-
base_output_dir=Pathname.new(filename).dirname.realpath.to_s #Default output dir is input dir
|
105
|
-
|
106
|
-
output_basename=input_relative_filename.gsub(/#{input_ext}$/,"").to_s
|
107
|
-
|
108
|
-
if ( convert_tool.explicit_output )
|
109
|
-
output_filepath=Pathname.new(convert_tool.explicit_output)
|
110
|
-
base_output_dir=output_filepath.dirname.to_s
|
111
|
-
|
112
|
-
if ( convert_tool.explicit_output=~/^\//) # It's an absolute path so use absolute path as output dir
|
113
|
-
# Convert base_output_dir to realpath
|
114
|
-
#
|
115
|
-
base_output_dir=Pathname.new(base_output_dir).realpath.to_s
|
116
|
-
end
|
117
|
-
|
118
|
-
output_filename=output_filepath.basename.to_s
|
119
|
-
|
120
|
-
end
|
121
|
-
|
122
|
-
# Create a uniquely named directory to hold the output. This is the only way to know the output of msconvert
|
123
|
-
#
|
124
|
-
output_dir="#{base_output_dir}/#{Pathname.new(Tempfile.new("file_convert").path).basename.to_s}"
|
125
|
-
Dir.mkdir(output_dir)
|
126
|
-
|
127
|
-
|
128
|
-
throw "Input format is the same as output format" if ( input_ext==".#{convert_tool.output_format}" )
|
129
|
-
|
130
|
-
genv.log("Converting #{filename} to #{convert_tool.output_format}",:info)
|
131
|
-
runner=CommandRunner.new(genv)
|
132
|
-
basedir=Pathname.new(filename).dirname.to_s #Where we run the tool
|
133
|
-
|
134
|
-
if ( convert_tool.maldi )
|
135
|
-
#For MALDI we know the charge is 1 so set it explicitly. Sometimes it is missing from the data
|
136
|
-
runner.run_local("cd #{basedir}; #{genv.msconvert} #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.1\" --#{convert_tool.output_format} -o #{output_dir}")
|
137
|
-
else
|
138
|
-
if ( has_charge_information(filename) )
|
139
|
-
runner.run_local("cd #{basedir}; #{genv.msconvert} #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.<ChargeState>\" --#{convert_tool.output_format} -o #{output_dir}")
|
140
|
-
else
|
141
|
-
# If input file is missing charges the best we can do is just assign charge=1. Search engines can choose to ignore this value anyway.
|
142
|
-
#
|
143
|
-
runner.run_local("cd #{basedir}; #{genv.msconvert} #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.1\" --#{convert_tool.output_format} -o #{output_dir}")
|
144
|
-
end
|
145
|
-
end
|
146
|
-
|
147
|
-
# Find out what the output name was
|
148
|
-
#
|
149
|
-
tmp_output_filename=""
|
150
|
-
Dir.foreach(output_dir) { |entry_name|
|
151
|
-
if ( entry_name=~/^\.$/ || entry_name=~/^\.\.$/ )
|
152
|
-
else
|
153
|
-
tmp_output_filename=entry_name
|
154
|
-
end
|
155
|
-
}
|
156
|
-
|
157
|
-
# Cleanup after converting
|
158
|
-
cmd = "cd #{output_dir};pwd; mv #{tmp_output_filename} #{base_output_dir}/#{output_filename}; cd ../; pwd;rm -r #{output_dir}"
|
159
|
-
|
160
|
-
code =runner.run_local(cmd)
|
161
|
-
|
162
|
-
throw "Command failed with exit code #{code}" unless code==0
|
163
|
-
|
164
|
-
throw "Failed to create output file #{base_output_dir}/#{output_filename}" unless ( FileTest.exists?("#{base_output_dir}/#{output_filename}") )
|
data/bin/generate_omssa_loc.rb
DELETED
@@ -1,42 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# This file is part of MSLIMS
|
4
|
-
# Created by Ira Cooke 12/4/2010
|
5
|
-
#
|
6
|
-
# Generates files required by the omssa galaxy wrapper
|
7
|
-
#
|
8
|
-
|
9
|
-
require 'protk/constants'
|
10
|
-
# Environment with global constants
|
11
|
-
#
|
12
|
-
genv=Constants.new
|
13
|
-
|
14
|
-
# Set search engine specific parameters on the SearchTool object
|
15
|
-
#
|
16
|
-
omssa_root="#{genv.omssa_root}/omssacl"
|
17
|
-
# Get ommssa to print out a list of its acceptable modifications
|
18
|
-
acceptable_mods=%x[#{omssa_root} -ml].split(/\n/).collect do |mod|
|
19
|
-
|
20
|
-
mod_vals=mod.split(":")
|
21
|
-
[mod_vals[0].lstrip.rstrip,mod_vals[1].lstrip.rstrip]
|
22
|
-
|
23
|
-
end
|
24
|
-
|
25
|
-
# Drop the header
|
26
|
-
#
|
27
|
-
acceptable_mods.shift
|
28
|
-
|
29
|
-
loc_output=File.new("omssa_mods.loc",'w')
|
30
|
-
|
31
|
-
loc_output << "#This file lists the names of chemical modifications accepted by OMMSA\n"
|
32
|
-
loc_output << "#\n"
|
33
|
-
loc_output << "#\n"
|
34
|
-
|
35
|
-
acceptable_mods.each { |am|
|
36
|
-
key = am[1].downcase.gsub(" ","").gsub("\(","\_").gsub("\)","\_").gsub("\:","\_").gsub("\-\>","\_")
|
37
|
-
loc_output << "#{am[1]}\t#{key}_\t#{am[0]}\t#{key}_\n"
|
38
|
-
}
|
39
|
-
|
40
|
-
loc_output.close
|
41
|
-
|
42
|
-
|
data/bin/gffmerge.rb
DELETED
@@ -1,208 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# This file is part of protk
|
4
|
-
# Original python version created by Max Grant
|
5
|
-
# Translated to ruby by Ira Cooke 29/1/2013
|
6
|
-
#
|
7
|
-
#
|
8
|
-
|
9
|
-
require 'protk/constants'
|
10
|
-
require 'protk/tool'
|
11
|
-
require 'protk/fastadb'
|
12
|
-
require 'libxml'
|
13
|
-
require 'bio'
|
14
|
-
|
15
|
-
include LibXML
|
16
|
-
|
17
|
-
tool=Tool.new([:explicit_output])
|
18
|
-
tool.option_parser.banner = "Create a gff containing peptide observations.\n\nUsage: gffmerge.rb "
|
19
|
-
|
20
|
-
|
21
|
-
tool.options.gff_predicted=nil
|
22
|
-
tool.option_parser.on( '-g filename','--gff filename', 'Predicted Data (GFF3 Format)' ) do |file|
|
23
|
-
tool.options.gff_predicted=file
|
24
|
-
end
|
25
|
-
|
26
|
-
tool.options.protxml=nil
|
27
|
-
tool.option_parser.on( '-p filename','--protxml filename', 'Observed Data (ProtXML Format)' ) do |file|
|
28
|
-
tool.options.protxml=file
|
29
|
-
end
|
30
|
-
|
31
|
-
tool.options.sixframe=nil
|
32
|
-
tool.option_parser.on( '-t filename','--sixframe filename', 'Sixframe Translations (Fasta Format)' ) do |file|
|
33
|
-
tool.options.sixframe=file
|
34
|
-
end
|
35
|
-
|
36
|
-
tool.options.skip_fasta_indexing=false
|
37
|
-
tool.option_parser.on('--skip-index','Don\'t index sixframe translations (Index should already exist)') do
|
38
|
-
tool.options.skip_fasta_indexing=true
|
39
|
-
end
|
40
|
-
|
41
|
-
tool.options.peptide_probability_threshold=0.95
|
42
|
-
tool.option_parser.on('--threshold prob','Peptide Probability Threshold (Default 0.95)') do |thresh|
|
43
|
-
tool.options.peptide_probability_threshold=thresh.to_f
|
44
|
-
end
|
45
|
-
|
46
|
-
exit unless tool.check_options [:protxml,:sixframe]
|
47
|
-
|
48
|
-
gff_out_file="merged.gff"
|
49
|
-
if ( tool.explicit_output != nil)
|
50
|
-
gff_out_file=tool.explicit_output
|
51
|
-
end
|
52
|
-
|
53
|
-
gff_db = Bio::GFF.new()
|
54
|
-
if ( tool.gff_predicted !=nil)
|
55
|
-
p "Reading source gff file"
|
56
|
-
gff_db = Bio::GFF::GFF3.new(File.open(tool.gff_predicted))
|
57
|
-
# p gff_db.records[1].attributes
|
58
|
-
# exit
|
59
|
-
end
|
60
|
-
|
61
|
-
f = open(gff_out_file,'w+')
|
62
|
-
gff_db.records.each { |rec|
|
63
|
-
f.write(rec.to_s)
|
64
|
-
}
|
65
|
-
|
66
|
-
p "Parsing proteins from protxml"
|
67
|
-
protxml_parser=XML::Parser.file(tool.protxml)
|
68
|
-
protxml_doc=protxml_parser.parse
|
69
|
-
proteins = protxml_doc.find('.//protxml:protein','protxml:http://regis-web.systemsbiology.net/protXML')
|
70
|
-
|
71
|
-
|
72
|
-
db_filename = nil
|
73
|
-
case
|
74
|
-
when Pathname.new(tool.sixframe).exist? # It's an explicitly named db
|
75
|
-
db_filename = Pathname.new(tool.sixframe).realpath.to_s
|
76
|
-
else
|
77
|
-
db_filename=Constants.new.current_database_for_name(tool.sixframe)
|
78
|
-
end
|
79
|
-
|
80
|
-
db_indexfilename = "#{db_filename}.pin"
|
81
|
-
|
82
|
-
if File.exist?(db_indexfilename)
|
83
|
-
p "Using existing indexed translations"
|
84
|
-
orf_lookup = FastaDB.new(db_filename)
|
85
|
-
else
|
86
|
-
p "Indexing sixframe translations"
|
87
|
-
orf_lookup = FastaDB.create(db_filename,db_filename,'prot')
|
88
|
-
end
|
89
|
-
|
90
|
-
p "Aligning peptides and writing GFF data..."
|
91
|
-
low_prob = 0
|
92
|
-
skipped = 0
|
93
|
-
peptide_count = 0
|
94
|
-
protein_count = 0
|
95
|
-
total_peptides = 0
|
96
|
-
for prot in proteins
|
97
|
-
prot_prob = prot['probability']
|
98
|
-
if ( prot_prob.to_f < tool.peptide_probability_threshold )
|
99
|
-
next
|
100
|
-
end
|
101
|
-
indis_proteins = prot.find('protxml:indistinguishable_protein','protxml:http://regis-web.systemsbiology.net/protXML')
|
102
|
-
prot_names = [prot['protein_name']]
|
103
|
-
for protein in indis_proteins
|
104
|
-
prot_names += [protein['protein_name']]
|
105
|
-
end
|
106
|
-
|
107
|
-
peptides = prot.find('protxml:peptide','protxml:http://regis-web.systemsbiology.net/protXML')
|
108
|
-
|
109
|
-
for protein_name in prot_names
|
110
|
-
protein_count += 1
|
111
|
-
prot_qualifiers = {"source" => "OBSERVATION", "score" => prot_prob, "ID" => 'pr' + protein_count.to_s}
|
112
|
-
begin
|
113
|
-
puts "Looking up #{protein_name}"
|
114
|
-
orf = orf_lookup.get_by_id protein_name
|
115
|
-
if ( orf == nil)
|
116
|
-
puts "Failed lookup for #{protein_name}"
|
117
|
-
raise KeyError
|
118
|
-
end
|
119
|
-
|
120
|
-
|
121
|
-
position = orf.identifiers.description.split('|').collect { |pos| pos.to_i }
|
122
|
-
|
123
|
-
if ( position.length != 2 )
|
124
|
-
puts "Badly formatted entry #{orf}"
|
125
|
-
raise EncodingError
|
126
|
-
end
|
127
|
-
orf_name = orf.entry_id.scan(/lcl\|(.*)/)[0][0]
|
128
|
-
frame=orf_name.scan(/frame_(\d)/)[0][0]
|
129
|
-
scaffold_name = orf_name.scan(/(scaffold_?\d+)_/)[0][0]
|
130
|
-
|
131
|
-
strand = (frame.to_i > 3) ? '-' : '+'
|
132
|
-
# strand = +1
|
133
|
-
|
134
|
-
prot_id = "pr#{protein_count.to_s}"
|
135
|
-
prot_attributes = [["ID",prot_id],["Name",orf_name]]
|
136
|
-
prot_gff_line = Bio::GFF::GFF3::Record.new(seqid = scaffold_name,source="OBSERVATION",feature_type="protein",
|
137
|
-
start_position=position[0]+1,end_position=position[1],score=prot_prob,strand=strand,frame=nil,attributes=prot_attributes)
|
138
|
-
gff_db.records += ["##gff-version 3\n","##sequence-region #{scaffold_name} 1 160\n",prot_gff_line]
|
139
|
-
|
140
|
-
prot_seq = orf.aaseq.to_s
|
141
|
-
throw "Not amino_acids" if prot_seq != orf.seq.to_s
|
142
|
-
|
143
|
-
if ( strand=='-' )
|
144
|
-
prot_seq.reverse!
|
145
|
-
end
|
146
|
-
|
147
|
-
for peptide in peptides
|
148
|
-
pprob = peptide['nsp_adjusted_probability'].to_f
|
149
|
-
if ( pprob >= tool.peptide_probability_threshold )
|
150
|
-
total_peptides += 1
|
151
|
-
pep_seq = peptide['peptide_sequence']
|
152
|
-
|
153
|
-
if ( strand=='-')
|
154
|
-
pep_seq.reverse!
|
155
|
-
end
|
156
|
-
|
157
|
-
start_indexes = [0]
|
158
|
-
prot_seq.scan /#{pep_seq}/ do |match|
|
159
|
-
start_indexes << prot_seq.index(match,start_indexes.last)
|
160
|
-
end
|
161
|
-
start_indexes.delete_at(0)
|
162
|
-
|
163
|
-
# Now convert peptide coordinate to genome coordinates
|
164
|
-
# And create gff lines for each match
|
165
|
-
start_indexes.collect do |si|
|
166
|
-
pep_genomic_start = position[0] + 3*si
|
167
|
-
pep_genomic_end = pep_genomic_start + 3*pep_seq.length - 1
|
168
|
-
peptide_count+=1
|
169
|
-
pep_id = "p#{peptide_count.to_s}"
|
170
|
-
pep_attributes = [["ID",pep_id],["Parent",prot_id]]
|
171
|
-
pep_gff_line = Bio::GFF::GFF3::Record.new(seqid = scaffold_name,source="OBSERVATION",
|
172
|
-
feature_type="peptide",start_position=pep_genomic_start,end_position=pep_genomic_end,score=pprob,
|
173
|
-
strand=strand,frame=nil,attributes=pep_attributes)
|
174
|
-
fragment_gff_line = Bio::GFF::GFF3::Record.new(seqid = scaffold_name,source="OBSERVATION",
|
175
|
-
feature_type="fragment",start_position=pep_genomic_start,end_position=pep_genomic_end,score='',
|
176
|
-
strand=strand,frame=nil,attributes=[["Parent",pep_id],["ID",peptide['peptide_sequence']]])
|
177
|
-
gff_db.records += [pep_gff_line,fragment_gff_line]
|
178
|
-
|
179
|
-
end
|
180
|
-
|
181
|
-
|
182
|
-
end
|
183
|
-
end
|
184
|
-
|
185
|
-
rescue KeyError,EncodingError
|
186
|
-
skipped+=0
|
187
|
-
end
|
188
|
-
|
189
|
-
# p orf_name
|
190
|
-
# p prot_gff_line
|
191
|
-
# exit
|
192
|
-
end
|
193
|
-
|
194
|
-
end
|
195
|
-
|
196
|
-
f = open(gff_out_file,'w+')
|
197
|
-
gff_db.records.each { |rec|
|
198
|
-
f.write(rec.to_s)
|
199
|
-
}
|
200
|
-
f.close
|
201
|
-
|
202
|
-
p "Finished."
|
203
|
-
p "Proteins: #{protein_count}"
|
204
|
-
p "Skipped Decoys: #{skipped}"
|
205
|
-
p "Total Peptides: #{total_peptides}"
|
206
|
-
p "Peptides Written: #{total_peptides - low_prob}"
|
207
|
-
p "Peptides Culled: #{low_prob}"
|
208
|
-
exit(0)
|