protk 1.2.6.pre5 → 1.3.0.pre1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +84 -45
- data/bin/add_retention_times.rb +9 -5
- data/bin/augustus_to_proteindb.rb +7 -11
- data/bin/interprophet.rb +28 -46
- data/bin/make_decoy.rb +16 -48
- data/bin/mascot_search.rb +57 -71
- data/bin/mascot_to_pepxml.rb +13 -26
- data/bin/msgfplus_search.rb +70 -107
- data/bin/omssa_search.rb +52 -109
- data/bin/peptide_prophet.rb +44 -119
- data/bin/pepxml_to_table.rb +24 -27
- data/bin/protein_prophet.rb +22 -82
- data/bin/protxml_to_gff.rb +22 -519
- data/bin/protxml_to_table.rb +2 -16
- data/bin/sixframe.rb +10 -32
- data/bin/tandem_search.rb +30 -403
- data/bin/tandem_to_pepxml.rb +43 -0
- data/bin/unimod_to_loc.rb +1 -1
- data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
- data/ext/decoymaker/extconf.rb +3 -0
- data/lib/protk/constants.rb +16 -2
- data/lib/protk/data/default_config.yml +2 -1
- data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
- data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
- data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
- data/lib/protk/data/tandem_params.xml +17 -54
- data/lib/protk/fastadb.rb +2 -2
- data/lib/protk/prophet_tool.rb +1 -1
- data/lib/protk/protxml_to_gff_tool.rb +474 -0
- data/lib/protk/search_tool.rb +58 -103
- data/lib/protk/setup_rakefile.rake +9 -5
- data/lib/protk/tandem_search_tool.rb +256 -0
- data/lib/protk/tool.rb +85 -104
- data/lib/protk.rb +1 -6
- metadata +24 -103
- data/bin/annotate_ids.rb +0 -59
- data/bin/asapratio.rb +0 -27
- data/bin/blastxml_to_table.rb +0 -119
- data/bin/correct_omssa_retention_times.rb +0 -27
- data/bin/feature_finder.rb +0 -95
- data/bin/file_convert.rb +0 -164
- data/bin/generate_omssa_loc.rb +0 -42
- data/bin/gffmerge.rb +0 -208
- data/bin/libra.rb +0 -70
- data/bin/toppas_pipeline.rb +0 -84
- data/bin/uniprot_annotation.rb +0 -141
- data/bin/xls_to_table.rb +0 -52
- data/bin/xpress.rb +0 -27
- data/ext/protk/decoymaker/extconf.rb +0 -3
- data/ext/protk/simplealign/extconf.rb +0 -3
- data/lib/protk/biotools_excel_converter.rb +0 -60
- data/lib/protk/eupathdb_gene_information_table.rb +0 -158
- data/lib/protk/gapped_aligner.rb +0 -264
- data/lib/protk/protein_annotator.rb +0 -646
- data/lib/protk/spreadsheet_extensions.rb +0 -79
- data/lib/protk/xtandem_defaults.rb +0 -11
data/bin/blastxml_to_table.rb
DELETED
@@ -1,119 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# This file is part of protk
|
4
|
-
#
|
5
|
-
#
|
6
|
-
|
7
|
-
require 'protk/constants'
|
8
|
-
require 'protk/tool'
|
9
|
-
require 'bio'
|
10
|
-
require 'protk/fastadb'
|
11
|
-
require 'bio-blastxmlparser'
|
12
|
-
|
13
|
-
tool=Tool.new([:explicit_output])
|
14
|
-
tool.option_parser.banner = "Dump BLAST xml to tabular format.\n\nUsage: blastxml_to_table.rb blast.xml"
|
15
|
-
|
16
|
-
tool.options.database=nil
|
17
|
-
tool.option_parser.on( '-d filename','--database filename', 'Database used for BLAST search. If provided, hit sequences will be looked up in this database' ) do |file|
|
18
|
-
tool.options.database=file
|
19
|
-
end
|
20
|
-
|
21
|
-
tool.options.gene2go=nil
|
22
|
-
tool.option_parser.on('--gene2go pathtogene2go','Path to gene2go database. If provided GO terms will be looked up') do |gene2go|
|
23
|
-
tool.options.gene2go=gene2go
|
24
|
-
end
|
25
|
-
|
26
|
-
tool.options.gitogeneid=nil
|
27
|
-
tool.option_parser.on('--gitogeneid gitogeneid.db','Path to GDBM formatted gi to geneid mapping database. If provided gene ids will be looked up') do |gitogeneid|
|
28
|
-
tool.options.gitogeneid=gitogeneid
|
29
|
-
end
|
30
|
-
|
31
|
-
exit unless tool.check_options
|
32
|
-
|
33
|
-
#require 'debugger';debugger
|
34
|
-
|
35
|
-
exit unless ARGV.length == 1
|
36
|
-
input_file=ARGV[0]
|
37
|
-
|
38
|
-
out_file=$stdout
|
39
|
-
if ( tool.explicit_output != nil)
|
40
|
-
out_file=File.open(tool.explicit_output, "w")
|
41
|
-
end
|
42
|
-
|
43
|
-
$fastadb = nil
|
44
|
-
if tool.database
|
45
|
-
$fastadb=FastaDB.new(tool.database)
|
46
|
-
end
|
47
|
-
|
48
|
-
$gitogeneid = nil
|
49
|
-
if (tool.gitogeneid!=nil) && (File.exist? tool.gitogeneid)
|
50
|
-
require 'gdbm'
|
51
|
-
$gitogeneid = GDBM.new(tool.gitogeneid,flags=GDBM::READER)
|
52
|
-
end
|
53
|
-
|
54
|
-
|
55
|
-
$gene2go = nil
|
56
|
-
if (tool.gene2go!=nil) && (File.exist? tool.gene2go)
|
57
|
-
require 'gdbm'
|
58
|
-
$gene2go = GDBM.new(tool.gene2go,flags=GDBM::READER)
|
59
|
-
end
|
60
|
-
|
61
|
-
def gi_from_hit_id(hit_id)
|
62
|
-
gi_scan=hit_id.scan(/gi\|(\d+)/)
|
63
|
-
gi_scan.join("")
|
64
|
-
end
|
65
|
-
|
66
|
-
def generate_line(hsp,hit,query,hit_seq=nil)
|
67
|
-
|
68
|
-
line="#{query.query_id}\t#{query.query_def}\t#{hit.hit_id}\t#{hit.hit_num}\t#{hit.hit_def}\t#{hit.accession}\t#{hsp.hsp_num}\t#{hsp.bit_score}\t#{hsp.evalue}\t#{hsp.qseq}\t#{hsp.hseq}"
|
69
|
-
if hit_seq
|
70
|
-
line << "\t#{hit_seq}"
|
71
|
-
end
|
72
|
-
geneid=""
|
73
|
-
goterm=""
|
74
|
-
if $gitogeneid
|
75
|
-
geneid=$gitogeneid[gi_from_hit_id(hit.hit_id)]
|
76
|
-
goterm=$gene2go[geneid] if geneid!=nil && $gene2go
|
77
|
-
end
|
78
|
-
|
79
|
-
|
80
|
-
# throw "No geneid" if geneid==nil
|
81
|
-
line << "\t#{geneid}\t#{goterm}"
|
82
|
-
# require 'debugger';debugger
|
83
|
-
# puts gi_from_hit_id(hit.hit_id)
|
84
|
-
# puts $gene2go[gi_from_hit_id(hit.hit_id)]
|
85
|
-
line<<"\n"
|
86
|
-
line
|
87
|
-
end
|
88
|
-
|
89
|
-
def fetch_hit_seq(hit)
|
90
|
-
hit_seq=nil
|
91
|
-
if $fastadb
|
92
|
-
hit_seq=$fastadb.fetch(hit.hit_id).first.aaseq
|
93
|
-
end
|
94
|
-
hit_seq
|
95
|
-
end
|
96
|
-
|
97
|
-
blast = Bio::BlastXMLParser::XmlSplitterIterator.new(input_file).to_enum
|
98
|
-
|
99
|
-
blast.each do |query|
|
100
|
-
query.hits.each do |hit|
|
101
|
-
# hit=query.hits.first
|
102
|
-
# if hit
|
103
|
-
hit_seq=fetch_hit_seq(hit)
|
104
|
-
hit.hsps.each do |hsp|
|
105
|
-
out_line=generate_line(hsp,hit,query,hit_seq)
|
106
|
-
|
107
|
-
out_file.write out_line
|
108
|
-
end
|
109
|
-
# end
|
110
|
-
end
|
111
|
-
end
|
112
|
-
|
113
|
-
|
114
|
-
$gitogeneid.close if $gitogeneid!=nil
|
115
|
-
$gene2go.close if $gene2go!=nil
|
116
|
-
|
117
|
-
#require 'debugger';debugger
|
118
|
-
|
119
|
-
#puts "Hi"
|
data/bin/correct_omssa_retention_times.rb
DELETED
@@ -1,27 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# This file is part of protk
|
4
|
-
# Created by Ira Cooke 14/12/2010
|
5
|
-
#
|
6
|
-
# Corrects retention times in omssa output
|
7
|
-
#
|
8
|
-
|
9
|
-
$VERBOSE=nil
|
10
|
-
|
11
|
-
require 'protk/constants'
|
12
|
-
require 'protk/command_runner'
|
13
|
-
require 'protk/tool'
|
14
|
-
require 'protk/omssa_util'
|
15
|
-
|
16
|
-
# Environment with global constants
|
17
|
-
#
|
18
|
-
genv=Constants.new
|
19
|
-
|
20
|
-
tool=Tool.new([:over_write])
|
21
|
-
tool.option_parser.banner = "Correct retention times on a pepxml file produced by omssa using information from an mgf file.\n\nUsage: correct_omssa_retention_times.rb [options] file1.pep.xml file2.mgf"
|
22
|
-
tool.option_parser.parse!
|
23
|
-
|
24
|
-
|
25
|
-
OMSSAUtil.add_retention_times(ARGV[1],ARGV[0],tool.over_write,true)
|
26
|
-
|
27
|
-
|
data/bin/feature_finder.rb
DELETED
@@ -1,95 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# This file is part of protk
|
4
|
-
# Created by Ira Cooke 21/3/2012
|
5
|
-
#
|
6
|
-
# A wrapper for the OpenMS FeatureFinder tools (FeatureFinderCentroided and FeatureFinderIsotopeWavelet)
|
7
|
-
|
8
|
-
require 'protk/constants'
|
9
|
-
require 'protk/command_runner'
|
10
|
-
require 'protk/tool'
|
11
|
-
require 'protk/openms_defaults'
|
12
|
-
require 'libxml'
|
13
|
-
require 'tempfile'
|
14
|
-
|
15
|
-
include LibXML
|
16
|
-
|
17
|
-
tool=Tool.new([:explicit_output, :background,:over_write,:prefix_suffix])
|
18
|
-
tool.option_parser.banner = "Find molecular features on a set of input files.\n\nUsage: feature_finder.rb [options] file1.mzML file2.mzML ..."
|
19
|
-
|
20
|
-
tool.options.intensity_type = "ref"
|
21
|
-
tool.option_parser.on( '--intensity-type type',"method used to calculate intensities (ref,trans,corrected). Default = ref. See OpenMS documentation for details" ) do |type|
|
22
|
-
tool.options.intensity_type = type
|
23
|
-
end
|
24
|
-
|
25
|
-
tool.options.intensity_threshold = "3"
|
26
|
-
tool.option_parser.on( '--intensity-threshold thresh',"discard features below this intensity (Default=3). Set to -1 to retain all detected features" ) do |thresh|
|
27
|
-
tool.options.intensity_threshold = thresh
|
28
|
-
end
|
29
|
-
|
30
|
-
|
31
|
-
exit unless tool.check_options
|
32
|
-
|
33
|
-
if ( ARGV[0].nil? )
|
34
|
-
puts "You must supply an input file"
|
35
|
-
puts tool.option_parser
|
36
|
-
exit
|
37
|
-
end
|
38
|
-
|
39
|
-
# Obtain a global environment object
|
40
|
-
genv=Constants.new
|
41
|
-
|
42
|
-
def run_ff(genv,tool,cmd,output_path,jobid)
|
43
|
-
if ( !tool.over_write && Pathname.new(output_path).exist? )
|
44
|
-
genv.log("Skipping analysis on existing file #{output_path}",:warn)
|
45
|
-
else
|
46
|
-
jobscript_path="#{output_path}.pbs.sh"
|
47
|
-
job_params={:jobid=>jobid, :vmem=>"14Gb", :queue => "sixteen"}
|
48
|
-
code=tool.run(cmd,genv,job_params,jobscript_path)
|
49
|
-
throw "Command failed with exit code #{code}" unless code==0
|
50
|
-
end
|
51
|
-
end
|
52
|
-
|
53
|
-
def generate_ini(tool,out_path)
|
54
|
-
base_ini_file=OpenMSDefaults.new.featurefinderisotopewavelet
|
55
|
-
parser = XML::Parser.file(base_ini_file)
|
56
|
-
doc = parser.parse
|
57
|
-
intensity_threshold_node = doc.find('//ITEM[@name="intensity_threshold"]')[0]
|
58
|
-
intensity_type_node = doc.find('//ITEM[@name="intensity_type"]')[0]
|
59
|
-
intensity_threshold_node['value']=tool.intensity_threshold
|
60
|
-
intensity_type_node['value']=tool.intensity_type
|
61
|
-
doc.save(out_path)
|
62
|
-
end
|
63
|
-
|
64
|
-
throw "Cannot use explicit output in combination with multiple input files" if ( tool.explicit_output && ARGV.length>1)
|
65
|
-
|
66
|
-
input_basename=Pathname.new(ARGV[0]).basename.to_s
|
67
|
-
ini_file_name="#{Pathname.new(Tempfile.new(input_basename).path).basename.to_s}_feature_finder.ini"
|
68
|
-
ini_file="#{Pathname.new(ini_file_name).dirname.realpath.to_s}/#{ini_file_name}"
|
69
|
-
|
70
|
-
generate_ini(tool,ini_file)
|
71
|
-
|
72
|
-
ARGV.each do |filen|
|
73
|
-
input_file=filen.chomp
|
74
|
-
throw "Input must be an mzML file" unless input_file=~/\.mzML$/
|
75
|
-
|
76
|
-
input_basename=input_file.gsub(/\.mzML$/,'')
|
77
|
-
output_dir=Pathname.new(input_basename).dirname.realpath.to_s
|
78
|
-
output_base=Pathname.new(input_basename).basename.to_s
|
79
|
-
output_file = "#{output_dir}/#{tool.output_prefix}#{output_base}#{tool.output_suffix}.featureXML"
|
80
|
-
if ( tool.explicit_output )
|
81
|
-
output_file = "#{output_dir}/#{tool.explicit_output}"
|
82
|
-
end
|
83
|
-
|
84
|
-
if ( tool.over_write || !Pathname.new(output_file).exist? )
|
85
|
-
output_base_filename=Pathname.new(output_file).basename.to_s
|
86
|
-
cmd=""
|
87
|
-
cmd<<"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:#{genv.openms_root}/lib;
|
88
|
-
#{genv.featurefinderisotopewavelet} -in #{Pathname.new(input_file).realpath.to_s} -out #{output_dir}/#{output_base_filename} -ini #{ini_file}"
|
89
|
-
|
90
|
-
run_ff(genv,tool,cmd,output_file,tool.jobid_from_filename(input_basename))
|
91
|
-
|
92
|
-
else
|
93
|
-
genv.log("Skipping search on existing file #{output_file}",:warn)
|
94
|
-
end
|
95
|
-
end
|
data/bin/file_convert.rb
DELETED
@@ -1,164 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# This file is part of protk
|
4
|
-
# Created by Ira Cooke 14/12/2010
|
5
|
-
#
|
6
|
-
# Wrapper for msconvert
|
7
|
-
#
|
8
|
-
|
9
|
-
require 'protk/constants'
|
10
|
-
require 'protk/command_runner'
|
11
|
-
require 'protk/tool'
|
12
|
-
require 'tempfile'
|
13
|
-
require 'libxml'
|
14
|
-
|
15
|
-
include LibXML
|
16
|
-
|
17
|
-
# Regex for cleaning mgf sed -i.bak 's/\(PEPMASS=[0-9]*.[0-9]*\)[ \t]*[0-9]*/\1/g'
|
18
|
-
|
19
|
-
# Read the input file and search for an instance of the charge state cvParam inside a precursor tag. Return true if one is found. False otherwise
|
20
|
-
#
|
21
|
-
def has_charge_information(input_filename)
|
22
|
-
#<precursorList count="1">
|
23
|
-
# <precursor spectrumRef="controllerType=0 controllerNumber=1 scan=59">
|
24
|
-
# <isolationWindow>
|
25
|
-
# <cvParam cvRef="MS" accession="MS:1000827" name="isolation window target m/z" value="939.43" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
|
26
|
-
# <cvParam cvRef="MS" accession="MS:1000828" name="isolation window lower offset" value="2.0" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
|
27
|
-
# <cvParam cvRef="MS" accession="MS:1000829" name="isolation window upper offset" value="2.0" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
|
28
|
-
# </isolationWindow>
|
29
|
-
# <selectedIonList count="1">
|
30
|
-
# <selectedIon>
|
31
|
-
# <cvParam cvRef="MS" accession="MS:1000744" name="selected ion m/z" value="939.432189941406" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
|
32
|
-
# <cvParam cvRef="MS" accession="MS:1000041" name="charge state" value="2"/>
|
33
|
-
# <cvParam cvRef="MS" accession="MS:1000042" name="peak intensity" value="1321.692016601563" unitCvRef="MS" unitAccession="MS:1000131" unitName="number of counts"/>
|
34
|
-
# </selectedIon>
|
35
|
-
# </selectedIonList>
|
36
|
-
|
37
|
-
reader=XML::Reader.file(input_filename)
|
38
|
-
|
39
|
-
while(reader.read)
|
40
|
-
|
41
|
-
if ( reader.local_name=="precursor")
|
42
|
-
|
43
|
-
subdoc=reader.read_inner_xml
|
44
|
-
|
45
|
-
if ( subdoc =~ /MS:1000041/ )
|
46
|
-
return true
|
47
|
-
end
|
48
|
-
|
49
|
-
end
|
50
|
-
|
51
|
-
end
|
52
|
-
|
53
|
-
return false
|
54
|
-
|
55
|
-
end
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
# Setup specific command-line options for this tool. Other options are inherited from Tool
|
60
|
-
#
|
61
|
-
convert_tool=Tool.new([:explicit_output,:over_write,:maldi])
|
62
|
-
convert_tool.option_parser.banner = "Convert files between different formats.\n\nUsage: file_convert.rb [options] input_file output_file"
|
63
|
-
|
64
|
-
# Special case (usually tool specific options use capitals). Use lowercase l here to mimick maldi option in the search_tool class
|
65
|
-
#
|
66
|
-
convert_tool.options.maldi=false
|
67
|
-
convert_tool.option_parser.on( '-l', '--maldi', 'Input Files are MALDI Spectra' ) do
|
68
|
-
convert_tool.options.maldi=true
|
69
|
-
end
|
70
|
-
|
71
|
-
convert_tool.options.output_format="mgf"
|
72
|
-
convert_tool.option_parser.on( '-F', '--format fmt', 'Convert to a specified format' ) do |fmt|
|
73
|
-
convert_tool.options.output_format=fmt
|
74
|
-
end
|
75
|
-
|
76
|
-
#convert_tool.options.missing_charge_state="false"
|
77
|
-
#convert_tool.option_parser.on( '-C', '--missing-charges', 'No attempt will be made to write charge states. Leads to better looking spectrum names' ) do |fmt|
|
78
|
-
# convert_tool.options.output_format=fmt
|
79
|
-
#end
|
80
|
-
#end
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
exit unless convert_tool.check_options
|
85
|
-
|
86
|
-
if ( ARGV[0].nil? )
|
87
|
-
puts "You must supply an input file"
|
88
|
-
puts convert_tool.option_parser
|
89
|
-
exit
|
90
|
-
end
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
# Environment with global constants
|
95
|
-
#
|
96
|
-
genv=Constants.new
|
97
|
-
|
98
|
-
filename=ARGV[0]
|
99
|
-
|
100
|
-
|
101
|
-
input_ext=Pathname.new(filename).extname
|
102
|
-
input_relative_filename=Pathname.new(filename).basename.to_s
|
103
|
-
|
104
|
-
base_output_dir=Pathname.new(filename).dirname.realpath.to_s #Default output dir is input dir
|
105
|
-
|
106
|
-
output_basename=input_relative_filename.gsub(/#{input_ext}$/,"").to_s
|
107
|
-
|
108
|
-
if ( convert_tool.explicit_output )
|
109
|
-
output_filepath=Pathname.new(convert_tool.explicit_output)
|
110
|
-
base_output_dir=output_filepath.dirname.to_s
|
111
|
-
|
112
|
-
if ( convert_tool.explicit_output=~/^\//) # It's an absolute path so use absolute path as output dir
|
113
|
-
# Convert base_output_dir to realpath
|
114
|
-
#
|
115
|
-
base_output_dir=Pathname.new(base_output_dir).realpath.to_s
|
116
|
-
end
|
117
|
-
|
118
|
-
output_filename=output_filepath.basename.to_s
|
119
|
-
|
120
|
-
end
|
121
|
-
|
122
|
-
# Create a uniquely named directory to hold the output. This is the only way to know the output of msconvert
|
123
|
-
#
|
124
|
-
output_dir="#{base_output_dir}/#{Pathname.new(Tempfile.new("file_convert").path).basename.to_s}"
|
125
|
-
Dir.mkdir(output_dir)
|
126
|
-
|
127
|
-
|
128
|
-
throw "Input format is the same as output format" if ( input_ext==".#{convert_tool.output_format}" )
|
129
|
-
|
130
|
-
genv.log("Converting #{filename} to #{convert_tool.output_format}",:info)
|
131
|
-
runner=CommandRunner.new(genv)
|
132
|
-
basedir=Pathname.new(filename).dirname.to_s #Where we run the tool
|
133
|
-
|
134
|
-
if ( convert_tool.maldi )
|
135
|
-
#For MALDI we know the charge is 1 so set it explicitly. Sometimes it is missing from the data
|
136
|
-
runner.run_local("cd #{basedir}; #{genv.msconvert} #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.1\" --#{convert_tool.output_format} -o #{output_dir}")
|
137
|
-
else
|
138
|
-
if ( has_charge_information(filename) )
|
139
|
-
runner.run_local("cd #{basedir}; #{genv.msconvert} #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.<ChargeState>\" --#{convert_tool.output_format} -o #{output_dir}")
|
140
|
-
else
|
141
|
-
# If input file is missing charges the best we can do is just assign charge=1. Search engines can choose to ignore this value anyway.
|
142
|
-
#
|
143
|
-
runner.run_local("cd #{basedir}; #{genv.msconvert} #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.1\" --#{convert_tool.output_format} -o #{output_dir}")
|
144
|
-
end
|
145
|
-
end
|
146
|
-
|
147
|
-
# Find out what the output name was
|
148
|
-
#
|
149
|
-
tmp_output_filename=""
|
150
|
-
Dir.foreach(output_dir) { |entry_name|
|
151
|
-
if ( entry_name=~/^\.$/ || entry_name=~/^\.\.$/ )
|
152
|
-
else
|
153
|
-
tmp_output_filename=entry_name
|
154
|
-
end
|
155
|
-
}
|
156
|
-
|
157
|
-
# Cleanup after converting
|
158
|
-
cmd = "cd #{output_dir};pwd; mv #{tmp_output_filename} #{base_output_dir}/#{output_filename}; cd ../; pwd;rm -r #{output_dir}"
|
159
|
-
|
160
|
-
code =runner.run_local(cmd)
|
161
|
-
|
162
|
-
throw "Command failed with exit code #{code}" unless code==0
|
163
|
-
|
164
|
-
throw "Failed to create output file #{base_output_dir}/#{output_filename}" unless ( FileTest.exists?("#{base_output_dir}/#{output_filename}") )
|
data/bin/generate_omssa_loc.rb
DELETED
@@ -1,42 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# This file is part of MSLIMS
|
4
|
-
# Created by Ira Cooke 12/4/2010
|
5
|
-
#
|
6
|
-
# Generates files required by the omssa galaxy wrapper
|
7
|
-
#
|
8
|
-
|
9
|
-
require 'protk/constants'
|
10
|
-
# Environment with global constants
|
11
|
-
#
|
12
|
-
genv=Constants.new
|
13
|
-
|
14
|
-
# Set search engine specific parameters on the SearchTool object
|
15
|
-
#
|
16
|
-
omssa_root="#{genv.omssa_root}/omssacl"
|
17
|
-
# Get ommssa to print out a list of its acceptable modifications
|
18
|
-
acceptable_mods=%x[#{omssa_root} -ml].split(/\n/).collect do |mod|
|
19
|
-
|
20
|
-
mod_vals=mod.split(":")
|
21
|
-
[mod_vals[0].lstrip.rstrip,mod_vals[1].lstrip.rstrip]
|
22
|
-
|
23
|
-
end
|
24
|
-
|
25
|
-
# Drop the header
|
26
|
-
#
|
27
|
-
acceptable_mods.shift
|
28
|
-
|
29
|
-
loc_output=File.new("omssa_mods.loc",'w')
|
30
|
-
|
31
|
-
loc_output << "#This file lists the names of chemical modifications accepted by OMMSA\n"
|
32
|
-
loc_output << "#\n"
|
33
|
-
loc_output << "#\n"
|
34
|
-
|
35
|
-
acceptable_mods.each { |am|
|
36
|
-
key = am[1].downcase.gsub(" ","").gsub("\(","\_").gsub("\)","\_").gsub("\:","\_").gsub("\-\>","\_")
|
37
|
-
loc_output << "#{am[1]}\t#{key}_\t#{am[0]}\t#{key}_\n"
|
38
|
-
}
|
39
|
-
|
40
|
-
loc_output.close
|
41
|
-
|
42
|
-
|
data/bin/gffmerge.rb
DELETED
@@ -1,208 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
#
|
3
|
-
# This file is part of protk
|
4
|
-
# Original python version created by Max Grant
|
5
|
-
# Translated to ruby by Ira Cooke 29/1/2013
|
6
|
-
#
|
7
|
-
#
|
8
|
-
|
9
|
-
require 'protk/constants'
|
10
|
-
require 'protk/tool'
|
11
|
-
require 'protk/fastadb'
|
12
|
-
require 'libxml'
|
13
|
-
require 'bio'
|
14
|
-
|
15
|
-
include LibXML
|
16
|
-
|
17
|
-
tool=Tool.new([:explicit_output])
|
18
|
-
tool.option_parser.banner = "Create a gff containing peptide observations.\n\nUsage: gffmerge.rb "
|
19
|
-
|
20
|
-
|
21
|
-
tool.options.gff_predicted=nil
|
22
|
-
tool.option_parser.on( '-g filename','--gff filename', 'Predicted Data (GFF3 Format)' ) do |file|
|
23
|
-
tool.options.gff_predicted=file
|
24
|
-
end
|
25
|
-
|
26
|
-
tool.options.protxml=nil
|
27
|
-
tool.option_parser.on( '-p filename','--protxml filename', 'Observed Data (ProtXML Format)' ) do |file|
|
28
|
-
tool.options.protxml=file
|
29
|
-
end
|
30
|
-
|
31
|
-
tool.options.sixframe=nil
|
32
|
-
tool.option_parser.on( '-t filename','--sixframe filename', 'Sixframe Translations (Fasta Format)' ) do |file|
|
33
|
-
tool.options.sixframe=file
|
34
|
-
end
|
35
|
-
|
36
|
-
tool.options.skip_fasta_indexing=false
|
37
|
-
tool.option_parser.on('--skip-index','Don\'t index sixframe translations (Index should already exist)') do
|
38
|
-
tool.options.skip_fasta_indexing=true
|
39
|
-
end
|
40
|
-
|
41
|
-
tool.options.peptide_probability_threshold=0.95
|
42
|
-
tool.option_parser.on('--threshold prob','Peptide Probability Threshold (Default 0.95)') do |thresh|
|
43
|
-
tool.options.peptide_probability_threshold=thresh.to_f
|
44
|
-
end
|
45
|
-
|
46
|
-
exit unless tool.check_options [:protxml,:sixframe]
|
47
|
-
|
48
|
-
gff_out_file="merged.gff"
|
49
|
-
if ( tool.explicit_output != nil)
|
50
|
-
gff_out_file=tool.explicit_output
|
51
|
-
end
|
52
|
-
|
53
|
-
gff_db = Bio::GFF.new()
|
54
|
-
if ( tool.gff_predicted !=nil)
|
55
|
-
p "Reading source gff file"
|
56
|
-
gff_db = Bio::GFF::GFF3.new(File.open(tool.gff_predicted))
|
57
|
-
# p gff_db.records[1].attributes
|
58
|
-
# exit
|
59
|
-
end
|
60
|
-
|
61
|
-
f = open(gff_out_file,'w+')
|
62
|
-
gff_db.records.each { |rec|
|
63
|
-
f.write(rec.to_s)
|
64
|
-
}
|
65
|
-
|
66
|
-
p "Parsing proteins from protxml"
|
67
|
-
protxml_parser=XML::Parser.file(tool.protxml)
|
68
|
-
protxml_doc=protxml_parser.parse
|
69
|
-
proteins = protxml_doc.find('.//protxml:protein','protxml:http://regis-web.systemsbiology.net/protXML')
|
70
|
-
|
71
|
-
|
72
|
-
db_filename = nil
|
73
|
-
case
|
74
|
-
when Pathname.new(tool.sixframe).exist? # It's an explicitly named db
|
75
|
-
db_filename = Pathname.new(tool.sixframe).realpath.to_s
|
76
|
-
else
|
77
|
-
db_filename=Constants.new.current_database_for_name(tool.sixframe)
|
78
|
-
end
|
79
|
-
|
80
|
-
db_indexfilename = "#{db_filename}.pin"
|
81
|
-
|
82
|
-
if File.exist?(db_indexfilename)
|
83
|
-
p "Using existing indexed translations"
|
84
|
-
orf_lookup = FastaDB.new(db_filename)
|
85
|
-
else
|
86
|
-
p "Indexing sixframe translations"
|
87
|
-
orf_lookup = FastaDB.create(db_filename,db_filename,'prot')
|
88
|
-
end
|
89
|
-
|
90
|
-
p "Aligning peptides and writing GFF data..."
|
91
|
-
low_prob = 0
|
92
|
-
skipped = 0
|
93
|
-
peptide_count = 0
|
94
|
-
protein_count = 0
|
95
|
-
total_peptides = 0
|
96
|
-
for prot in proteins
|
97
|
-
prot_prob = prot['probability']
|
98
|
-
if ( prot_prob.to_f < tool.peptide_probability_threshold )
|
99
|
-
next
|
100
|
-
end
|
101
|
-
indis_proteins = prot.find('protxml:indistinguishable_protein','protxml:http://regis-web.systemsbiology.net/protXML')
|
102
|
-
prot_names = [prot['protein_name']]
|
103
|
-
for protein in indis_proteins
|
104
|
-
prot_names += [protein['protein_name']]
|
105
|
-
end
|
106
|
-
|
107
|
-
peptides = prot.find('protxml:peptide','protxml:http://regis-web.systemsbiology.net/protXML')
|
108
|
-
|
109
|
-
for protein_name in prot_names
|
110
|
-
protein_count += 1
|
111
|
-
prot_qualifiers = {"source" => "OBSERVATION", "score" => prot_prob, "ID" => 'pr' + protein_count.to_s}
|
112
|
-
begin
|
113
|
-
puts "Looking up #{protein_name}"
|
114
|
-
orf = orf_lookup.get_by_id protein_name
|
115
|
-
if ( orf == nil)
|
116
|
-
puts "Failed lookup for #{protein_name}"
|
117
|
-
raise KeyError
|
118
|
-
end
|
119
|
-
|
120
|
-
|
121
|
-
position = orf.identifiers.description.split('|').collect { |pos| pos.to_i }
|
122
|
-
|
123
|
-
if ( position.length != 2 )
|
124
|
-
puts "Badly formatted entry #{orf}"
|
125
|
-
raise EncodingError
|
126
|
-
end
|
127
|
-
orf_name = orf.entry_id.scan(/lcl\|(.*)/)[0][0]
|
128
|
-
frame=orf_name.scan(/frame_(\d)/)[0][0]
|
129
|
-
scaffold_name = orf_name.scan(/(scaffold_?\d+)_/)[0][0]
|
130
|
-
|
131
|
-
strand = (frame.to_i > 3) ? '-' : '+'
|
132
|
-
# strand = +1
|
133
|
-
|
134
|
-
prot_id = "pr#{protein_count.to_s}"
|
135
|
-
prot_attributes = [["ID",prot_id],["Name",orf_name]]
|
136
|
-
prot_gff_line = Bio::GFF::GFF3::Record.new(seqid = scaffold_name,source="OBSERVATION",feature_type="protein",
|
137
|
-
start_position=position[0]+1,end_position=position[1],score=prot_prob,strand=strand,frame=nil,attributes=prot_attributes)
|
138
|
-
gff_db.records += ["##gff-version 3\n","##sequence-region #{scaffold_name} 1 160\n",prot_gff_line]
|
139
|
-
|
140
|
-
prot_seq = orf.aaseq.to_s
|
141
|
-
throw "Not amino_acids" if prot_seq != orf.seq.to_s
|
142
|
-
|
143
|
-
if ( strand=='-' )
|
144
|
-
prot_seq.reverse!
|
145
|
-
end
|
146
|
-
|
147
|
-
for peptide in peptides
|
148
|
-
pprob = peptide['nsp_adjusted_probability'].to_f
|
149
|
-
if ( pprob >= tool.peptide_probability_threshold )
|
150
|
-
total_peptides += 1
|
151
|
-
pep_seq = peptide['peptide_sequence']
|
152
|
-
|
153
|
-
if ( strand=='-')
|
154
|
-
pep_seq.reverse!
|
155
|
-
end
|
156
|
-
|
157
|
-
start_indexes = [0]
|
158
|
-
prot_seq.scan /#{pep_seq}/ do |match|
|
159
|
-
start_indexes << prot_seq.index(match,start_indexes.last)
|
160
|
-
end
|
161
|
-
start_indexes.delete_at(0)
|
162
|
-
|
163
|
-
# Now convert peptide coordinate to genome coordinates
|
164
|
-
# And create gff lines for each match
|
165
|
-
start_indexes.collect do |si|
|
166
|
-
pep_genomic_start = position[0] + 3*si
|
167
|
-
pep_genomic_end = pep_genomic_start + 3*pep_seq.length - 1
|
168
|
-
peptide_count+=1
|
169
|
-
pep_id = "p#{peptide_count.to_s}"
|
170
|
-
pep_attributes = [["ID",pep_id],["Parent",prot_id]]
|
171
|
-
pep_gff_line = Bio::GFF::GFF3::Record.new(seqid = scaffold_name,source="OBSERVATION",
|
172
|
-
feature_type="peptide",start_position=pep_genomic_start,end_position=pep_genomic_end,score=pprob,
|
173
|
-
strand=strand,frame=nil,attributes=pep_attributes)
|
174
|
-
fragment_gff_line = Bio::GFF::GFF3::Record.new(seqid = scaffold_name,source="OBSERVATION",
|
175
|
-
feature_type="fragment",start_position=pep_genomic_start,end_position=pep_genomic_end,score='',
|
176
|
-
strand=strand,frame=nil,attributes=[["Parent",pep_id],["ID",peptide['peptide_sequence']]])
|
177
|
-
gff_db.records += [pep_gff_line,fragment_gff_line]
|
178
|
-
|
179
|
-
end
|
180
|
-
|
181
|
-
|
182
|
-
end
|
183
|
-
end
|
184
|
-
|
185
|
-
rescue KeyError,EncodingError
|
186
|
-
skipped+=0
|
187
|
-
end
|
188
|
-
|
189
|
-
# p orf_name
|
190
|
-
# p prot_gff_line
|
191
|
-
# exit
|
192
|
-
end
|
193
|
-
|
194
|
-
end
|
195
|
-
|
196
|
-
f = open(gff_out_file,'w+')
|
197
|
-
gff_db.records.each { |rec|
|
198
|
-
f.write(rec.to_s)
|
199
|
-
}
|
200
|
-
f.close
|
201
|
-
|
202
|
-
p "Finished."
|
203
|
-
p "Proteins: #{protein_count}"
|
204
|
-
p "Skipped Decoys: #{skipped}"
|
205
|
-
p "Total Peptides: #{total_peptides}"
|
206
|
-
p "Peptides Written: #{total_peptides - low_prob}"
|
207
|
-
p "Peptides Culled: #{low_prob}"
|
208
|
-
exit(0)
|