protk 1.2.6.pre5 → 1.3.0.pre1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +84 -45
  3. data/bin/add_retention_times.rb +9 -5
  4. data/bin/augustus_to_proteindb.rb +7 -11
  5. data/bin/interprophet.rb +28 -46
  6. data/bin/make_decoy.rb +16 -48
  7. data/bin/mascot_search.rb +57 -71
  8. data/bin/mascot_to_pepxml.rb +13 -26
  9. data/bin/msgfplus_search.rb +70 -107
  10. data/bin/omssa_search.rb +52 -109
  11. data/bin/peptide_prophet.rb +44 -119
  12. data/bin/pepxml_to_table.rb +24 -27
  13. data/bin/protein_prophet.rb +22 -82
  14. data/bin/protxml_to_gff.rb +22 -519
  15. data/bin/protxml_to_table.rb +2 -16
  16. data/bin/sixframe.rb +10 -32
  17. data/bin/tandem_search.rb +30 -403
  18. data/bin/tandem_to_pepxml.rb +43 -0
  19. data/bin/unimod_to_loc.rb +1 -1
  20. data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
  21. data/ext/decoymaker/extconf.rb +3 -0
  22. data/lib/protk/constants.rb +16 -2
  23. data/lib/protk/data/default_config.yml +2 -1
  24. data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
  25. data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
  26. data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
  27. data/lib/protk/data/tandem_params.xml +17 -54
  28. data/lib/protk/fastadb.rb +2 -2
  29. data/lib/protk/prophet_tool.rb +1 -1
  30. data/lib/protk/protxml_to_gff_tool.rb +474 -0
  31. data/lib/protk/search_tool.rb +58 -103
  32. data/lib/protk/setup_rakefile.rake +9 -5
  33. data/lib/protk/tandem_search_tool.rb +256 -0
  34. data/lib/protk/tool.rb +85 -104
  35. data/lib/protk.rb +1 -6
  36. metadata +24 -103
  37. data/bin/annotate_ids.rb +0 -59
  38. data/bin/asapratio.rb +0 -27
  39. data/bin/blastxml_to_table.rb +0 -119
  40. data/bin/correct_omssa_retention_times.rb +0 -27
  41. data/bin/feature_finder.rb +0 -95
  42. data/bin/file_convert.rb +0 -164
  43. data/bin/generate_omssa_loc.rb +0 -42
  44. data/bin/gffmerge.rb +0 -208
  45. data/bin/libra.rb +0 -70
  46. data/bin/toppas_pipeline.rb +0 -84
  47. data/bin/uniprot_annotation.rb +0 -141
  48. data/bin/xls_to_table.rb +0 -52
  49. data/bin/xpress.rb +0 -27
  50. data/ext/protk/decoymaker/extconf.rb +0 -3
  51. data/ext/protk/simplealign/extconf.rb +0 -3
  52. data/lib/protk/biotools_excel_converter.rb +0 -60
  53. data/lib/protk/eupathdb_gene_information_table.rb +0 -158
  54. data/lib/protk/gapped_aligner.rb +0 -264
  55. data/lib/protk/protein_annotator.rb +0 -646
  56. data/lib/protk/spreadsheet_extensions.rb +0 -79
  57. data/lib/protk/xtandem_defaults.rb +0 -11
data/bin/blastxml_to_table.rb DELETED
@@ -1,119 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # This file is part of protk
4
- #
5
- #
6
-
7
- require 'protk/constants'
8
- require 'protk/tool'
9
- require 'bio'
10
- require 'protk/fastadb'
11
- require 'bio-blastxmlparser'
12
-
13
- tool=Tool.new([:explicit_output])
14
- tool.option_parser.banner = "Dump BLAST xml to tabular format.\n\nUsage: blastxml_to_table.rb blast.xml"
15
-
16
- tool.options.database=nil
17
- tool.option_parser.on( '-d filename','--database filename', 'Database used for BLAST search. If provided, hit sequences will be looked up in this database' ) do |file|
18
- tool.options.database=file
19
- end
20
-
21
- tool.options.gene2go=nil
22
- tool.option_parser.on('--gene2go pathtogene2go','Path to gene2go database. If provided GO terms will be looked up') do |gene2go|
23
- tool.options.gene2go=gene2go
24
- end
25
-
26
- tool.options.gitogeneid=nil
27
- tool.option_parser.on('--gitogeneid gitogeneid.db','Path to GDBM formatted gi to geneid mapping database. If provided gene ids will be looked up') do |gitogeneid|
28
- tool.options.gitogeneid=gitogeneid
29
- end
30
-
31
- exit unless tool.check_options
32
-
33
- #require 'debugger';debugger
34
-
35
- exit unless ARGV.length == 1
36
- input_file=ARGV[0]
37
-
38
- out_file=$stdout
39
- if ( tool.explicit_output != nil)
40
- out_file=File.open(tool.explicit_output, "w")
41
- end
42
-
43
- $fastadb = nil
44
- if tool.database
45
- $fastadb=FastaDB.new(tool.database)
46
- end
47
-
48
- $gitogeneid = nil
49
- if (tool.gitogeneid!=nil) && (File.exist? tool.gitogeneid)
50
- require 'gdbm'
51
- $gitogeneid = GDBM.new(tool.gitogeneid,flags=GDBM::READER)
52
- end
53
-
54
-
55
- $gene2go = nil
56
- if (tool.gene2go!=nil) && (File.exist? tool.gene2go)
57
- require 'gdbm'
58
- $gene2go = GDBM.new(tool.gene2go,flags=GDBM::READER)
59
- end
60
-
61
- def gi_from_hit_id(hit_id)
62
- gi_scan=hit_id.scan(/gi\|(\d+)/)
63
- gi_scan.join("")
64
- end
65
-
66
- def generate_line(hsp,hit,query,hit_seq=nil)
67
-
68
- line="#{query.query_id}\t#{query.query_def}\t#{hit.hit_id}\t#{hit.hit_num}\t#{hit.hit_def}\t#{hit.accession}\t#{hsp.hsp_num}\t#{hsp.bit_score}\t#{hsp.evalue}\t#{hsp.qseq}\t#{hsp.hseq}"
69
- if hit_seq
70
- line << "\t#{hit_seq}"
71
- end
72
- geneid=""
73
- goterm=""
74
- if $gitogeneid
75
- geneid=$gitogeneid[gi_from_hit_id(hit.hit_id)]
76
- goterm=$gene2go[geneid] if geneid!=nil && $gene2go
77
- end
78
-
79
-
80
- # throw "No geneid" if geneid==nil
81
- line << "\t#{geneid}\t#{goterm}"
82
- # require 'debugger';debugger
83
- # puts gi_from_hit_id(hit.hit_id)
84
- # puts $gene2go[gi_from_hit_id(hit.hit_id)]
85
- line<<"\n"
86
- line
87
- end
88
-
89
- def fetch_hit_seq(hit)
90
- hit_seq=nil
91
- if $fastadb
92
- hit_seq=$fastadb.fetch(hit.hit_id).first.aaseq
93
- end
94
- hit_seq
95
- end
96
-
97
- blast = Bio::BlastXMLParser::XmlSplitterIterator.new(input_file).to_enum
98
-
99
- blast.each do |query|
100
- query.hits.each do |hit|
101
- # hit=query.hits.first
102
- # if hit
103
- hit_seq=fetch_hit_seq(hit)
104
- hit.hsps.each do |hsp|
105
- out_line=generate_line(hsp,hit,query,hit_seq)
106
-
107
- out_file.write out_line
108
- end
109
- # end
110
- end
111
- end
112
-
113
-
114
- $gitogeneid.close if $gitogeneid!=nil
115
- $gene2go.close if $gene2go!=nil
116
-
117
- #require 'debugger';debugger
118
-
119
- #puts "Hi"
data/bin/correct_omssa_retention_times.rb DELETED
@@ -1,27 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # This file is part of protk
4
- # Created by Ira Cooke 14/12/2010
5
- #
6
- # Corrects retention times in omssa output
7
- #
8
-
9
- $VERBOSE=nil
10
-
11
- require 'protk/constants'
12
- require 'protk/command_runner'
13
- require 'protk/tool'
14
- require 'protk/omssa_util'
15
-
16
- # Environment with global constants
17
- #
18
- genv=Constants.new
19
-
20
- tool=Tool.new([:over_write])
21
- tool.option_parser.banner = "Correct retention times on a pepxml file produced by omssa using information from an mgf file.\n\nUsage: correct_omssa_retention_times.rb [options] file1.pep.xml file2.mgf"
22
- tool.option_parser.parse!
23
-
24
-
25
- OMSSAUtil.add_retention_times(ARGV[1],ARGV[0],tool.over_write,true)
26
-
27
-
data/bin/feature_finder.rb DELETED
@@ -1,95 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # This file is part of protk
4
- # Created by Ira Cooke 21/3/2012
5
- #
6
- # A wrapper for the OpenMS FeatureFinder tools (FeatureFinderCentroided and FeatureFinderIsotopeWavelet)
7
-
8
- require 'protk/constants'
9
- require 'protk/command_runner'
10
- require 'protk/tool'
11
- require 'protk/openms_defaults'
12
- require 'libxml'
13
- require 'tempfile'
14
-
15
- include LibXML
16
-
17
- tool=Tool.new([:explicit_output, :background,:over_write,:prefix_suffix])
18
- tool.option_parser.banner = "Find molecular features on a set of input files.\n\nUsage: feature_finder.rb [options] file1.mzML file2.mzML ..."
19
-
20
- tool.options.intensity_type = "ref"
21
- tool.option_parser.on( '--intensity-type type',"method used to calculate intensities (ref,trans,corrected). Default = ref. See OpenMS documentation for details" ) do |type|
22
- tool.options.intensity_type = type
23
- end
24
-
25
- tool.options.intensity_threshold = "3"
26
- tool.option_parser.on( '--intensity-threshold thresh',"discard features below this intensity (Default=3). Set to -1 to retain all detected features" ) do |thresh|
27
- tool.options.intensity_threshold = thresh
28
- end
29
-
30
-
31
- exit unless tool.check_options
32
-
33
- if ( ARGV[0].nil? )
34
- puts "You must supply an input file"
35
- puts tool.option_parser
36
- exit
37
- end
38
-
39
- # Obtain a global environment object
40
- genv=Constants.new
41
-
42
- def run_ff(genv,tool,cmd,output_path,jobid)
43
- if ( !tool.over_write && Pathname.new(output_path).exist? )
44
- genv.log("Skipping analysis on existing file #{output_path}",:warn)
45
- else
46
- jobscript_path="#{output_path}.pbs.sh"
47
- job_params={:jobid=>jobid, :vmem=>"14Gb", :queue => "sixteen"}
48
- code=tool.run(cmd,genv,job_params,jobscript_path)
49
- throw "Command failed with exit code #{code}" unless code==0
50
- end
51
- end
52
-
53
- def generate_ini(tool,out_path)
54
- base_ini_file=OpenMSDefaults.new.featurefinderisotopewavelet
55
- parser = XML::Parser.file(base_ini_file)
56
- doc = parser.parse
57
- intensity_threshold_node = doc.find('//ITEM[@name="intensity_threshold"]')[0]
58
- intensity_type_node = doc.find('//ITEM[@name="intensity_type"]')[0]
59
- intensity_threshold_node['value']=tool.intensity_threshold
60
- intensity_type_node['value']=tool.intensity_type
61
- doc.save(out_path)
62
- end
63
-
64
- throw "Cannot use explicit output in combination with multiple input files" if ( tool.explicit_output && ARGV.length>1)
65
-
66
- input_basename=Pathname.new(ARGV[0]).basename.to_s
67
- ini_file_name="#{Pathname.new(Tempfile.new(input_basename).path).basename.to_s}_feature_finder.ini"
68
- ini_file="#{Pathname.new(ini_file_name).dirname.realpath.to_s}/#{ini_file_name}"
69
-
70
- generate_ini(tool,ini_file)
71
-
72
- ARGV.each do |filen|
73
- input_file=filen.chomp
74
- throw "Input must be an mzML file" unless input_file=~/\.mzML$/
75
-
76
- input_basename=input_file.gsub(/\.mzML$/,'')
77
- output_dir=Pathname.new(input_basename).dirname.realpath.to_s
78
- output_base=Pathname.new(input_basename).basename.to_s
79
- output_file = "#{output_dir}/#{tool.output_prefix}#{output_base}#{tool.output_suffix}.featureXML"
80
- if ( tool.explicit_output )
81
- output_file = "#{output_dir}/#{tool.explicit_output}"
82
- end
83
-
84
- if ( tool.over_write || !Pathname.new(output_file).exist? )
85
- output_base_filename=Pathname.new(output_file).basename.to_s
86
- cmd=""
87
- cmd<<"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:#{genv.openms_root}/lib;
88
- #{genv.featurefinderisotopewavelet} -in #{Pathname.new(input_file).realpath.to_s} -out #{output_dir}/#{output_base_filename} -ini #{ini_file}"
89
-
90
- run_ff(genv,tool,cmd,output_file,tool.jobid_from_filename(input_basename))
91
-
92
- else
93
- genv.log("Skipping search on existing file #{output_file}",:warn)
94
- end
95
- end
data/bin/file_convert.rb DELETED
@@ -1,164 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # This file is part of protk
4
- # Created by Ira Cooke 14/12/2010
5
- #
6
- # Wrapper for msconvert
7
- #
8
-
9
- require 'protk/constants'
10
- require 'protk/command_runner'
11
- require 'protk/tool'
12
- require 'tempfile'
13
- require 'libxml'
14
-
15
- include LibXML
16
-
17
- # Regex for cleaning mgf sed -i.bak 's/\(PEPMASS=[0-9]*.[0-9]*\)[ \t]*[0-9]*/\1/g'
18
-
19
- # Read the input file and search for an instance of the charge state cvParam inside a precursor tag. Return true if one is found. False otherwise
20
- #
21
- def has_charge_information(input_filename)
22
- #<precursorList count="1">
23
- # <precursor spectrumRef="controllerType=0 controllerNumber=1 scan=59">
24
- # <isolationWindow>
25
- # <cvParam cvRef="MS" accession="MS:1000827" name="isolation window target m/z" value="939.43" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
26
- # <cvParam cvRef="MS" accession="MS:1000828" name="isolation window lower offset" value="2.0" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
27
- # <cvParam cvRef="MS" accession="MS:1000829" name="isolation window upper offset" value="2.0" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
28
- # </isolationWindow>
29
- # <selectedIonList count="1">
30
- # <selectedIon>
31
- # <cvParam cvRef="MS" accession="MS:1000744" name="selected ion m/z" value="939.432189941406" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
32
- # <cvParam cvRef="MS" accession="MS:1000041" name="charge state" value="2"/>
33
- # <cvParam cvRef="MS" accession="MS:1000042" name="peak intensity" value="1321.692016601563" unitCvRef="MS" unitAccession="MS:1000131" unitName="number of counts"/>
34
- # </selectedIon>
35
- # </selectedIonList>
36
-
37
- reader=XML::Reader.file(input_filename)
38
-
39
- while(reader.read)
40
-
41
- if ( reader.local_name=="precursor")
42
-
43
- subdoc=reader.read_inner_xml
44
-
45
- if ( subdoc =~ /MS:1000041/ )
46
- return true
47
- end
48
-
49
- end
50
-
51
- end
52
-
53
- return false
54
-
55
- end
56
-
57
-
58
-
59
- # Setup specific command-line options for this tool. Other options are inherited from Tool
60
- #
61
- convert_tool=Tool.new([:explicit_output,:over_write,:maldi])
62
- convert_tool.option_parser.banner = "Convert files between different formats.\n\nUsage: file_convert.rb [options] input_file output_file"
63
-
64
- # Special case (usually tool specific options use capitals). Use lowercase l here to mimick maldi option in the search_tool class
65
- #
66
- convert_tool.options.maldi=false
67
- convert_tool.option_parser.on( '-l', '--maldi', 'Input Files are MALDI Spectra' ) do
68
- convert_tool.options.maldi=true
69
- end
70
-
71
- convert_tool.options.output_format="mgf"
72
- convert_tool.option_parser.on( '-F', '--format fmt', 'Convert to a specified format' ) do |fmt|
73
- convert_tool.options.output_format=fmt
74
- end
75
-
76
- #convert_tool.options.missing_charge_state="false"
77
- #convert_tool.option_parser.on( '-C', '--missing-charges', 'No attempt will be made to write charge states. Leads to better looking spectrum names' ) do |fmt|
78
- # convert_tool.options.output_format=fmt
79
- #end
80
- #end
81
-
82
-
83
-
84
- exit unless convert_tool.check_options
85
-
86
- if ( ARGV[0].nil? )
87
- puts "You must supply an input file"
88
- puts convert_tool.option_parser
89
- exit
90
- end
91
-
92
-
93
-
94
- # Environment with global constants
95
- #
96
- genv=Constants.new
97
-
98
- filename=ARGV[0]
99
-
100
-
101
- input_ext=Pathname.new(filename).extname
102
- input_relative_filename=Pathname.new(filename).basename.to_s
103
-
104
- base_output_dir=Pathname.new(filename).dirname.realpath.to_s #Default output dir is input dir
105
-
106
- output_basename=input_relative_filename.gsub(/#{input_ext}$/,"").to_s
107
-
108
- if ( convert_tool.explicit_output )
109
- output_filepath=Pathname.new(convert_tool.explicit_output)
110
- base_output_dir=output_filepath.dirname.to_s
111
-
112
- if ( convert_tool.explicit_output=~/^\//) # It's an absolute path so use absolute path as output dir
113
- # Convert base_output_dir to realpath
114
- #
115
- base_output_dir=Pathname.new(base_output_dir).realpath.to_s
116
- end
117
-
118
- output_filename=output_filepath.basename.to_s
119
-
120
- end
121
-
122
- # Create a uniquely named directory to hold the output. This is the only way to know the output of msconvert
123
- #
124
- output_dir="#{base_output_dir}/#{Pathname.new(Tempfile.new("file_convert").path).basename.to_s}"
125
- Dir.mkdir(output_dir)
126
-
127
-
128
- throw "Input format is the same as output format" if ( input_ext==".#{convert_tool.output_format}" )
129
-
130
- genv.log("Converting #{filename} to #{convert_tool.output_format}",:info)
131
- runner=CommandRunner.new(genv)
132
- basedir=Pathname.new(filename).dirname.to_s #Where we run the tool
133
-
134
- if ( convert_tool.maldi )
135
- #For MALDI we know the charge is 1 so set it explicitly. Sometimes it is missing from the data
136
- runner.run_local("cd #{basedir}; #{genv.msconvert} #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.1\" --#{convert_tool.output_format} -o #{output_dir}")
137
- else
138
- if ( has_charge_information(filename) )
139
- runner.run_local("cd #{basedir}; #{genv.msconvert} #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.<ChargeState>\" --#{convert_tool.output_format} -o #{output_dir}")
140
- else
141
- # If input file is missing charges the best we can do is just assign charge=1. Search engines can choose to ignore this value anyway.
142
- #
143
- runner.run_local("cd #{basedir}; #{genv.msconvert} #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.1\" --#{convert_tool.output_format} -o #{output_dir}")
144
- end
145
- end
146
-
147
- # Find out what the output name was
148
- #
149
- tmp_output_filename=""
150
- Dir.foreach(output_dir) { |entry_name|
151
- if ( entry_name=~/^\.$/ || entry_name=~/^\.\.$/ )
152
- else
153
- tmp_output_filename=entry_name
154
- end
155
- }
156
-
157
- # Cleanup after converting
158
- cmd = "cd #{output_dir};pwd; mv #{tmp_output_filename} #{base_output_dir}/#{output_filename}; cd ../; pwd;rm -r #{output_dir}"
159
-
160
- code =runner.run_local(cmd)
161
-
162
- throw "Command failed with exit code #{code}" unless code==0
163
-
164
- throw "Failed to create output file #{base_output_dir}/#{output_filename}" unless ( FileTest.exists?("#{base_output_dir}/#{output_filename}") )
data/bin/generate_omssa_loc.rb DELETED
@@ -1,42 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # This file is part of MSLIMS
4
- # Created by Ira Cooke 12/4/2010
5
- #
6
- # Generates files required by the omssa galaxy wrapper
7
- #
8
-
9
- require 'protk/constants'
10
- # Environment with global constants
11
- #
12
- genv=Constants.new
13
-
14
- # Set search engine specific parameters on the SearchTool object
15
- #
16
- omssa_root="#{genv.omssa_root}/omssacl"
17
- # Get ommssa to print out a list of its acceptable modifications
18
- acceptable_mods=%x[#{omssa_root} -ml].split(/\n/).collect do |mod|
19
-
20
- mod_vals=mod.split(":")
21
- [mod_vals[0].lstrip.rstrip,mod_vals[1].lstrip.rstrip]
22
-
23
- end
24
-
25
- # Drop the header
26
- #
27
- acceptable_mods.shift
28
-
29
- loc_output=File.new("omssa_mods.loc",'w')
30
-
31
- loc_output << "#This file lists the names of chemical modifications accepted by OMMSA\n"
32
- loc_output << "#\n"
33
- loc_output << "#\n"
34
-
35
- acceptable_mods.each { |am|
36
- key = am[1].downcase.gsub(" ","").gsub("\(","\_").gsub("\)","\_").gsub("\:","\_").gsub("\-\>","\_")
37
- loc_output << "#{am[1]}\t#{key}_\t#{am[0]}\t#{key}_\n"
38
- }
39
-
40
- loc_output.close
41
-
42
-
data/bin/gffmerge.rb DELETED
@@ -1,208 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # This file is part of protk
4
- # Original python version created by Max Grant
5
- # Translated to ruby by Ira Cooke 29/1/2013
6
- #
7
- #
8
-
9
- require 'protk/constants'
10
- require 'protk/tool'
11
- require 'protk/fastadb'
12
- require 'libxml'
13
- require 'bio'
14
-
15
- include LibXML
16
-
17
- tool=Tool.new([:explicit_output])
18
- tool.option_parser.banner = "Create a gff containing peptide observations.\n\nUsage: gffmerge.rb "
19
-
20
-
21
- tool.options.gff_predicted=nil
22
- tool.option_parser.on( '-g filename','--gff filename', 'Predicted Data (GFF3 Format)' ) do |file|
23
- tool.options.gff_predicted=file
24
- end
25
-
26
- tool.options.protxml=nil
27
- tool.option_parser.on( '-p filename','--protxml filename', 'Observed Data (ProtXML Format)' ) do |file|
28
- tool.options.protxml=file
29
- end
30
-
31
- tool.options.sixframe=nil
32
- tool.option_parser.on( '-t filename','--sixframe filename', 'Sixframe Translations (Fasta Format)' ) do |file|
33
- tool.options.sixframe=file
34
- end
35
-
36
- tool.options.skip_fasta_indexing=false
37
- tool.option_parser.on('--skip-index','Don\'t index sixframe translations (Index should already exist)') do
38
- tool.options.skip_fasta_indexing=true
39
- end
40
-
41
- tool.options.peptide_probability_threshold=0.95
42
- tool.option_parser.on('--threshold prob','Peptide Probability Threshold (Default 0.95)') do |thresh|
43
- tool.options.peptide_probability_threshold=thresh.to_f
44
- end
45
-
46
- exit unless tool.check_options [:protxml,:sixframe]
47
-
48
- gff_out_file="merged.gff"
49
- if ( tool.explicit_output != nil)
50
- gff_out_file=tool.explicit_output
51
- end
52
-
53
- gff_db = Bio::GFF.new()
54
- if ( tool.gff_predicted !=nil)
55
- p "Reading source gff file"
56
- gff_db = Bio::GFF::GFF3.new(File.open(tool.gff_predicted))
57
- # p gff_db.records[1].attributes
58
- # exit
59
- end
60
-
61
- f = open(gff_out_file,'w+')
62
- gff_db.records.each { |rec|
63
- f.write(rec.to_s)
64
- }
65
-
66
- p "Parsing proteins from protxml"
67
- protxml_parser=XML::Parser.file(tool.protxml)
68
- protxml_doc=protxml_parser.parse
69
- proteins = protxml_doc.find('.//protxml:protein','protxml:http://regis-web.systemsbiology.net/protXML')
70
-
71
-
72
- db_filename = nil
73
- case
74
- when Pathname.new(tool.sixframe).exist? # It's an explicitly named db
75
- db_filename = Pathname.new(tool.sixframe).realpath.to_s
76
- else
77
- db_filename=Constants.new.current_database_for_name(tool.sixframe)
78
- end
79
-
80
- db_indexfilename = "#{db_filename}.pin"
81
-
82
- if File.exist?(db_indexfilename)
83
- p "Using existing indexed translations"
84
- orf_lookup = FastaDB.new(db_filename)
85
- else
86
- p "Indexing sixframe translations"
87
- orf_lookup = FastaDB.create(db_filename,db_filename,'prot')
88
- end
89
-
90
- p "Aligning peptides and writing GFF data..."
91
- low_prob = 0
92
- skipped = 0
93
- peptide_count = 0
94
- protein_count = 0
95
- total_peptides = 0
96
- for prot in proteins
97
- prot_prob = prot['probability']
98
- if ( prot_prob.to_f < tool.peptide_probability_threshold )
99
- next
100
- end
101
- indis_proteins = prot.find('protxml:indistinguishable_protein','protxml:http://regis-web.systemsbiology.net/protXML')
102
- prot_names = [prot['protein_name']]
103
- for protein in indis_proteins
104
- prot_names += [protein['protein_name']]
105
- end
106
-
107
- peptides = prot.find('protxml:peptide','protxml:http://regis-web.systemsbiology.net/protXML')
108
-
109
- for protein_name in prot_names
110
- protein_count += 1
111
- prot_qualifiers = {"source" => "OBSERVATION", "score" => prot_prob, "ID" => 'pr' + protein_count.to_s}
112
- begin
113
- puts "Looking up #{protein_name}"
114
- orf = orf_lookup.get_by_id protein_name
115
- if ( orf == nil)
116
- puts "Failed lookup for #{protein_name}"
117
- raise KeyError
118
- end
119
-
120
-
121
- position = orf.identifiers.description.split('|').collect { |pos| pos.to_i }
122
-
123
- if ( position.length != 2 )
124
- puts "Badly formatted entry #{orf}"
125
- raise EncodingError
126
- end
127
- orf_name = orf.entry_id.scan(/lcl\|(.*)/)[0][0]
128
- frame=orf_name.scan(/frame_(\d)/)[0][0]
129
- scaffold_name = orf_name.scan(/(scaffold_?\d+)_/)[0][0]
130
-
131
- strand = (frame.to_i > 3) ? '-' : '+'
132
- # strand = +1
133
-
134
- prot_id = "pr#{protein_count.to_s}"
135
- prot_attributes = [["ID",prot_id],["Name",orf_name]]
136
- prot_gff_line = Bio::GFF::GFF3::Record.new(seqid = scaffold_name,source="OBSERVATION",feature_type="protein",
137
- start_position=position[0]+1,end_position=position[1],score=prot_prob,strand=strand,frame=nil,attributes=prot_attributes)
138
- gff_db.records += ["##gff-version 3\n","##sequence-region #{scaffold_name} 1 160\n",prot_gff_line]
139
-
140
- prot_seq = orf.aaseq.to_s
141
- throw "Not amino_acids" if prot_seq != orf.seq.to_s
142
-
143
- if ( strand=='-' )
144
- prot_seq.reverse!
145
- end
146
-
147
- for peptide in peptides
148
- pprob = peptide['nsp_adjusted_probability'].to_f
149
- if ( pprob >= tool.peptide_probability_threshold )
150
- total_peptides += 1
151
- pep_seq = peptide['peptide_sequence']
152
-
153
- if ( strand=='-')
154
- pep_seq.reverse!
155
- end
156
-
157
- start_indexes = [0]
158
- prot_seq.scan /#{pep_seq}/ do |match|
159
- start_indexes << prot_seq.index(match,start_indexes.last)
160
- end
161
- start_indexes.delete_at(0)
162
-
163
- # Now convert peptide coordinate to genome coordinates
164
- # And create gff lines for each match
165
- start_indexes.collect do |si|
166
- pep_genomic_start = position[0] + 3*si
167
- pep_genomic_end = pep_genomic_start + 3*pep_seq.length - 1
168
- peptide_count+=1
169
- pep_id = "p#{peptide_count.to_s}"
170
- pep_attributes = [["ID",pep_id],["Parent",prot_id]]
171
- pep_gff_line = Bio::GFF::GFF3::Record.new(seqid = scaffold_name,source="OBSERVATION",
172
- feature_type="peptide",start_position=pep_genomic_start,end_position=pep_genomic_end,score=pprob,
173
- strand=strand,frame=nil,attributes=pep_attributes)
174
- fragment_gff_line = Bio::GFF::GFF3::Record.new(seqid = scaffold_name,source="OBSERVATION",
175
- feature_type="fragment",start_position=pep_genomic_start,end_position=pep_genomic_end,score='',
176
- strand=strand,frame=nil,attributes=[["Parent",pep_id],["ID",peptide['peptide_sequence']]])
177
- gff_db.records += [pep_gff_line,fragment_gff_line]
178
-
179
- end
180
-
181
-
182
- end
183
- end
184
-
185
- rescue KeyError,EncodingError
186
- skipped+=0
187
- end
188
-
189
- # p orf_name
190
- # p prot_gff_line
191
- # exit
192
- end
193
-
194
- end
195
-
196
- f = open(gff_out_file,'w+')
197
- gff_db.records.each { |rec|
198
- f.write(rec.to_s)
199
- }
200
- f.close
201
-
202
- p "Finished."
203
- p "Proteins: #{protein_count}"
204
- p "Skipped Decoys: #{skipped}"
205
- p "Total Peptides: #{total_peptides}"
206
- p "Peptides Written: #{total_peptides - low_prob}"
207
- p "Peptides Culled: #{low_prob}"
208
- exit(0)