protk 1.2.6.pre5 → 1.3.0.pre1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +4 -4
  2. data/README.md +84 -45
  3. data/bin/add_retention_times.rb +9 -5
  4. data/bin/augustus_to_proteindb.rb +7 -11
  5. data/bin/interprophet.rb +28 -46
  6. data/bin/make_decoy.rb +16 -48
  7. data/bin/mascot_search.rb +57 -71
  8. data/bin/mascot_to_pepxml.rb +13 -26
  9. data/bin/msgfplus_search.rb +70 -107
  10. data/bin/omssa_search.rb +52 -109
  11. data/bin/peptide_prophet.rb +44 -119
  12. data/bin/pepxml_to_table.rb +24 -27
  13. data/bin/protein_prophet.rb +22 -82
  14. data/bin/protxml_to_gff.rb +22 -519
  15. data/bin/protxml_to_table.rb +2 -16
  16. data/bin/sixframe.rb +10 -32
  17. data/bin/tandem_search.rb +30 -403
  18. data/bin/tandem_to_pepxml.rb +43 -0
  19. data/bin/unimod_to_loc.rb +1 -1
  20. data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
  21. data/ext/decoymaker/extconf.rb +3 -0
  22. data/lib/protk/constants.rb +16 -2
  23. data/lib/protk/data/default_config.yml +2 -1
  24. data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
  25. data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
  26. data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
  27. data/lib/protk/data/tandem_params.xml +17 -54
  28. data/lib/protk/fastadb.rb +2 -2
  29. data/lib/protk/prophet_tool.rb +1 -1
  30. data/lib/protk/protxml_to_gff_tool.rb +474 -0
  31. data/lib/protk/search_tool.rb +58 -103
  32. data/lib/protk/setup_rakefile.rake +9 -5
  33. data/lib/protk/tandem_search_tool.rb +256 -0
  34. data/lib/protk/tool.rb +85 -104
  35. data/lib/protk.rb +1 -6
  36. metadata +24 -103
  37. data/bin/annotate_ids.rb +0 -59
  38. data/bin/asapratio.rb +0 -27
  39. data/bin/blastxml_to_table.rb +0 -119
  40. data/bin/correct_omssa_retention_times.rb +0 -27
  41. data/bin/feature_finder.rb +0 -95
  42. data/bin/file_convert.rb +0 -164
  43. data/bin/generate_omssa_loc.rb +0 -42
  44. data/bin/gffmerge.rb +0 -208
  45. data/bin/libra.rb +0 -70
  46. data/bin/toppas_pipeline.rb +0 -84
  47. data/bin/uniprot_annotation.rb +0 -141
  48. data/bin/xls_to_table.rb +0 -52
  49. data/bin/xpress.rb +0 -27
  50. data/ext/protk/decoymaker/extconf.rb +0 -3
  51. data/ext/protk/simplealign/extconf.rb +0 -3
  52. data/lib/protk/biotools_excel_converter.rb +0 -60
  53. data/lib/protk/eupathdb_gene_information_table.rb +0 -158
  54. data/lib/protk/gapped_aligner.rb +0 -264
  55. data/lib/protk/protein_annotator.rb +0 -646
  56. data/lib/protk/spreadsheet_extensions.rb +0 -79
  57. data/lib/protk/xtandem_defaults.rb +0 -11
data/bin/blastxml_to_table.rb DELETED
@@ -1,119 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # This file is part of protk
4
- #
5
- #
6
-
7
- require 'protk/constants'
8
- require 'protk/tool'
9
- require 'bio'
10
- require 'protk/fastadb'
11
- require 'bio-blastxmlparser'
12
-
13
- tool=Tool.new([:explicit_output])
14
- tool.option_parser.banner = "Dump BLAST xml to tabular format.\n\nUsage: blastxml_to_table.rb blast.xml"
15
-
16
- tool.options.database=nil
17
- tool.option_parser.on( '-d filename','--database filename', 'Database used for BLAST search. If provided, hit sequences will be looked up in this database' ) do |file|
18
- tool.options.database=file
19
- end
20
-
21
- tool.options.gene2go=nil
22
- tool.option_parser.on('--gene2go pathtogene2go','Path to gene2go database. If provided GO terms will be looked up') do |gene2go|
23
- tool.options.gene2go=gene2go
24
- end
25
-
26
- tool.options.gitogeneid=nil
27
- tool.option_parser.on('--gitogeneid gitogeneid.db','Path to GDBM formatted gi to geneid mapping database. If provided gene ids will be looked up') do |gitogeneid|
28
- tool.options.gitogeneid=gitogeneid
29
- end
30
-
31
- exit unless tool.check_options
32
-
33
- #require 'debugger';debugger
34
-
35
- exit unless ARGV.length == 1
36
- input_file=ARGV[0]
37
-
38
- out_file=$stdout
39
- if ( tool.explicit_output != nil)
40
- out_file=File.open(tool.explicit_output, "w")
41
- end
42
-
43
- $fastadb = nil
44
- if tool.database
45
- $fastadb=FastaDB.new(tool.database)
46
- end
47
-
48
- $gitogeneid = nil
49
- if (tool.gitogeneid!=nil) && (File.exist? tool.gitogeneid)
50
- require 'gdbm'
51
- $gitogeneid = GDBM.new(tool.gitogeneid,flags=GDBM::READER)
52
- end
53
-
54
-
55
- $gene2go = nil
56
- if (tool.gene2go!=nil) && (File.exist? tool.gene2go)
57
- require 'gdbm'
58
- $gene2go = GDBM.new(tool.gene2go,flags=GDBM::READER)
59
- end
60
-
61
- def gi_from_hit_id(hit_id)
62
- gi_scan=hit_id.scan(/gi\|(\d+)/)
63
- gi_scan.join("")
64
- end
65
-
66
- def generate_line(hsp,hit,query,hit_seq=nil)
67
-
68
- line="#{query.query_id}\t#{query.query_def}\t#{hit.hit_id}\t#{hit.hit_num}\t#{hit.hit_def}\t#{hit.accession}\t#{hsp.hsp_num}\t#{hsp.bit_score}\t#{hsp.evalue}\t#{hsp.qseq}\t#{hsp.hseq}"
69
- if hit_seq
70
- line << "\t#{hit_seq}"
71
- end
72
- geneid=""
73
- goterm=""
74
- if $gitogeneid
75
- geneid=$gitogeneid[gi_from_hit_id(hit.hit_id)]
76
- goterm=$gene2go[geneid] if geneid!=nil && $gene2go
77
- end
78
-
79
-
80
- # throw "No geneid" if geneid==nil
81
- line << "\t#{geneid}\t#{goterm}"
82
- # require 'debugger';debugger
83
- # puts gi_from_hit_id(hit.hit_id)
84
- # puts $gene2go[gi_from_hit_id(hit.hit_id)]
85
- line<<"\n"
86
- line
87
- end
88
-
89
- def fetch_hit_seq(hit)
90
- hit_seq=nil
91
- if $fastadb
92
- hit_seq=$fastadb.fetch(hit.hit_id).first.aaseq
93
- end
94
- hit_seq
95
- end
96
-
97
- blast = Bio::BlastXMLParser::XmlSplitterIterator.new(input_file).to_enum
98
-
99
- blast.each do |query|
100
- query.hits.each do |hit|
101
- # hit=query.hits.first
102
- # if hit
103
- hit_seq=fetch_hit_seq(hit)
104
- hit.hsps.each do |hsp|
105
- out_line=generate_line(hsp,hit,query,hit_seq)
106
-
107
- out_file.write out_line
108
- end
109
- # end
110
- end
111
- end
112
-
113
-
114
- $gitogeneid.close if $gitogeneid!=nil
115
- $gene2go.close if $gene2go!=nil
116
-
117
- #require 'debugger';debugger
118
-
119
- #puts "Hi"
data/bin/correct_omssa_retention_times.rb DELETED
@@ -1,27 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # This file is part of protk
4
- # Created by Ira Cooke 14/12/2010
5
- #
6
- # Corrects retention times in omssa output
7
- #
8
-
9
- $VERBOSE=nil
10
-
11
- require 'protk/constants'
12
- require 'protk/command_runner'
13
- require 'protk/tool'
14
- require 'protk/omssa_util'
15
-
16
- # Environment with global constants
17
- #
18
- genv=Constants.new
19
-
20
- tool=Tool.new([:over_write])
21
- tool.option_parser.banner = "Correct retention times on a pepxml file produced by omssa using information from an mgf file.\n\nUsage: correct_omssa_retention_times.rb [options] file1.pep.xml file2.mgf"
22
- tool.option_parser.parse!
23
-
24
-
25
- OMSSAUtil.add_retention_times(ARGV[1],ARGV[0],tool.over_write,true)
26
-
27
-
data/bin/feature_finder.rb DELETED
@@ -1,95 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # This file is part of protk
4
- # Created by Ira Cooke 21/3/2012
5
- #
6
- # A wrapper for the OpenMS FeatureFinder tools (FeatureFinderCentroided and FeatureFinderIsotopeWavelet)
7
-
8
- require 'protk/constants'
9
- require 'protk/command_runner'
10
- require 'protk/tool'
11
- require 'protk/openms_defaults'
12
- require 'libxml'
13
- require 'tempfile'
14
-
15
- include LibXML
16
-
17
- tool=Tool.new([:explicit_output, :background,:over_write,:prefix_suffix])
18
- tool.option_parser.banner = "Find molecular features on a set of input files.\n\nUsage: feature_finder.rb [options] file1.mzML file2.mzML ..."
19
-
20
- tool.options.intensity_type = "ref"
21
- tool.option_parser.on( '--intensity-type type',"method used to calculate intensities (ref,trans,corrected). Default = ref. See OpenMS documentation for details" ) do |type|
22
- tool.options.intensity_type = type
23
- end
24
-
25
- tool.options.intensity_threshold = "3"
26
- tool.option_parser.on( '--intensity-threshold thresh',"discard features below this intensity (Default=3). Set to -1 to retain all detected features" ) do |thresh|
27
- tool.options.intensity_threshold = thresh
28
- end
29
-
30
-
31
- exit unless tool.check_options
32
-
33
- if ( ARGV[0].nil? )
34
- puts "You must supply an input file"
35
- puts tool.option_parser
36
- exit
37
- end
38
-
39
- # Obtain a global environment object
40
- genv=Constants.new
41
-
42
- def run_ff(genv,tool,cmd,output_path,jobid)
43
- if ( !tool.over_write && Pathname.new(output_path).exist? )
44
- genv.log("Skipping analysis on existing file #{output_path}",:warn)
45
- else
46
- jobscript_path="#{output_path}.pbs.sh"
47
- job_params={:jobid=>jobid, :vmem=>"14Gb", :queue => "sixteen"}
48
- code=tool.run(cmd,genv,job_params,jobscript_path)
49
- throw "Command failed with exit code #{code}" unless code==0
50
- end
51
- end
52
-
53
- def generate_ini(tool,out_path)
54
- base_ini_file=OpenMSDefaults.new.featurefinderisotopewavelet
55
- parser = XML::Parser.file(base_ini_file)
56
- doc = parser.parse
57
- intensity_threshold_node = doc.find('//ITEM[@name="intensity_threshold"]')[0]
58
- intensity_type_node = doc.find('//ITEM[@name="intensity_type"]')[0]
59
- intensity_threshold_node['value']=tool.intensity_threshold
60
- intensity_type_node['value']=tool.intensity_type
61
- doc.save(out_path)
62
- end
63
-
64
- throw "Cannot use explicit output in combination with multiple input files" if ( tool.explicit_output && ARGV.length>1)
65
-
66
- input_basename=Pathname.new(ARGV[0]).basename.to_s
67
- ini_file_name="#{Pathname.new(Tempfile.new(input_basename).path).basename.to_s}_feature_finder.ini"
68
- ini_file="#{Pathname.new(ini_file_name).dirname.realpath.to_s}/#{ini_file_name}"
69
-
70
- generate_ini(tool,ini_file)
71
-
72
- ARGV.each do |filen|
73
- input_file=filen.chomp
74
- throw "Input must be an mzML file" unless input_file=~/\.mzML$/
75
-
76
- input_basename=input_file.gsub(/\.mzML$/,'')
77
- output_dir=Pathname.new(input_basename).dirname.realpath.to_s
78
- output_base=Pathname.new(input_basename).basename.to_s
79
- output_file = "#{output_dir}/#{tool.output_prefix}#{output_base}#{tool.output_suffix}.featureXML"
80
- if ( tool.explicit_output )
81
- output_file = "#{output_dir}/#{tool.explicit_output}"
82
- end
83
-
84
- if ( tool.over_write || !Pathname.new(output_file).exist? )
85
- output_base_filename=Pathname.new(output_file).basename.to_s
86
- cmd=""
87
- cmd<<"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:#{genv.openms_root}/lib;
88
- #{genv.featurefinderisotopewavelet} -in #{Pathname.new(input_file).realpath.to_s} -out #{output_dir}/#{output_base_filename} -ini #{ini_file}"
89
-
90
- run_ff(genv,tool,cmd,output_file,tool.jobid_from_filename(input_basename))
91
-
92
- else
93
- genv.log("Skipping search on existing file #{output_file}",:warn)
94
- end
95
- end
data/bin/file_convert.rb DELETED
@@ -1,164 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # This file is part of protk
4
- # Created by Ira Cooke 14/12/2010
5
- #
6
- # Wrapper for msconvert
7
- #
8
-
9
- require 'protk/constants'
10
- require 'protk/command_runner'
11
- require 'protk/tool'
12
- require 'tempfile'
13
- require 'libxml'
14
-
15
- include LibXML
16
-
17
- # Regex for cleaning mgf sed -i.bak 's/\(PEPMASS=[0-9]*.[0-9]*\)[ \t]*[0-9]*/\1/g'
18
-
19
- # Read the input file and search for an instance of the charge state cvParam inside a precursor tag. Return true if one is found. False otherwise
20
- #
21
- def has_charge_information(input_filename)
22
- #<precursorList count="1">
23
- # <precursor spectrumRef="controllerType=0 controllerNumber=1 scan=59">
24
- # <isolationWindow>
25
- # <cvParam cvRef="MS" accession="MS:1000827" name="isolation window target m/z" value="939.43" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
26
- # <cvParam cvRef="MS" accession="MS:1000828" name="isolation window lower offset" value="2.0" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
27
- # <cvParam cvRef="MS" accession="MS:1000829" name="isolation window upper offset" value="2.0" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
28
- # </isolationWindow>
29
- # <selectedIonList count="1">
30
- # <selectedIon>
31
- # <cvParam cvRef="MS" accession="MS:1000744" name="selected ion m/z" value="939.432189941406" unitCvRef="MS" unitAccession="MS:1000040" unitName="m/z"/>
32
- # <cvParam cvRef="MS" accession="MS:1000041" name="charge state" value="2"/>
33
- # <cvParam cvRef="MS" accession="MS:1000042" name="peak intensity" value="1321.692016601563" unitCvRef="MS" unitAccession="MS:1000131" unitName="number of counts"/>
34
- # </selectedIon>
35
- # </selectedIonList>
36
-
37
- reader=XML::Reader.file(input_filename)
38
-
39
- while(reader.read)
40
-
41
- if ( reader.local_name=="precursor")
42
-
43
- subdoc=reader.read_inner_xml
44
-
45
- if ( subdoc =~ /MS:1000041/ )
46
- return true
47
- end
48
-
49
- end
50
-
51
- end
52
-
53
- return false
54
-
55
- end
56
-
57
-
58
-
59
- # Setup specific command-line options for this tool. Other options are inherited from Tool
60
- #
61
- convert_tool=Tool.new([:explicit_output,:over_write,:maldi])
62
- convert_tool.option_parser.banner = "Convert files between different formats.\n\nUsage: file_convert.rb [options] input_file output_file"
63
-
64
- # Special case (usually tool specific options use capitals). Use lowercase l here to mimick maldi option in the search_tool class
65
- #
66
- convert_tool.options.maldi=false
67
- convert_tool.option_parser.on( '-l', '--maldi', 'Input Files are MALDI Spectra' ) do
68
- convert_tool.options.maldi=true
69
- end
70
-
71
- convert_tool.options.output_format="mgf"
72
- convert_tool.option_parser.on( '-F', '--format fmt', 'Convert to a specified format' ) do |fmt|
73
- convert_tool.options.output_format=fmt
74
- end
75
-
76
- #convert_tool.options.missing_charge_state="false"
77
- #convert_tool.option_parser.on( '-C', '--missing-charges', 'No attempt will be made to write charge states. Leads to better looking spectrum names' ) do |fmt|
78
- # convert_tool.options.output_format=fmt
79
- #end
80
- #end
81
-
82
-
83
-
84
- exit unless convert_tool.check_options
85
-
86
- if ( ARGV[0].nil? )
87
- puts "You must supply an input file"
88
- puts convert_tool.option_parser
89
- exit
90
- end
91
-
92
-
93
-
94
- # Environment with global constants
95
- #
96
- genv=Constants.new
97
-
98
- filename=ARGV[0]
99
-
100
-
101
- input_ext=Pathname.new(filename).extname
102
- input_relative_filename=Pathname.new(filename).basename.to_s
103
-
104
- base_output_dir=Pathname.new(filename).dirname.realpath.to_s #Default output dir is input dir
105
-
106
- output_basename=input_relative_filename.gsub(/#{input_ext}$/,"").to_s
107
-
108
- if ( convert_tool.explicit_output )
109
- output_filepath=Pathname.new(convert_tool.explicit_output)
110
- base_output_dir=output_filepath.dirname.to_s
111
-
112
- if ( convert_tool.explicit_output=~/^\//) # It's an absolute path so use absolute path as output dir
113
- # Convert base_output_dir to realpath
114
- #
115
- base_output_dir=Pathname.new(base_output_dir).realpath.to_s
116
- end
117
-
118
- output_filename=output_filepath.basename.to_s
119
-
120
- end
121
-
122
- # Create a uniquely named directory to hold the output. This is the only way to know the output of msconvert
123
- #
124
- output_dir="#{base_output_dir}/#{Pathname.new(Tempfile.new("file_convert").path).basename.to_s}"
125
- Dir.mkdir(output_dir)
126
-
127
-
128
- throw "Input format is the same as output format" if ( input_ext==".#{convert_tool.output_format}" )
129
-
130
- genv.log("Converting #{filename} to #{convert_tool.output_format}",:info)
131
- runner=CommandRunner.new(genv)
132
- basedir=Pathname.new(filename).dirname.to_s #Where we run the tool
133
-
134
- if ( convert_tool.maldi )
135
- #For MALDI we know the charge is 1 so set it explicitly. Sometimes it is missing from the data
136
- runner.run_local("cd #{basedir}; #{genv.msconvert} #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.1\" --#{convert_tool.output_format} -o #{output_dir}")
137
- else
138
- if ( has_charge_information(filename) )
139
- runner.run_local("cd #{basedir}; #{genv.msconvert} #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.<ChargeState>\" --#{convert_tool.output_format} -o #{output_dir}")
140
- else
141
- # If input file is missing charges the best we can do is just assign charge=1. Search engines can choose to ignore this value anyway.
142
- #
143
- runner.run_local("cd #{basedir}; #{genv.msconvert} #{input_relative_filename} --filter \"titleMaker <RunId>.<ScanNumber>.<ScanNumber>.1\" --#{convert_tool.output_format} -o #{output_dir}")
144
- end
145
- end
146
-
147
- # Find out what the output name was
148
- #
149
- tmp_output_filename=""
150
- Dir.foreach(output_dir) { |entry_name|
151
- if ( entry_name=~/^\.$/ || entry_name=~/^\.\.$/ )
152
- else
153
- tmp_output_filename=entry_name
154
- end
155
- }
156
-
157
- # Cleanup after converting
158
- cmd = "cd #{output_dir};pwd; mv #{tmp_output_filename} #{base_output_dir}/#{output_filename}; cd ../; pwd;rm -r #{output_dir}"
159
-
160
- code =runner.run_local(cmd)
161
-
162
- throw "Command failed with exit code #{code}" unless code==0
163
-
164
- throw "Failed to create output file #{base_output_dir}/#{output_filename}" unless ( FileTest.exists?("#{base_output_dir}/#{output_filename}") )
data/bin/generate_omssa_loc.rb DELETED
@@ -1,42 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # This file is part of MSLIMS
4
- # Created by Ira Cooke 12/4/2010
5
- #
6
- # Generates files required by the omssa galaxy wrapper
7
- #
8
-
9
- require 'protk/constants'
10
- # Environment with global constants
11
- #
12
- genv=Constants.new
13
-
14
- # Set search engine specific parameters on the SearchTool object
15
- #
16
- omssa_root="#{genv.omssa_root}/omssacl"
17
- # Get ommssa to print out a list of its acceptable modifications
18
- acceptable_mods=%x[#{omssa_root} -ml].split(/\n/).collect do |mod|
19
-
20
- mod_vals=mod.split(":")
21
- [mod_vals[0].lstrip.rstrip,mod_vals[1].lstrip.rstrip]
22
-
23
- end
24
-
25
- # Drop the header
26
- #
27
- acceptable_mods.shift
28
-
29
- loc_output=File.new("omssa_mods.loc",'w')
30
-
31
- loc_output << "#This file lists the names of chemical modifications accepted by OMMSA\n"
32
- loc_output << "#\n"
33
- loc_output << "#\n"
34
-
35
- acceptable_mods.each { |am|
36
- key = am[1].downcase.gsub(" ","").gsub("\(","\_").gsub("\)","\_").gsub("\:","\_").gsub("\-\>","\_")
37
- loc_output << "#{am[1]}\t#{key}_\t#{am[0]}\t#{key}_\n"
38
- }
39
-
40
- loc_output.close
41
-
42
-
data/bin/gffmerge.rb DELETED
@@ -1,208 +0,0 @@
1
- #!/usr/bin/env ruby
2
- #
3
- # This file is part of protk
4
- # Original python version created by Max Grant
5
- # Translated to ruby by Ira Cooke 29/1/2013
6
- #
7
- #
8
-
9
- require 'protk/constants'
10
- require 'protk/tool'
11
- require 'protk/fastadb'
12
- require 'libxml'
13
- require 'bio'
14
-
15
- include LibXML
16
-
17
- tool=Tool.new([:explicit_output])
18
- tool.option_parser.banner = "Create a gff containing peptide observations.\n\nUsage: gffmerge.rb "
19
-
20
-
21
- tool.options.gff_predicted=nil
22
- tool.option_parser.on( '-g filename','--gff filename', 'Predicted Data (GFF3 Format)' ) do |file|
23
- tool.options.gff_predicted=file
24
- end
25
-
26
- tool.options.protxml=nil
27
- tool.option_parser.on( '-p filename','--protxml filename', 'Observed Data (ProtXML Format)' ) do |file|
28
- tool.options.protxml=file
29
- end
30
-
31
- tool.options.sixframe=nil
32
- tool.option_parser.on( '-t filename','--sixframe filename', 'Sixframe Translations (Fasta Format)' ) do |file|
33
- tool.options.sixframe=file
34
- end
35
-
36
- tool.options.skip_fasta_indexing=false
37
- tool.option_parser.on('--skip-index','Don\'t index sixframe translations (Index should already exist)') do
38
- tool.options.skip_fasta_indexing=true
39
- end
40
-
41
- tool.options.peptide_probability_threshold=0.95
42
- tool.option_parser.on('--threshold prob','Peptide Probability Threshold (Default 0.95)') do |thresh|
43
- tool.options.peptide_probability_threshold=thresh.to_f
44
- end
45
-
46
- exit unless tool.check_options [:protxml,:sixframe]
47
-
48
- gff_out_file="merged.gff"
49
- if ( tool.explicit_output != nil)
50
- gff_out_file=tool.explicit_output
51
- end
52
-
53
- gff_db = Bio::GFF.new()
54
- if ( tool.gff_predicted !=nil)
55
- p "Reading source gff file"
56
- gff_db = Bio::GFF::GFF3.new(File.open(tool.gff_predicted))
57
- # p gff_db.records[1].attributes
58
- # exit
59
- end
60
-
61
- f = open(gff_out_file,'w+')
62
- gff_db.records.each { |rec|
63
- f.write(rec.to_s)
64
- }
65
-
66
- p "Parsing proteins from protxml"
67
- protxml_parser=XML::Parser.file(tool.protxml)
68
- protxml_doc=protxml_parser.parse
69
- proteins = protxml_doc.find('.//protxml:protein','protxml:http://regis-web.systemsbiology.net/protXML')
70
-
71
-
72
- db_filename = nil
73
- case
74
- when Pathname.new(tool.sixframe).exist? # It's an explicitly named db
75
- db_filename = Pathname.new(tool.sixframe).realpath.to_s
76
- else
77
- db_filename=Constants.new.current_database_for_name(tool.sixframe)
78
- end
79
-
80
- db_indexfilename = "#{db_filename}.pin"
81
-
82
- if File.exist?(db_indexfilename)
83
- p "Using existing indexed translations"
84
- orf_lookup = FastaDB.new(db_filename)
85
- else
86
- p "Indexing sixframe translations"
87
- orf_lookup = FastaDB.create(db_filename,db_filename,'prot')
88
- end
89
-
90
- p "Aligning peptides and writing GFF data..."
91
- low_prob = 0
92
- skipped = 0
93
- peptide_count = 0
94
- protein_count = 0
95
- total_peptides = 0
96
- for prot in proteins
97
- prot_prob = prot['probability']
98
- if ( prot_prob.to_f < tool.peptide_probability_threshold )
99
- next
100
- end
101
- indis_proteins = prot.find('protxml:indistinguishable_protein','protxml:http://regis-web.systemsbiology.net/protXML')
102
- prot_names = [prot['protein_name']]
103
- for protein in indis_proteins
104
- prot_names += [protein['protein_name']]
105
- end
106
-
107
- peptides = prot.find('protxml:peptide','protxml:http://regis-web.systemsbiology.net/protXML')
108
-
109
- for protein_name in prot_names
110
- protein_count += 1
111
- prot_qualifiers = {"source" => "OBSERVATION", "score" => prot_prob, "ID" => 'pr' + protein_count.to_s}
112
- begin
113
- puts "Looking up #{protein_name}"
114
- orf = orf_lookup.get_by_id protein_name
115
- if ( orf == nil)
116
- puts "Failed lookup for #{protein_name}"
117
- raise KeyError
118
- end
119
-
120
-
121
- position = orf.identifiers.description.split('|').collect { |pos| pos.to_i }
122
-
123
- if ( position.length != 2 )
124
- puts "Badly formatted entry #{orf}"
125
- raise EncodingError
126
- end
127
- orf_name = orf.entry_id.scan(/lcl\|(.*)/)[0][0]
128
- frame=orf_name.scan(/frame_(\d)/)[0][0]
129
- scaffold_name = orf_name.scan(/(scaffold_?\d+)_/)[0][0]
130
-
131
- strand = (frame.to_i > 3) ? '-' : '+'
132
- # strand = +1
133
-
134
- prot_id = "pr#{protein_count.to_s}"
135
- prot_attributes = [["ID",prot_id],["Name",orf_name]]
136
- prot_gff_line = Bio::GFF::GFF3::Record.new(seqid = scaffold_name,source="OBSERVATION",feature_type="protein",
137
- start_position=position[0]+1,end_position=position[1],score=prot_prob,strand=strand,frame=nil,attributes=prot_attributes)
138
- gff_db.records += ["##gff-version 3\n","##sequence-region #{scaffold_name} 1 160\n",prot_gff_line]
139
-
140
- prot_seq = orf.aaseq.to_s
141
- throw "Not amino_acids" if prot_seq != orf.seq.to_s
142
-
143
- if ( strand=='-' )
144
- prot_seq.reverse!
145
- end
146
-
147
- for peptide in peptides
148
- pprob = peptide['nsp_adjusted_probability'].to_f
149
- if ( pprob >= tool.peptide_probability_threshold )
150
- total_peptides += 1
151
- pep_seq = peptide['peptide_sequence']
152
-
153
- if ( strand=='-')
154
- pep_seq.reverse!
155
- end
156
-
157
- start_indexes = [0]
158
- prot_seq.scan /#{pep_seq}/ do |match|
159
- start_indexes << prot_seq.index(match,start_indexes.last)
160
- end
161
- start_indexes.delete_at(0)
162
-
163
- # Now convert peptide coordinate to genome coordinates
164
- # And create gff lines for each match
165
- start_indexes.collect do |si|
166
- pep_genomic_start = position[0] + 3*si
167
- pep_genomic_end = pep_genomic_start + 3*pep_seq.length - 1
168
- peptide_count+=1
169
- pep_id = "p#{peptide_count.to_s}"
170
- pep_attributes = [["ID",pep_id],["Parent",prot_id]]
171
- pep_gff_line = Bio::GFF::GFF3::Record.new(seqid = scaffold_name,source="OBSERVATION",
172
- feature_type="peptide",start_position=pep_genomic_start,end_position=pep_genomic_end,score=pprob,
173
- strand=strand,frame=nil,attributes=pep_attributes)
174
- fragment_gff_line = Bio::GFF::GFF3::Record.new(seqid = scaffold_name,source="OBSERVATION",
175
- feature_type="fragment",start_position=pep_genomic_start,end_position=pep_genomic_end,score='',
176
- strand=strand,frame=nil,attributes=[["Parent",pep_id],["ID",peptide['peptide_sequence']]])
177
- gff_db.records += [pep_gff_line,fragment_gff_line]
178
-
179
- end
180
-
181
-
182
- end
183
- end
184
-
185
- rescue KeyError,EncodingError
186
- skipped+=0
187
- end
188
-
189
- # p orf_name
190
- # p prot_gff_line
191
- # exit
192
- end
193
-
194
- end
195
-
196
- f = open(gff_out_file,'w+')
197
- gff_db.records.each { |rec|
198
- f.write(rec.to_s)
199
- }
200
- f.close
201
-
202
- p "Finished."
203
- p "Proteins: #{protein_count}"
204
- p "Skipped Decoys: #{skipped}"
205
- p "Total Peptides: #{total_peptides}"
206
- p "Peptides Written: #{total_peptides - low_prob}"
207
- p "Peptides Culled: #{low_prob}"
208
- exit(0)