protk 1.4.1 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +32 -15
- data/bin/mzid_to_pepxml.rb +75 -0
- data/bin/mzid_to_protxml.rb +77 -0
- data/bin/protxml_to_gff.rb +1 -1
- data/bin/sixframe.rb +24 -5
- data/bin/spectrast_create.rb +125 -0
- data/bin/spectrast_filter.rb +108 -0
- data/lib/protk/command_runner.rb +1 -1
- data/lib/protk/data/template_pep.xml +34 -0
- data/lib/protk/data/template_prot.xml +39 -0
- data/lib/protk/mzidentml_doc.rb +140 -0
- data/lib/protk/mzml_parser.rb +9 -0
- data/lib/protk/peptide.rb +39 -5
- data/lib/protk/pepxml_writer.rb +24 -0
- data/lib/protk/physical_constants.rb +1 -0
- data/lib/protk/protein.rb +64 -1
- data/lib/protk/protein_group.rb +70 -0
- data/lib/protk/protxml_writer.rb +27 -0
- data/lib/protk/psm.rb +222 -0
- data/lib/protk/search_tool.rb +1 -6
- data/lib/protk/sniffer.rb +35 -0
- data/lib/protk/spectrum_query.rb +132 -0
- metadata +20 -2
@@ -0,0 +1,108 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# This file is part of protk
# Created by Ira Cooke 30/4/2015
#
# A wrapper for SpectraST commands that manipulate splib files
#
# Builds a single SpectraST command line from the options below and the
# .splib files given as positional arguments, then runs it through the shell.
#

require 'shellwords'

require 'protk/constants'
require 'protk/command_runner'
require 'protk/tool'
require 'protk/galaxy_util'

for_galaxy = GalaxyUtil.for_galaxy?

# Only referenced by the disabled spectrast_tool.run call at the bottom of this script
genv=Constants.instance

# Setup specific command-line options for this tool. Other options are inherited from Tool
#
spectrast_tool=Tool.new([:explicit_output])
spectrast_tool.option_parser.banner = "Manipulate splib files.\n\nUsage: spectrast_filter.rb [options] file1.splib file2.splib ..."
spectrast_tool.add_boolean_option(:binary_output,false,['-B','--binary-output','Produce spectral libraries in binary format rather than ASCII'])
spectrast_tool.add_value_option(:filter_predicate,nil,['--predicate pred','Keep only spectra satisfying predicate pred. Should be a C-style predicate'])
spectrast_tool.add_value_option(:merge_operation,"U",['--merge method',
  'How to combine multiple splib files (if provided). Options are U,S,H,A
U: Union. Include all the peptide ions in all the files.
S: Subtraction. Only include peptide ions in the first file
that are not present in any of the other files.
H: Subtraction of homologs. Only include peptide ions in the
first file that do not have any homologs with
same charge and similar m/z in any of the other files.
A: Appending. Each peptide ion is added from only one library:
the first file in the argument list that contains that peptide ion.
Useful for keeping existing consensus spectra unchanged while adding
only previously unseen peptide ions.'])
spectrast_tool.add_value_option(:spectrum_operation,"None",['--replicates method',
  'How to derive a single spectrum from replicates. Options are None, C,B
C: Consensus. Create the consensus spectrum of all replicate spectra of each peptide ion.
B: Best replicate. Pick the best replicate of each peptide ion.'])

exit unless spectrast_tool.check_options(true)

spectrast_bin = %x[which spectrast].chomp

# Without this guard an empty path would make the shell try to execute the
# first flag (e.g. "-c_BIN!") as if it were the program name.
abort "Unable to find the spectrast executable on the PATH" if spectrast_bin.empty?

# LIBRARY MANIPULATION OPTIONS (Applicable with .splib files)
# -cf<pred> Filter library. Keep only those entries satisfying the predicate <pred>.
#           <pred> should be a C-style predicate in quotes.
# -cJU Union. Include all the peptide ions in all the files.
# -cJI Intersection. Only include peptide ions that are present in all the files.
# -cJS Subtraction. Only include peptide ions in the first file that are not present in any of the other files.
# -cJH Subtraction of homologs. Only include peptide ions in the first file
#      that do not have any homologs with same charge and similar m/z in any of the other files.
# -cJA Appending. Each peptide ion is added from only one library: the first file in the argument list that contains that peptide ion.
#      Useful for keeping existing consensus spectra unchanged while adding only previously unseen peptide ions.
# -cAB Best replicate. Pick the best replicate of each peptide ion.
# -cAC Consensus. Create the consensus spectrum of all replicate spectra of each peptide ion.
# -cAQ Quality filter. Apply quality filters to library.
#      IMPORTANT: Quality filter can only be applied on a SINGLE .splib file with no peptide ion represented by more than one spectrum.
# -cAD Create artificial decoy spectra.
# -cAN Sort library entries by descending number of replicates used (tie-breaking by probability).
# -cAM Create semi-empirical spectra based on allowable modifications specified by -cx option.
# -cQ<num> Produce reduced spectra of at most <num> peaks. Inactive with -cAQ and -cAD.
# -cD<file> Refresh protein mappings of each library entry against the protein database <file> (Must be in .fasta format).
# -cu Delete entries whose peptide sequences do not map to any protein during refreshing with -cD option.
#     When off, unmapped entries will be marked with Protein=0/UNMAPPED but retained in library. (Turn off with -cu!).
# -cd Delete entries whose peptide sequences map to multiple proteins during refreshing with -cD option. (Turn off with -cd!).

input_stagers=[]
inputs=ARGV.collect { |file_name| file_name.chomp}
if for_galaxy
  # Under Galaxy each input is staged with an explicit .splib extension
  input_stagers = inputs.collect {|ip| GalaxyStager.new(ip,{:extension=>".splib"}) }
  inputs=input_stagers.collect { |sg| sg.staged_path }
end

# Collect the arguments as a list and shell-escape them once at the end, so
# that paths and predicates containing spaces or shell metacharacters reach
# SpectraST intact instead of being re-split or interpreted by the shell.
spectrast_args=[]

# ASCII output is the default for this wrapper, so binary output is switched off unless requested
spectrast_args << "-c_BIN!" unless spectrast_tool.binary_output

if spectrast_tool.filter_predicate
  spectrast_args << "-cf#{spectrast_tool.filter_predicate}"
end

# A merge (join) operation only makes sense with more than one library
if inputs.length > 1
  spectrast_args << "-cJ#{spectrast_tool.merge_operation}"
end

if spectrast_tool.spectrum_operation!="None"
  spectrast_args << "-cA#{spectrast_tool.spectrum_operation}"
end

if spectrast_tool.explicit_output.nil?
  output_file_name=Tool.default_output_path(inputs,"","","")
else
  output_file_name=spectrast_tool.explicit_output
end

spectrast_args << "-cN#{output_file_name}"

spectrast_args.concat(inputs)

cmd=Shellwords.join([spectrast_bin]+spectrast_args)

# code = spectrast_tool.run(cmd,genv)
# throw "Command failed with exit code #{code}" unless code==0

# NOTE(review): the exit status and captured stdout of SpectraST are discarded here,
# as in the original; confirm whether failures should be reported to the caller.
%x[#{cmd}]
|
data/lib/protk/command_runner.rb
CHANGED
@@ -30,7 +30,7 @@ class CommandRunner
|
|
30
30
|
def run_local(command_string)
|
31
31
|
@env.log("Command: #{command_string} started",:info)
|
32
32
|
status = Open4::popen4("#{command_string} ") do |pid, stdin, stdout, stderr|
|
33
|
-
|
33
|
+
@env.log "PID #{pid}" , :info
|
34
34
|
|
35
35
|
stdout.each { |line| @env.log(line.chomp,:info) }
|
36
36
|
|
@@ -0,0 +1,34 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<?xml-stylesheet type="text/xsl" href="/Users/icooke/Sources/protk/spec/data/mr176-BSA100fmole_BA3_01_8167.d_tandem_pproph.pep.xsl"?>
|
3
|
+
<msms_pipeline_analysis date="2014-06-22T15:28:36" summary_xml="/Users/icooke/Sources/protk/spec/data/mr176-BSA100fmole_BA3_01_8167.d_tandem_pproph.pep.xml" xmlns="http://regis-web.systemsbiology.net/pepXML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://regis-web.systemsbiology.net/pepXML /Users/icooke/bin/tpp/schema/pepXML_v117.xsd">
|
4
|
+
<analysis_summary analysis="peptideprophet" time="2014-06-22T15:28:36">
|
5
|
+
</analysis_summary>
|
6
|
+
<analysis_summary analysis="database_refresh" time="2014-06-22T15:28:36"/>
|
7
|
+
<analysis_summary analysis="interact" time="2014-06-22T15:28:36">
|
8
|
+
<interact_summary filename="/Users/icooke/Sources/protk/spec/data/mr176-BSA100fmole_BA3_01_8167.d_tandem_pproph.pep.xml" directory="">
|
9
|
+
<inputfile name="mr176-BSA100fmole_BA3_01_8167.d_tandem.pep.xml" directory="/Users/icooke/Sources/protk/spec/data"/>
|
10
|
+
</interact_summary>
|
11
|
+
</analysis_summary>
|
12
|
+
<dataset_derivation generation_no="0"/>
|
13
|
+
<msms_run_summary base_name="/Users/icooke/Sources/protk/spec/data/mr176-BSA100fmole_BA3_01_8167.d_tandem.tandem" search_engine="X! Tandem" raw_data_type="raw" raw_data=".?">
|
14
|
+
<sample_enzyme name="trypsin">
|
15
|
+
<specificity cut="KR" no_cut="P" sense="C"/>
|
16
|
+
</sample_enzyme>
|
17
|
+
<search_summary base_name="mr176-BSA100fmole_BA3_01_8167.d_tandem.tandem" search_engine="X! Tandem" precursor_mass_type="monoisotopic" fragment_mass_type="monoisotopic" search_id="1">
|
18
|
+
<search_database local_path="/Users/icooke/Sources/protk/spec/data/AASequences.fasta" type="AA"/>
|
19
|
+
<enzymatic_search_constraint enzyme="trypsin" max_num_internal_cleavages="2" min_number_termini="1"/>
|
20
|
+
<aminoacid_modification aminoacid="E" massdiff="-18.0106" mass="111.0320" variable="Y" symbol="^"/>
|
21
|
+
<!--X! Tandem n-terminal AA variable modification-->
|
22
|
+
<aminoacid_modification aminoacid="M" massdiff="15.9949" mass="147.0354" variable="Y"/>
|
23
|
+
<aminoacid_modification aminoacid="Q" massdiff="-17.0265" mass="111.0321" variable="Y" symbol="^"/>
|
24
|
+
<!--X! Tandem n-terminal AA variable modification-->
|
25
|
+
<terminal_modification terminus="n" massdiff="42.0106" mass="43.0184" protein_terminus="N" variable="Y" symbol="^"/>
|
26
|
+
|
27
|
+
</search_summary>
|
28
|
+
<analysis_timestamp analysis="peptideprophet" time="2014-06-22T15:28:36" id="1"/>
|
29
|
+
<analysis_timestamp analysis="database_refresh" time="2014-06-22T15:28:36" id="1">
|
30
|
+
<database_refresh_timestamp database="/Users/icooke/Sources/protk/spec/data/AASequences.fasta" min_num_enz_term="1"/>
|
31
|
+
</analysis_timestamp>
|
32
|
+
|
33
|
+
</msms_run_summary>
|
34
|
+
</msms_pipeline_analysis>
|
@@ -0,0 +1,39 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<protein_summary xmlns="http://regis-web.systemsbiology.net/protXML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://sashimi.sourceforge.net/schema_revision/protXML/protXML_v6.xsd" summary_xml="">
|
3
|
+
<protein_summary_header reference_database="FULLPATH_TO_REFERENCE_DB" residue_substitution_list="I -> L" source_files="FULLPATH_TO_SOURCE_PEPXML" source_files_alt="FULLPATH_TO_SOURCE_PEPXML" min_peptide_probability="" min_peptide_weight="" num_predicted_correct_prots="" num_input_1_spectra="" num_input_2_spectra="" num_input_3_spectra="" num_input_4_spectra="" num_input_5_spectra="" initial_min_peptide_prob="" total_no_spectrum_ids="" sample_enzyme="trypsin">
|
4
|
+
<program_details analysis="proteinprophet" time="2014-01-20T14:17:37" version=" Insilicos_LabKey_C++ (TPP v0.0 Development trunk rev 0, Build 201307090846 (linux))">
|
5
|
+
<proteinprophet_details occam_flag="Y" groups_flag="Y" degen_flag="Y" nsp_flag="Y" initial_peptide_wt_iters="2" nsp_distribution_iters="2" final_peptide_wt_iters="3">
|
6
|
+
<nsp_information neighboring_bin_smoothing="Y">
|
7
|
+
<nsp_distribution bin_no="0" nsp_lower_bound_incl="0.00" nsp_upper_bound_incl="0.00" pos_freq="0.057" neg_freq="0.625" pos_to_neg_ratio="0.09"/>
|
8
|
+
<nsp_distribution bin_no="1" nsp_lower_bound_excl="0.00" nsp_upper_bound_incl="0.31" pos_freq="0.037" neg_freq="0.152" pos_to_neg_ratio="0.24"/>
|
9
|
+
<nsp_distribution bin_no="2" nsp_lower_bound_excl="0.31" nsp_upper_bound_incl="1.00" pos_freq="0.077" neg_freq="0.032" pos_to_neg_ratio="2.42"/>
|
10
|
+
<nsp_distribution bin_no="3" nsp_lower_bound_excl="1.00" nsp_upper_bound_incl="2.50" pos_freq="0.113" neg_freq="0.033" pos_to_neg_ratio="3.39"/>
|
11
|
+
<nsp_distribution bin_no="4" nsp_lower_bound_excl="2.50" nsp_upper_bound_incl="4.63" pos_freq="0.123" neg_freq="0.032" pos_to_neg_ratio="3.91"/>
|
12
|
+
<nsp_distribution bin_no="5" nsp_lower_bound_excl="4.63" nsp_upper_bound_incl="7.90" pos_freq="0.143" neg_freq="0.032" pos_to_neg_ratio="4.50"/>
|
13
|
+
<nsp_distribution bin_no="6" nsp_lower_bound_excl="7.90" nsp_upper_bound_incl="14.92" pos_freq="0.196" neg_freq="0.041" pos_to_neg_ratio="4.78"/>
|
14
|
+
<nsp_distribution bin_no="7" nsp_lower_bound_excl="14.92" nsp_upper_bound_excl="inf" pos_freq="0.254" neg_freq="0.054" pos_to_neg_ratio="4.72" alt_pos_to_neg_ratio="4.78"/>
|
15
|
+
</nsp_information>
|
16
|
+
<ni_information>
|
17
|
+
</ni_information>
|
18
|
+
<protein_summary_data_filter min_probability="0.00" sensitivity="1.000" false_positive_error_rate="0.835" predicted_num_correct="1787" predicted_num_incorrect="9044"/>
|
19
|
+
<protein_summary_data_filter min_probability="0.10" sensitivity="1.000" false_positive_error_rate="0.235" predicted_num_correct="1787" predicted_num_incorrect="548"/>
|
20
|
+
<protein_summary_data_filter min_probability="0.20" sensitivity="1.000" false_positive_error_rate="0.235" predicted_num_correct="1787" predicted_num_incorrect="548"/>
|
21
|
+
<protein_summary_data_filter min_probability="0.30" sensitivity="0.956" false_positive_error_rate="0.151" predicted_num_correct="1709" predicted_num_incorrect="305"/>
|
22
|
+
<protein_summary_data_filter min_probability="0.40" sensitivity="0.916" false_positive_error_rate="0.095" predicted_num_correct="1638" predicted_num_incorrect="171"/>
|
23
|
+
<protein_summary_data_filter min_probability="0.50" sensitivity="0.887" false_positive_error_rate="0.063" predicted_num_correct="1585" predicted_num_incorrect="106"/>
|
24
|
+
<protein_summary_data_filter min_probability="0.60" sensitivity="0.853" false_positive_error_rate="0.036" predicted_num_correct="1525" predicted_num_incorrect="58"/>
|
25
|
+
<protein_summary_data_filter min_probability="0.70" sensitivity="0.826" false_positive_error_rate="0.020" predicted_num_correct="1477" predicted_num_incorrect="31"/>
|
26
|
+
<protein_summary_data_filter min_probability="0.80" sensitivity="0.805" false_positive_error_rate="0.012" predicted_num_correct="1438" predicted_num_incorrect="18"/>
|
27
|
+
<protein_summary_data_filter min_probability="0.90" sensitivity="0.773" false_positive_error_rate="0.006" predicted_num_correct="1381" predicted_num_incorrect="8"/>
|
28
|
+
<protein_summary_data_filter min_probability="0.95" sensitivity="0.749" false_positive_error_rate="0.004" predicted_num_correct="1339" predicted_num_incorrect="5"/>
|
29
|
+
<protein_summary_data_filter min_probability="0.96" sensitivity="0.738" false_positive_error_rate="0.003" predicted_num_correct="1318" predicted_num_incorrect="4"/>
|
30
|
+
<protein_summary_data_filter min_probability="0.97" sensitivity="0.728" false_positive_error_rate="0.002" predicted_num_correct="1302" predicted_num_incorrect="3"/>
|
31
|
+
<protein_summary_data_filter min_probability="0.98" sensitivity="0.711" false_positive_error_rate="0.002" predicted_num_correct="1272" predicted_num_incorrect="2"/>
|
32
|
+
<protein_summary_data_filter min_probability="0.99" sensitivity="0.609" false_positive_error_rate="0.000" predicted_num_correct="1088" predicted_num_incorrect="0"/>
|
33
|
+
<protein_summary_data_filter min_probability="1.00" sensitivity="0.164" false_positive_error_rate="0.000" predicted_num_correct="294" predicted_num_incorrect="0"/>
|
34
|
+
</proteinprophet_details>
|
35
|
+
</program_details>
|
36
|
+
</protein_summary_header>
|
37
|
+
<dataset_derivation generation_no="0">
|
38
|
+
</dataset_derivation>
|
39
|
+
</protein_summary>
|
@@ -0,0 +1,140 @@
|
|
1
|
+
require 'libxml'
|
2
|
+
|
3
|
+
include LibXML
|
4
|
+
|
5
|
+
# Convenience wrapper around an mzIdentML document parsed with LibXML.
#
# Instance methods run document-wide XPath queries for the main mzIdentML
# element types. Class-level methods run queries relative to (or rooted at the
# document of) an arbitrary node, so they can be used on nodes returned by
# earlier queries.
class MzIdentMLDoc < Object

  MZID_NS_PREFIX="mzidentml"
  MZID_NS='http://psidev.info/psi/pi/mzIdentML/1.1'

  # Namespace declaration in the "prefix:uri" form that is passed as the
  # second argument to every find call in this class
  MZID_NS_SPEC="#{MZID_NS_PREFIX}:#{MZID_NS}"

  # path - location of the mzIdentML file to parse
  def initialize(path)
    parser=XML::Parser.file(path)
    @document=parser.parse
  end

  # All SpectrumIdentificationResult nodes in the document
  def spectrum_queries
    MzIdentMLDoc.find(@document,"SpectrumIdentificationResult",true)
  end

  # All PeptideEvidence nodes in the document
  def peptide_evidence
    MzIdentMLDoc.find(@document,"PeptideEvidence",true)
  end

  # All SpectrumIdentificationItem nodes in the document
  def psms
    MzIdentMLDoc.find(@document,"SpectrumIdentificationItem",true)
  end

  # All ProteinAmbiguityGroup nodes in the document
  def protein_groups
    MzIdentMLDoc.find(@document,"ProteinAmbiguityGroup",true)
  end

  # All ProteinDetectionHypothesis nodes in the document
  def proteins
    MzIdentMLDoc.find(@document,"ProteinDetectionHypothesis",true)
  end

  # Peptides are referenced in many ways in mzidentml.
  # We define a "Peptide" as a peptide supporting a particular protein
  # Such peptides may encompass several PSM's
  #
  def peptides
    MzIdentMLDoc.find(@document,"PeptideHypothesis",true)
  end

  # -----------------------------------------------------------
  #
  # Class Level Utility methods for searching from a given node
  #
  # -----------------------------------------------------------

  # Run a namespaced XPath query from node.
  #
  # node       - any object responding to find(xpath, namespace_spec)
  # expression - XPath step(s); the namespace prefix is prepended to the first step only
  # root       - when true the query is prefixed with "//", otherwise with "./"
  #
  # Returns whatever node.find returns.
  def self.find(node,expression,root=false)
    pp = root ? "//" : "./"
    node.find("#{pp}#{MZID_NS_PREFIX}:#{expression}",MZID_NS_SPEC)
  end

  # First cvParam found under mzidnode with the given accession, or nil if there is none
  def self.get_cvParam(mzidnode,accession)
    find(mzidnode,"cvParam[@accession=#{xpath_literal(accession)}]")[0]
  end

  # First DBSequence with the given accession, searched with a "//" query from mzidnode; nil if there is none
  def self.get_dbsequence(mzidnode,accession)
    find(mzidnode,"DBSequence[@accession=#{xpath_literal(accession)}]",true)[0]
  end

  # As per PeptideShaker. Assume group probability used for protein if it is group rep otherwise 0
  #
  # A protein counts as group representative when it carries cvParam MS:1002403.
  # The value is then read from cvParam MS:1002470 on the parent node and scaled by 0.01
  # (presumably a percentage - TODO confirm). Returns 0 when the protein is not
  # the representative or when the parent has no such cvParam.
  def self.get_protein_probability(protein_node)
    #MS:1002403
    is_group_representative=!get_cvParam(protein_node,"MS:1002403").nil?
    return 0 unless is_group_representative

    group_param = get_cvParam(protein_node.parent,"MS:1002470")
    # A missing group value is treated like a non-representative protein rather than crashing on nil
    return 0 if group_param.nil?

    group_param.attributes['value'].to_f*0.01
  end

  # ProteinDetectionHypothesis nodes found under group_node
  def self.get_proteins_for_group(group_node)
    find(group_node,"ProteinDetectionHypothesis")
  end

  # def self.get_sister_proteins(protein_node)
  #   self.find(protein_node.parent,"ProteinDetectionHypothesis")
  # end

  # PeptideHypothesis nodes found under protein_node
  def self.get_peptides_for_protein(protein_node)
    find(protein_node,"PeptideHypothesis")
  end

  # <PeptideHypothesis peptideEvidence_ref="PepEv_1">
  #   <SpectrumIdentificationItemRef spectrumIdentificationItem_ref="SII_1_1"/>
  # </PeptideHypothesis>
  #
  # Returns the referenced SpectrumIdentificationItem whose MS:1002466 cvParam has
  # the highest value (the first one wins on ties), or nil when none qualifies.
  # References that do not resolve to a node, and PSMs without an MS:1002466
  # cvParam, are skipped. Only scores greater than -1 are considered.
  def self.get_best_psm_for_peptide(peptide_node)
    best_score=-1
    best_psm=nil
    find(peptide_node,"SpectrumIdentificationItemRef").each do |id_ref_node|
      id_ref = id_ref_node.attributes['spectrumIdentificationItem_ref']
      psm_node = find(peptide_node,"SpectrumIdentificationItem[@id=#{xpath_literal(id_ref)}]",true)[0]
      next if psm_node.nil?

      score_param = get_cvParam(psm_node,"MS:1002466")
      next if score_param.nil?

      score = score_param['value'].to_f
      if score>best_score
        best_psm=psm_node
        best_score=score
      end
    end
    best_psm
  end

  # Follows peptideEvidence_ref -> PeptideEvidence -> peptide_ref -> Peptide
  # and returns the content of that Peptide's PeptideSequence child.
  def self.get_sequence_for_peptide(peptide_node)
    evidence_ref = peptide_node.attributes['peptideEvidence_ref']
    evidence = find(peptide_node,"PeptideEvidence[@id=#{xpath_literal(evidence_ref)}]",true)[0]
    sequence_for_peptide_ref(peptide_node,evidence.attributes['peptide_ref'])
  end

  # Follows the PSM's peptide_ref to a Peptide and returns the content of its PeptideSequence child
  def self.get_sequence_for_psm(psm_node)
    sequence_for_peptide_ref(psm_node,psm_node.attributes['peptide_ref'])
  end

  # Resolves every PeptideEvidenceRef under psm_node to its PeptideEvidence node.
  # Returns an Array with one entry per reference; an entry is nil when the
  # reference does not resolve.
  def self.get_peptide_evidence_from_psm(psm_node)
    find(psm_node,"PeptideEvidenceRef").map do |pe_node|
      ev_id=pe_node.attributes['peptideEvidence_ref']
      find(pe_node,"PeptideEvidence[@id=#{xpath_literal(ev_id)}]",true)[0]
    end
  end

  # Content of the PeptideSequence child of the Peptide whose id is pep_ref,
  # located with a "//" query issued from context_node.
  def self.sequence_for_peptide_ref(context_node,pep_ref)
    peptide = find(context_node,"Peptide[@id=#{xpath_literal(pep_ref)}]",true)[0]
    find(peptide,"PeptideSequence")[0].content
  end
  private_class_method :sequence_for_peptide_ref

  # Quote value as an XPath 1.0 string literal. XPath 1.0 has no escape
  # character, so the quote style is chosen to avoid the quotes present in the
  # value, falling back to concat() when both kinds occur.
  def self.xpath_literal(value)
    text = value.to_s
    return "'#{text}'" unless text.include?("'")
    return "\"#{text}\"" unless text.include?('"')

    pieces = text.split("'",-1).map { |piece| "'#{piece}'" }
    "concat(#{pieces.join(%q{,"'",})})"
  end
  private_class_method :xpath_literal

end
|
data/lib/protk/mzml_parser.rb
CHANGED
@@ -14,6 +14,15 @@ class MzMLParser < Object
|
|
14
14
|
@file_reader=XML::Reader.document(doc)
|
15
15
|
end
|
16
16
|
|
17
|
+
# Advance the reader until it is positioned on a node named "run" and return
# that node's 'id' attribute. Returns nil if the reader runs out of nodes first.
#
# NOTE(review): only the node name is compared, and the reader is not advanced
# after a match, so a repeated call while still positioned on a "run" node
# returns the same id again - confirm that callers expect this.
def next_runid()
  while @file_reader.name != "run"
    return nil unless @file_reader.read()
  end
  @file_reader.get_attribute('id')
end
|
25
|
+
|
17
26
|
def next_spectrum()
|
18
27
|
|
19
28
|
until @file_reader.name=="spectrum"
|
data/lib/protk/peptide.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'libxml'
|
2
2
|
require 'bio'
|
3
3
|
require 'protk/bio_gff3_extensions'
|
4
|
+
require 'protk/mzidentml_doc'
|
4
5
|
require 'protk/error'
|
5
6
|
|
6
7
|
include LibXML
|
@@ -10,22 +11,55 @@ end
|
|
10
11
|
|
11
12
|
class Peptide
|
12
13
|
|
14
|
+
# Stripped sequence (no modifications)
|
13
15
|
attr_accessor :sequence
|
14
16
|
attr_accessor :protein_name
|
15
17
|
attr_accessor :charge
|
16
|
-
attr_accessor :
|
17
|
-
|
18
|
-
|
18
|
+
attr_accessor :probability
|
19
|
+
attr_accessor :theoretical_neutral_mass
|
20
|
+
|
21
|
+
# Build a 'peptide' XML node for this peptide. Each attribute is written as the
# to_s form of the corresponding accessor, so unset (nil) values become "".
def as_protxml
  attribute_values = [
    ['peptide_sequence', self.sequence],
    ['charge', self.charge],
    ['nsp_adjusted_probability', self.probability],
    ['calc_neutral_pep_mass', self.theoretical_neutral_mass]
  ]
  peptide_node = XML::Node.new('peptide')
  attribute_values.each { |attr_name, value| peptide_node[attr_name] = value.to_s }
  peptide_node
end
|
19
29
|
|
20
30
|
class << self
|
21
31
|
# Create a peptide from a protXML peptide node, reading the
# 'peptide_sequence', 'nsp_adjusted_probability' (as Float) and
# 'charge' (as Integer) attributes.
def from_protxml(xmlnode)
  new.tap do |peptide|
    peptide.sequence = xmlnode['peptide_sequence']
    peptide.probability = xmlnode['nsp_adjusted_probability'].to_f
    peptide.charge = xmlnode['charge'].to_i
  end
end
|
28
38
|
|
39
|
+
# <ProteinDetectionHypothesis id="PAG_0_1" dBSequence_ref="JEMP01000193.1_rev_g3500.t1 280755" passThreshold="false">
|
40
|
+
# <PeptideHypothesis peptideEvidence_ref="PepEv_1">
|
41
|
+
# <SpectrumIdentificationItemRef spectrumIdentificationItem_ref="SII_1_1"/>
|
42
|
+
# </PeptideHypothesis>
|
43
|
+
# <cvParam cvRef="PSI-MS" accession="MS:1002403" name="group representative"/>
|
44
|
+
# <cvParam cvRef="PSI-MS" accession="MS:1002401" name="leading protein"/>
|
45
|
+
# <cvParam cvRef="PSI-MS" accession="MS:1001093" name="sequence coverage" value="0.0"/>
|
46
|
+
# </ProteinDetectionHypothesis>
|
47
|
+
|
48
|
+
# Create a peptide from an mzIdentML PeptideHypothesis node.
#
# The sequence comes from MzIdentMLDoc.get_sequence_for_peptide. Probability
# (cvParam MS:1002466), theoretical neutral mass (cvParam MS:1001117) and
# charge (chargeState attribute) are read from the PSM returned by
# MzIdentMLDoc.get_best_psm_for_peptide. The protein name is the accession of
# the DBSequence looked up with the parent node's dBSequence_ref.
def from_mzid(xmlnode)
  new.tap do |peptide|
    peptide.sequence = MzIdentMLDoc.get_sequence_for_peptide(xmlnode)

    top_psm = MzIdentMLDoc.get_best_psm_for_peptide(xmlnode)
    peptide.probability = MzIdentMLDoc.get_cvParam(top_psm,"MS:1002466")['value'].to_f
    peptide.theoretical_neutral_mass = MzIdentMLDoc.get_cvParam(top_psm,"MS:1001117")['value'].to_f
    peptide.charge = top_psm.attributes['chargeState'].to_i

    protein_node = xmlnode.parent
    db_sequence = MzIdentMLDoc.get_dbsequence(protein_node,xmlnode.parent.attributes['dBSequence_ref'])
    peptide.protein_name = db_sequence.attributes['accession']
  end
end
|
62
|
+
|
29
63
|
def from_sequence(seq,charge=nil)
|
30
64
|
pep=new()
|
31
65
|
pep.sequence=seq
|
@@ -146,7 +180,7 @@ class Peptide
|
|
146
180
|
cds_id = parent_record.id
|
147
181
|
this_id = "#{cds_id}.#{self.sequence}"
|
148
182
|
this_id << ".#{self.charge}" unless self.charge.nil?
|
149
|
-
score = self.
|
183
|
+
score = self.probability.nil? ? "." : self.probability.to_s
|
150
184
|
gff_string = "#{parent_record.seqid}\tMSMS\tpolypeptide\t#{start_i}\t#{end_i}\t#{score}\t#{parent_record.strand}\t0\tID=#{this_id};Parent=#{cds_id}"
|
151
185
|
Bio::GFF::GFF3::Record.new(gff_string)
|
152
186
|
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
include LibXML
|
2
|
+
|
3
|
+
# Builds a pepXML document by loading the bundled template
# (data/template_pep.xml next to this file) and appending nodes to it.
class PepXMLWriter < Object

  PEPXML_NS_PREFIX="pepxml"
  PEPXML_NS="http://regis-web.systemsbiology.net/pepXML"

  # The parsed template document that nodes are appended to
  attr_reader :template_doc

  # Parse the template shipped alongside this source file
  def initialize
    template_file="#{File.dirname(__FILE__)}/data/template_pep.xml"
    @template_doc=XML::Parser.file(template_file).parse
  end

  # Append query_node as a child of the template document's root element.
  #
  # NOTE(review): in the bundled template the root element is
  # msms_pipeline_analysis, which itself contains msms_run_summary - confirm
  # that spectrum queries are meant to be added at root level.
  def append_spectrum_query(query_node)
    document_root=@template_doc.root
    document_root << query_node
  end

  # Write the document to file_path, indented and UTF-8 encoded
  def save(file_path)
    save_options={:indent=>true,:encoding => XML::Encoding::UTF_8}
    @template_doc.save(file_path,save_options)
  end

end
|