protk 1.4.1 → 1.4.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +32 -15
- data/bin/mzid_to_pepxml.rb +75 -0
- data/bin/mzid_to_protxml.rb +77 -0
- data/bin/protxml_to_gff.rb +1 -1
- data/bin/sixframe.rb +24 -5
- data/bin/spectrast_create.rb +125 -0
- data/bin/spectrast_filter.rb +108 -0
- data/lib/protk/command_runner.rb +1 -1
- data/lib/protk/data/template_pep.xml +34 -0
- data/lib/protk/data/template_prot.xml +39 -0
- data/lib/protk/mzidentml_doc.rb +140 -0
- data/lib/protk/mzml_parser.rb +9 -0
- data/lib/protk/peptide.rb +39 -5
- data/lib/protk/pepxml_writer.rb +24 -0
- data/lib/protk/physical_constants.rb +1 -0
- data/lib/protk/protein.rb +64 -1
- data/lib/protk/protein_group.rb +70 -0
- data/lib/protk/protxml_writer.rb +27 -0
- data/lib/protk/psm.rb +222 -0
- data/lib/protk/search_tool.rb +1 -6
- data/lib/protk/sniffer.rb +35 -0
- data/lib/protk/spectrum_query.rb +132 -0
- metadata +20 -2
@@ -0,0 +1,108 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# This file is part of protk
# Created by Ira Cooke 30/4/2015
#
# A wrapper for SpectraST commands that manipulate splib files
#
# Builds a single spectrast command line from the options below plus the
# splib files given as positional arguments, then runs it through the shell.
#

require 'shellwords'
require 'protk/constants'
require 'protk/command_runner'
require 'protk/tool'
require 'protk/galaxy_util'

for_galaxy = GalaxyUtil.for_galaxy?

genv=Constants.instance

# Setup specific command-line options for this tool. Other options come from Tool
#
spectrast_tool=Tool.new([:explicit_output])
spectrast_tool.option_parser.banner = "Manipulate splib files.\n\nUsage: spectrast_filter.rb [options] file1.splib file2.splib ..."
spectrast_tool.add_boolean_option(:binary_output,false,['-B','--binary-output','Produce spectral libraries in binary format rather than ASCII'])
spectrast_tool.add_value_option(:filter_predicate,nil,['--predicate pred','Keep only spectra satisfying predicate pred. Should be a C-style predicate'])
spectrast_tool.add_value_option(:merge_operation,"U",['--merge method',
'How to combine multiple splib files (if provided). Options are U,S,H,A
U: Union. Include all the peptide ions in all the files.
S: Subtraction. Only include peptide ions in the first file
that are not present in any of the other files.
H: Subtraction of homologs. Only include peptide ions in the
first file that do not have any homologs with
same charge and similar m/z in any of the other files.
A: Appending. Each peptide ion is added from only one library:
the first file in the argument list that contains that peptide ion.
Useful for keeping existing consensus spectra unchanged while adding
only previously unseen peptide ions.'])
spectrast_tool.add_value_option(:spectrum_operation,"None",['--replicates method',
'How to derive a single spectrum from replicates. Options are None, C,B
C: Consensus. Create the consensus spectrum of all replicate spectra of each peptide ion.
B: Best replicate. Pick the best replicate of each peptide ion.'])

exit unless spectrast_tool.check_options(true)

# Without an executable the command string would start with the first flag,
# so stop here with a clear message instead.
spectrast_bin = %x[which spectrast].chomp
abort "Unable to find the spectrast executable on the PATH" if spectrast_bin.empty?

# LIBRARY MANIPULATION OPTIONS (Applicable with .splib files)
# -cf<pred> Filter library. Keep only those entries satisfying the predicate <pred>.
# <pred> should be a C-style predicate in quotes.
# -cJU Union. Include all the peptide ions in all the files.
# -cJI Intersection. Only include peptide ions that are present in all the files.
# -cJS Subtraction. Only include peptide ions in the first file that are not present in any of the other files.
# -cJH Subtraction of homologs. Only include peptide ions in the first file
# that do not have any homologs with same charge and similar m/z in any of the other files.
# -cJA Appending. Each peptide ion is added from only one library: the first file in the argument list that contains that peptide ion.
# Useful for keeping existing consensus spectra unchanged while adding only previously unseen peptide ions.
# -cAB Best replicate. Pick the best replicate of each peptide ion.
# -cAC Consensus. Create the consensus spectrum of all replicate spectra of each peptide ion.
# -cAQ Quality filter. Apply quality filters to library.
# IMPORTANT: Quality filter can only be applied on a SINGLE .splib file with no peptide ion represented by more than one spectrum.
# -cAD Create artificial decoy spectra.
# -cAN Sort library entries by descending number of replicates used (tie-breaking by probability).
# -cAM Create semi-empirical spectra based on allowable modifications specified by -cx option.
# -cQ<num> Produce reduced spectra of at most <num> peaks. Inactive with -cAQ and -cAD.
# -cD<file> Refresh protein mappings of each library entry against the protein database <file> (Must be in .fasta format).
# -cu Delete entries whose peptide sequences do not map to any protein during refreshing with -cD option.
# When off, unmapped entries will be marked with Protein=0/UNMAPPED but retained in library. (Turn off with -cu!).
# -cd Delete entries whose peptide sequences map to multiple proteins during refreshing with -cD option. (Turn off with -cd!).

input_stagers=[]
inputs=ARGV.collect { |file_name| file_name.chomp}
if for_galaxy
  # Stage each input with a .splib extension and work on the staged paths
  input_stagers = inputs.collect {|ip| GalaxyStager.new(ip,{:extension=>".splib"}) }
  inputs=input_stagers.collect { |sg| sg.staged_path }
end

# Every user-supplied value is shell-escaped so that paths and predicates
# containing spaces or quotes reach spectrast as a single argument.
cmd=Shellwords.escape(spectrast_bin)

# ASCII output is the default for this tool, so binary output is switched off
# unless it was explicitly requested
cmd << " -c_BIN!" unless spectrast_tool.binary_output

if spectrast_tool.filter_predicate
  cmd << " -cf#{Shellwords.escape(spectrast_tool.filter_predicate)}"
end

# A merge method only makes sense when more than one library is given
if inputs.length > 1
  cmd << " -cJ#{Shellwords.escape(spectrast_tool.merge_operation)}"
end

if spectrast_tool.spectrum_operation!="None"
  cmd << " -cA#{Shellwords.escape(spectrast_tool.spectrum_operation)}"
end

if spectrast_tool.explicit_output.nil?
  output_file_name=Tool.default_output_path(inputs,"","","")
else
  output_file_name=spectrast_tool.explicit_output
end

cmd << " -cN#{Shellwords.escape(output_file_name)}"

inputs.each { |ip| cmd << " #{Shellwords.escape(ip)}" }

# code = spectrast_tool.run(cmd,genv)
# throw "Command failed with exit code #{code}" unless code==0

# Output is captured and discarded on success, as before. A non-zero exit
# status is reported on stderr; this script's own exit status is left
# unchanged.
spectrast_output = %x[#{cmd}]
unless $?.success?
  $stderr.puts spectrast_output unless spectrast_output.empty?
  $stderr.puts "Warning: spectrast exited with status #{$?.exitstatus}. Command was: #{cmd}"
end
|
data/lib/protk/command_runner.rb
CHANGED
@@ -30,7 +30,7 @@ class CommandRunner
|
|
30
30
|
def run_local(command_string)
|
31
31
|
@env.log("Command: #{command_string} started",:info)
|
32
32
|
status = Open4::popen4("#{command_string} ") do |pid, stdin, stdout, stderr|
|
33
|
-
|
33
|
+
@env.log "PID #{pid}" , :info
|
34
34
|
|
35
35
|
stdout.each { |line| @env.log(line.chomp,:info) }
|
36
36
|
|
@@ -0,0 +1,34 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<?xml-stylesheet type="text/xsl" href="/Users/icooke/Sources/protk/spec/data/mr176-BSA100fmole_BA3_01_8167.d_tandem_pproph.pep.xsl"?>
|
3
|
+
<msms_pipeline_analysis date="2014-06-22T15:28:36" summary_xml="/Users/icooke/Sources/protk/spec/data/mr176-BSA100fmole_BA3_01_8167.d_tandem_pproph.pep.xml" xmlns="http://regis-web.systemsbiology.net/pepXML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://regis-web.systemsbiology.net/pepXML /Users/icooke/bin/tpp/schema/pepXML_v117.xsd">
|
4
|
+
<analysis_summary analysis="peptideprophet" time="2014-06-22T15:28:36">
|
5
|
+
</analysis_summary>
|
6
|
+
<analysis_summary analysis="database_refresh" time="2014-06-22T15:28:36"/>
|
7
|
+
<analysis_summary analysis="interact" time="2014-06-22T15:28:36">
|
8
|
+
<interact_summary filename="/Users/icooke/Sources/protk/spec/data/mr176-BSA100fmole_BA3_01_8167.d_tandem_pproph.pep.xml" directory="">
|
9
|
+
<inputfile name="mr176-BSA100fmole_BA3_01_8167.d_tandem.pep.xml" directory="/Users/icooke/Sources/protk/spec/data"/>
|
10
|
+
</interact_summary>
|
11
|
+
</analysis_summary>
|
12
|
+
<dataset_derivation generation_no="0"/>
|
13
|
+
<msms_run_summary base_name="/Users/icooke/Sources/protk/spec/data/mr176-BSA100fmole_BA3_01_8167.d_tandem.tandem" search_engine="X! Tandem" raw_data_type="raw" raw_data=".?">
|
14
|
+
<sample_enzyme name="trypsin">
|
15
|
+
<specificity cut="KR" no_cut="P" sense="C"/>
|
16
|
+
</sample_enzyme>
|
17
|
+
<search_summary base_name="mr176-BSA100fmole_BA3_01_8167.d_tandem.tandem" search_engine="X! Tandem" precursor_mass_type="monoisotopic" fragment_mass_type="monoisotopic" search_id="1">
|
18
|
+
<search_database local_path="/Users/icooke/Sources/protk/spec/data/AASequences.fasta" type="AA"/>
|
19
|
+
<enzymatic_search_constraint enzyme="trypsin" max_num_internal_cleavages="2" min_number_termini="1"/>
|
20
|
+
<aminoacid_modification aminoacid="E" massdiff="-18.0106" mass="111.0320" variable="Y" symbol="^"/>
|
21
|
+
<!--X! Tandem n-terminal AA variable modification-->
|
22
|
+
<aminoacid_modification aminoacid="M" massdiff="15.9949" mass="147.0354" variable="Y"/>
|
23
|
+
<aminoacid_modification aminoacid="Q" massdiff="-17.0265" mass="111.0321" variable="Y" symbol="^"/>
|
24
|
+
<!--X! Tandem n-terminal AA variable modification-->
|
25
|
+
<terminal_modification terminus="n" massdiff="42.0106" mass="43.0184" protein_terminus="N" variable="Y" symbol="^"/>
|
26
|
+
|
27
|
+
</search_summary>
|
28
|
+
<analysis_timestamp analysis="peptideprophet" time="2014-06-22T15:28:36" id="1"/>
|
29
|
+
<analysis_timestamp analysis="database_refresh" time="2014-06-22T15:28:36" id="1">
|
30
|
+
<database_refresh_timestamp database="/Users/icooke/Sources/protk/spec/data/AASequences.fasta" min_num_enz_term="1"/>
|
31
|
+
</analysis_timestamp>
|
32
|
+
|
33
|
+
</msms_run_summary>
|
34
|
+
</msms_pipeline_analysis>
|
@@ -0,0 +1,39 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<protein_summary xmlns="http://regis-web.systemsbiology.net/protXML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://sashimi.sourceforge.net/schema_revision/protXML/protXML_v6.xsd" summary_xml="">
|
3
|
+
<protein_summary_header reference_database="FULLPATH_TO_REFERENCE_DB" residue_substitution_list="I -> L" source_files="FULLPATH_TO_SOURCE_PEPXML" source_files_alt="FULLPATH_TO_SOURCE_PEPXML" min_peptide_probability="" min_peptide_weight="" num_predicted_correct_prots="" num_input_1_spectra="" num_input_2_spectra="" num_input_3_spectra="" num_input_4_spectra="" num_input_5_spectra="" initial_min_peptide_prob="" total_no_spectrum_ids="" sample_enzyme="trypsin">
|
4
|
+
<program_details analysis="proteinprophet" time="2014-01-20T14:17:37" version=" Insilicos_LabKey_C++ (TPP v0.0 Development trunk rev 0, Build 201307090846 (linux))">
|
5
|
+
<proteinprophet_details occam_flag="Y" groups_flag="Y" degen_flag="Y" nsp_flag="Y" initial_peptide_wt_iters="2" nsp_distribution_iters="2" final_peptide_wt_iters="3">
|
6
|
+
<nsp_information neighboring_bin_smoothing="Y">
|
7
|
+
<nsp_distribution bin_no="0" nsp_lower_bound_incl="0.00" nsp_upper_bound_incl="0.00" pos_freq="0.057" neg_freq="0.625" pos_to_neg_ratio="0.09"/>
|
8
|
+
<nsp_distribution bin_no="1" nsp_lower_bound_excl="0.00" nsp_upper_bound_incl="0.31" pos_freq="0.037" neg_freq="0.152" pos_to_neg_ratio="0.24"/>
|
9
|
+
<nsp_distribution bin_no="2" nsp_lower_bound_excl="0.31" nsp_upper_bound_incl="1.00" pos_freq="0.077" neg_freq="0.032" pos_to_neg_ratio="2.42"/>
|
10
|
+
<nsp_distribution bin_no="3" nsp_lower_bound_excl="1.00" nsp_upper_bound_incl="2.50" pos_freq="0.113" neg_freq="0.033" pos_to_neg_ratio="3.39"/>
|
11
|
+
<nsp_distribution bin_no="4" nsp_lower_bound_excl="2.50" nsp_upper_bound_incl="4.63" pos_freq="0.123" neg_freq="0.032" pos_to_neg_ratio="3.91"/>
|
12
|
+
<nsp_distribution bin_no="5" nsp_lower_bound_excl="4.63" nsp_upper_bound_incl="7.90" pos_freq="0.143" neg_freq="0.032" pos_to_neg_ratio="4.50"/>
|
13
|
+
<nsp_distribution bin_no="6" nsp_lower_bound_excl="7.90" nsp_upper_bound_incl="14.92" pos_freq="0.196" neg_freq="0.041" pos_to_neg_ratio="4.78"/>
|
14
|
+
<nsp_distribution bin_no="7" nsp_lower_bound_excl="14.92" nsp_upper_bound_excl="inf" pos_freq="0.254" neg_freq="0.054" pos_to_neg_ratio="4.72" alt_pos_to_neg_ratio="4.78"/>
|
15
|
+
</nsp_information>
|
16
|
+
<ni_information>
|
17
|
+
</ni_information>
|
18
|
+
<protein_summary_data_filter min_probability="0.00" sensitivity="1.000" false_positive_error_rate="0.835" predicted_num_correct="1787" predicted_num_incorrect="9044"/>
|
19
|
+
<protein_summary_data_filter min_probability="0.10" sensitivity="1.000" false_positive_error_rate="0.235" predicted_num_correct="1787" predicted_num_incorrect="548"/>
|
20
|
+
<protein_summary_data_filter min_probability="0.20" sensitivity="1.000" false_positive_error_rate="0.235" predicted_num_correct="1787" predicted_num_incorrect="548"/>
|
21
|
+
<protein_summary_data_filter min_probability="0.30" sensitivity="0.956" false_positive_error_rate="0.151" predicted_num_correct="1709" predicted_num_incorrect="305"/>
|
22
|
+
<protein_summary_data_filter min_probability="0.40" sensitivity="0.916" false_positive_error_rate="0.095" predicted_num_correct="1638" predicted_num_incorrect="171"/>
|
23
|
+
<protein_summary_data_filter min_probability="0.50" sensitivity="0.887" false_positive_error_rate="0.063" predicted_num_correct="1585" predicted_num_incorrect="106"/>
|
24
|
+
<protein_summary_data_filter min_probability="0.60" sensitivity="0.853" false_positive_error_rate="0.036" predicted_num_correct="1525" predicted_num_incorrect="58"/>
|
25
|
+
<protein_summary_data_filter min_probability="0.70" sensitivity="0.826" false_positive_error_rate="0.020" predicted_num_correct="1477" predicted_num_incorrect="31"/>
|
26
|
+
<protein_summary_data_filter min_probability="0.80" sensitivity="0.805" false_positive_error_rate="0.012" predicted_num_correct="1438" predicted_num_incorrect="18"/>
|
27
|
+
<protein_summary_data_filter min_probability="0.90" sensitivity="0.773" false_positive_error_rate="0.006" predicted_num_correct="1381" predicted_num_incorrect="8"/>
|
28
|
+
<protein_summary_data_filter min_probability="0.95" sensitivity="0.749" false_positive_error_rate="0.004" predicted_num_correct="1339" predicted_num_incorrect="5"/>
|
29
|
+
<protein_summary_data_filter min_probability="0.96" sensitivity="0.738" false_positive_error_rate="0.003" predicted_num_correct="1318" predicted_num_incorrect="4"/>
|
30
|
+
<protein_summary_data_filter min_probability="0.97" sensitivity="0.728" false_positive_error_rate="0.002" predicted_num_correct="1302" predicted_num_incorrect="3"/>
|
31
|
+
<protein_summary_data_filter min_probability="0.98" sensitivity="0.711" false_positive_error_rate="0.002" predicted_num_correct="1272" predicted_num_incorrect="2"/>
|
32
|
+
<protein_summary_data_filter min_probability="0.99" sensitivity="0.609" false_positive_error_rate="0.000" predicted_num_correct="1088" predicted_num_incorrect="0"/>
|
33
|
+
<protein_summary_data_filter min_probability="1.00" sensitivity="0.164" false_positive_error_rate="0.000" predicted_num_correct="294" predicted_num_incorrect="0"/>
|
34
|
+
</proteinprophet_details>
|
35
|
+
</program_details>
|
36
|
+
</protein_summary_header>
|
37
|
+
<dataset_derivation generation_no="0">
|
38
|
+
</dataset_derivation>
|
39
|
+
</protein_summary>
|
@@ -0,0 +1,140 @@
|
|
1
|
+
require 'libxml'
|
2
|
+
|
3
|
+
include LibXML
|
4
|
+
|
5
|
+
# Wraps a parsed mzIdentML document and offers namespace-aware XPath lookups,
# both on the whole document (instance methods) and relative to a given node
# (class methods).
class MzIdentMLDoc < Object

  MZID_NS_PREFIX="mzidentml"
  MZID_NS='http://psidev.info/psi/pi/mzIdentML/1.1'

  # Parses the file at +path+ and keeps the resulting document for later queries.
  def initialize(path)
    @document = XML::Parser.file(path).parse
  end

  # All SpectrumIdentificationResult elements in the document
  def spectrum_queries
    MzIdentMLDoc.find(@document, "SpectrumIdentificationResult", true)
  end

  # All PeptideEvidence elements in the document
  def peptide_evidence
    MzIdentMLDoc.find(@document, "PeptideEvidence", true)
  end

  # All SpectrumIdentificationItem elements in the document
  def psms
    MzIdentMLDoc.find(@document, "SpectrumIdentificationItem", true)
  end

  # All ProteinAmbiguityGroup elements in the document
  def protein_groups
    MzIdentMLDoc.find(@document, "ProteinAmbiguityGroup", true)
  end

  # All ProteinDetectionHypothesis elements in the document
  def proteins
    MzIdentMLDoc.find(@document, "ProteinDetectionHypothesis", true)
  end

  # Peptides are referenced in many ways in mzidentml.
  # We define a "Peptide" as a peptide supporting a particular protein
  # Such peptides may encompass several PSM's
  #
  def peptides
    MzIdentMLDoc.find(@document, "PeptideHypothesis", true)
  end

  # -----------------------------------------------------------
  #
  # Class Level Utility methods for searching from a given node
  #
  # -----------------------------------------------------------

  class << self

    # Runs an XPath query for +expression+ in the mzIdentML namespace.
    # The query is anchored with "//" when +root+ is true, and with "./"
    # (relative to +node+) otherwise.
    def find(node,expression,root=false)
      anchor = root ? "//" : "./"
      node.find("#{anchor}#{MZID_NS_PREFIX}:#{expression}", "#{MZID_NS_PREFIX}:#{MZID_NS}")
    end

    # First cvParam directly under +mzidnode+ with the given accession (nil if none)
    def get_cvParam(mzidnode,accession)
      find(mzidnode, "cvParam[@accession='#{accession}']")[0]
    end

    # First DBSequence anywhere in the document with the given accession (nil if none)
    def get_dbsequence(mzidnode,accession)
      find(mzidnode, "DBSequence[@accession='#{accession}']", true)[0]
    end

    # As per PeptideShaker. Assume group probability used for protein if it is group rep otherwise 0
    def get_protein_probability(protein_node)
      # MS:1002403 marks the group representative
      return 0 if get_cvParam(protein_node, "MS:1002403").nil?

      # The group-level value is read from the parent node and scaled by 0.01
      get_cvParam(protein_node.parent, "MS:1002470").attributes['value'].to_f*0.01
    end

    # ProteinDetectionHypothesis children of +group_node+
    def get_proteins_for_group(group_node)
      find(group_node, "ProteinDetectionHypothesis")
    end

    # def self.get_sister_proteins(protein_node)
    #   self.find(protein_node.parent,"ProteinDetectionHypothesis")
    # end

    # PeptideHypothesis children of +protein_node+
    def get_peptides_for_protein(protein_node)
      find(protein_node, "PeptideHypothesis")
    end

    # <PeptideHypothesis peptideEvidence_ref="PepEv_1">
    #   <SpectrumIdentificationItemRef spectrumIdentificationItem_ref="SII_1_1"/>
    # </PeptideHypothesis>
    #
    # Resolves every SpectrumIdentificationItemRef under +peptide_node+ and
    # returns the referenced item with the highest MS:1002466 value. The first
    # item wins on ties; nil is returned when nothing scores above -1.
    def get_best_psm_for_peptide(peptide_node)
      top_psm = nil
      top_score = -1

      find(peptide_node, "SpectrumIdentificationItemRef").each do |ref_node|
        item_id = ref_node.attributes['spectrumIdentificationItem_ref']
        candidate = find(peptide_node, "SpectrumIdentificationItem[@id='#{item_id}']", true)[0]
        candidate_score = get_cvParam(candidate, "MS:1002466")['value'].to_f
        next unless candidate_score > top_score

        top_psm = candidate
        top_score = candidate_score
      end

      top_psm
    end

    # Follows peptideEvidence_ref -> PeptideEvidence -> peptide_ref -> Peptide
    # and returns the text content of its PeptideSequence child.
    def get_sequence_for_peptide(peptide_node)
      evidence_id = peptide_node.attributes['peptideEvidence_ref']
      peptide_id = find(peptide_node, "PeptideEvidence[@id='#{evidence_id}']", true)[0].attributes['peptide_ref']
      peptide_entry = find(peptide_node, "Peptide[@id='#{peptide_id}']", true)[0]
      find(peptide_entry, "PeptideSequence")[0].content
    end

    # Follows peptide_ref -> Peptide and returns the text content of its
    # PeptideSequence child.
    def get_sequence_for_psm(psm_node)
      peptide_id = psm_node.attributes['peptide_ref']
      peptide_entry = find(psm_node, "Peptide[@id='#{peptide_id}']", true)[0]
      find(peptide_entry, "PeptideSequence")[0].content
    end

    # Array with one PeptideEvidence node (or nil when unresolved) for each
    # PeptideEvidenceRef child of +psm_node+, in document order.
    def get_peptide_evidence_from_psm(psm_node)
      find(psm_node, "PeptideEvidenceRef").map do |ref_node|
        evidence_id = ref_node.attributes['peptideEvidence_ref']
        find(ref_node, "PeptideEvidence[@id='#{evidence_id}']", true)[0]
      end
    end

  end

end
|
data/lib/protk/mzml_parser.rb
CHANGED
@@ -14,6 +14,15 @@ class MzMLParser < Object
|
|
14
14
|
@file_reader=XML::Reader.document(doc)
|
15
15
|
end
|
16
16
|
|
17
|
+
# Advances the reader until it is positioned on a node named "run" and
# returns that node's 'id' attribute. Returns nil if the reader runs out of
# nodes first. If the reader is already on a "run" node it is not advanced.
def next_runid()
  while @file_reader.name != "run"
    return nil unless @file_reader.read()
  end
  @file_reader.get_attribute('id')
end
|
25
|
+
|
17
26
|
def next_spectrum()
|
18
27
|
|
19
28
|
until @file_reader.name=="spectrum"
|
data/lib/protk/peptide.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'libxml'
|
2
2
|
require 'bio'
|
3
3
|
require 'protk/bio_gff3_extensions'
|
4
|
+
require 'protk/mzidentml_doc'
|
4
5
|
require 'protk/error'
|
5
6
|
|
6
7
|
include LibXML
|
@@ -10,22 +11,55 @@ end
|
|
10
11
|
|
11
12
|
class Peptide
|
12
13
|
|
14
|
+
# Stripped sequence (no modifications)
|
13
15
|
attr_accessor :sequence
|
14
16
|
attr_accessor :protein_name
|
15
17
|
attr_accessor :charge
|
16
|
-
attr_accessor :
|
17
|
-
|
18
|
-
|
18
|
+
attr_accessor :probability
|
19
|
+
attr_accessor :theoretical_neutral_mass
|
20
|
+
|
21
|
+
# Builds a 'peptide' XML node whose attributes carry this peptide's sequence,
# charge, probability and theoretical neutral mass, each converted with to_s.
def as_protxml
  pep_node = XML::Node.new('peptide')
  {
    'peptide_sequence' => sequence,
    'charge' => charge,
    'nsp_adjusted_probability' => probability,
    'calc_neutral_pep_mass' => theoretical_neutral_mass
  }.each { |attr_name, attr_value| pep_node[attr_name] = attr_value.to_s }
  pep_node
end
|
19
29
|
|
20
30
|
class << self
|
21
31
|
# Creates a peptide from a protXML peptide node, reading the
# 'peptide_sequence', 'nsp_adjusted_probability' (as Float) and
# 'charge' (as Integer) attributes.
def from_protxml(xmlnode)
  new.tap do |peptide|
    peptide.sequence = xmlnode['peptide_sequence']
    peptide.probability = xmlnode['nsp_adjusted_probability'].to_f
    peptide.charge = xmlnode['charge'].to_i
  end
end
|
28
38
|
|
39
|
+
# <ProteinDetectionHypothesis id="PAG_0_1" dBSequence_ref="JEMP01000193.1_rev_g3500.t1 280755" passThreshold="false">
|
40
|
+
# <PeptideHypothesis peptideEvidence_ref="PepEv_1">
|
41
|
+
# <SpectrumIdentificationItemRef spectrumIdentificationItem_ref="SII_1_1"/>
|
42
|
+
# </PeptideHypothesis>
|
43
|
+
# <cvParam cvRef="PSI-MS" accession="MS:1002403" name="group representative"/>
|
44
|
+
# <cvParam cvRef="PSI-MS" accession="MS:1002401" name="leading protein"/>
|
45
|
+
# <cvParam cvRef="PSI-MS" accession="MS:1001093" name="sequence coverage" value="0.0"/>
|
46
|
+
# </ProteinDetectionHypothesis>
|
47
|
+
|
48
|
+
# Creates a peptide from an mzIdentML PeptideHypothesis node.
# The sequence comes from the node's peptide evidence. Probability
# (MS:1002466), theoretical neutral mass (MS:1001117) and charge come from
# the best PSM for the peptide. The protein name is the accession of the
# DBSequence referenced by the parent node.
def from_mzid(xmlnode)
  peptide = new
  peptide.sequence = MzIdentMLDoc.get_sequence_for_peptide(xmlnode)

  top_psm = MzIdentMLDoc.get_best_psm_for_peptide(xmlnode)
  peptide.probability = MzIdentMLDoc.get_cvParam(top_psm, "MS:1002466")['value'].to_f
  peptide.theoretical_neutral_mass = MzIdentMLDoc.get_cvParam(top_psm, "MS:1001117")['value'].to_f
  peptide.charge = top_psm.attributes['chargeState'].to_i

  db_ref = xmlnode.parent.attributes['dBSequence_ref']
  peptide.protein_name = MzIdentMLDoc.get_dbsequence(xmlnode.parent, db_ref).attributes['accession']

  # pep.charge = MzIdentMLDoc.get_charge_for_psm(best_psm)

  peptide
end
|
62
|
+
|
29
63
|
def from_sequence(seq,charge=nil)
|
30
64
|
pep=new()
|
31
65
|
pep.sequence=seq
|
@@ -146,7 +180,7 @@ class Peptide
|
|
146
180
|
cds_id = parent_record.id
|
147
181
|
this_id = "#{cds_id}.#{self.sequence}"
|
148
182
|
this_id << ".#{self.charge}" unless self.charge.nil?
|
149
|
-
score = self.
|
183
|
+
score = self.probability.nil? ? "." : self.probability.to_s
|
150
184
|
gff_string = "#{parent_record.seqid}\tMSMS\tpolypeptide\t#{start_i}\t#{end_i}\t#{score}\t#{parent_record.strand}\t0\tID=#{this_id};Parent=#{cds_id}"
|
151
185
|
Bio::GFF::GFF3::Record.new(gff_string)
|
152
186
|
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
include LibXML
|
2
|
+
|
3
|
+
# Builds a pepXML document by starting from the bundled template
# (data/template_pep.xml next to this file) and appending nodes to it.
class PepXMLWriter < Object

  PEPXML_NS_PREFIX="pepxml"
  PEPXML_NS="http://regis-web.systemsbiology.net/pepXML"

  # The parsed template document that nodes are appended to
  attr_reader :template_doc

  # Loads and parses the template that ships alongside this file.
  def initialize
    template_location = "#{File.dirname(__FILE__)}/data/template_pep.xml"
    @template_doc = XML::Parser.file(template_location).parse
  end

  # Appends +query_node+ as a child of the template document's root element.
  def append_spectrum_query(query_node)
    @template_doc.root << query_node
  end

  # Writes the document to +file_path+, indented and UTF-8 encoded.
  def save(file_path)
    @template_doc.save(file_path, :indent => true, :encoding => XML::Encoding::UTF_8)
  end

end
|