protk 1.4.1 → 1.4.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1 @@
1
# Mass constant for hydrogen. NOTE(review): presumably the average atomic mass in daltons (u) — confirm units against the code that uses it.
+ HYDROGEN_MASS=1.00794
data/lib/protk/protein.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require 'protk/peptide'
2
+ require 'protk/mzidentml_doc'
2
3
 
3
4
  include LibXML
4
5
 
@@ -14,6 +15,21 @@ class Protein
14
15
  attr_accessor :percent_coverage
15
16
  attr_accessor :peptides
16
17
 
18
# Serialises this protein as a protXML <protein> element.
#
# The name, indistinguishable-protein count, probability and percent coverage
# are written as attributes (each converted with to_s). One child node is
# appended per peptide, built by that peptide's own as_protxml.
#
# unique_stripped_peptides lists every distinct peptide sequence once, joined
# with "+", in first-seen order. total_number_peptides still counts every
# peptide entry, duplicates included.
#
# Returns the newly built XML::Node.
def as_protxml
  node = XML::Node.new('protein')
  node['protein_name'] = protein_name.to_s
  node['n_indistinguishable_proteins'] = n_indistinguishable_proteins.to_s
  node['probability'] = probability.to_s
  node['percent_coverage'] = percent_coverage.to_s
  # De-duplicate so a sequence shared by several peptide entries is listed once.
  node['unique_stripped_peptides'] = peptides.map(&:sequence).uniq.join('+')
  node['total_number_peptides'] = peptides.length.to_s
  peptides.each { |peptide| node << peptide.as_protxml }
  node
end
31
+
32
+
17
33
  class << self
18
34
 
19
35
  # <protein_group group_number="1" probability="1.0000">
@@ -46,6 +62,52 @@ class Protein
46
62
  prot.peptides = peptide_nodes.collect { |e| Peptide.from_protxml(e) }
47
63
  prot
48
64
  end
65
+
66
+
67
+ # <ProteinAmbiguityGroup id="PAG_0">
68
+ # <ProteinDetectionHypothesis id="PAG_0_1" dBSequence_ref="JEMP01000193.1_rev_g3500.t1 280755" passThreshold="false">
69
+ # <PeptideHypothesis peptideEvidence_ref="PepEv_1">
70
+ # <SpectrumIdentificationItemRef spectrumIdentificationItem_ref="SII_1_1"/>
71
+ # </PeptideHypothesis>
72
+ # <cvParam cvRef="PSI-MS" accession="MS:1002403" name="group representative"/>
73
+ # <cvParam cvRef="PSI-MS" accession="MS:1002401" name="leading protein"/>
74
+ # <cvParam cvRef="PSI-MS" accession="MS:1001093" name="sequence coverage" value="0.0"/>
75
+ # </ProteinDetectionHypothesis>
76
+ # <cvParam cvRef="PSI-MS" accession="MS:1002470" name="PeptideShaker protein group score" value="0.0"/>
77
+ # <cvParam cvRef="PSI-MS" accession="MS:1002471" name="PeptideShaker protein group confidence" value="0.0"/>
78
+ # <cvParam cvRef="PSI-MS" accession="MS:1002545" name="PeptideShaker protein confidence type" value="Not Validated"/>
79
+ # <cvParam cvRef="PSI-MS" accession="MS:1002415" name="protein group passes threshold" value="false"/>
80
+ # </ProteinAmbiguityGroup>
81
+
82
+
83
+ # Note:
84
+ # This is hacked together to work for a specific PeptideShaker output type
85
+ # Refactor and properly respect cvParams for real conversion
86
+ #
87
# Builds a Protein from an mzIdentML <ProteinDetectionHypothesis> node.
#
# Note: this targets one specific PeptideShaker output layout; cvParams are
# looked up by fixed accession rather than interpreted generally.
#
# xmlnode - the ProteinDetectionHypothesis node. Its parent is treated as the
#           enclosing ProteinAmbiguityGroup.
#
# group_probability and percent_coverage are only assigned when the
# corresponding cvParam (MS:1002470 on the group, MS:1001093 on the protein)
# is found; otherwise they are left unset.
#
# Returns the populated Protein.
def from_mzid(xmlnode)
  prot = new
  groupnode = xmlnode.parent

  # Group ids look like "PAG_0"; the trailing number is shifted to start at 1.
  prot.group_number = groupnode.attributes['id'].split('_').last.to_i + 1
  db_sequence = MzIdentMLDoc.get_dbsequence(xmlnode, xmlnode.attributes['dBSequence_ref'])
  prot.protein_name = db_sequence.attributes['accession']
  prot.n_indistinguishable_proteins = MzIdentMLDoc.get_proteins_for_group(groupnode).length

  # Guarded the same way as the coverage cvParam below, so a group without a
  # PeptideShaker score does not fail with NoMethodError on nil.
  score_node = MzIdentMLDoc.get_cvParam(groupnode, 'MS:1002470')
  prot.group_probability = score_node.attributes['value'].to_f if score_node

  coverage_node = MzIdentMLDoc.get_cvParam(xmlnode, 'MS:1001093')
  prot.percent_coverage = coverage_node.attributes['value'].to_f if coverage_node

  prot.probability = MzIdentMLDoc.get_protein_probability(xmlnode)

  peptide_nodes = MzIdentMLDoc.get_peptides_for_protein(xmlnode)
  prot.peptides = peptide_nodes.map { |peptide_node| Peptide.from_mzid(peptide_node) }
  prot
end
109
+
110
+
49
111
  private :new
50
112
  end
51
113
 
@@ -62,11 +124,12 @@ class Protein
62
124
  if best_peptides[seq].nil?
63
125
  best_peptides[seq]=peptide
64
126
  else
65
- best_peptides[seq]=peptide if peptide.nsp_adjusted_probability > best_peptides[seq].nsp_adjusted_probability
127
+ best_peptides[seq]=peptide if peptide.probability > best_peptides[seq].probability
66
128
  end
67
129
  end
68
130
 
69
131
  best_peptides.values
70
132
  end
71
133
 
134
+
72
135
  end
@@ -0,0 +1,70 @@
1
+
2
+ require 'protk/peptide'
3
+ require 'protk/protein'
4
+ require 'protk/mzidentml_doc'
5
+ require 'protk/protxml_writer'
6
+
7
+ include LibXML
8
+
9
+
10
# A group of proteins read from an mzIdentML <ProteinAmbiguityGroup> that can
# be written out as a protXML <protein_group> element.
#
# Instances are created through ProteinGroup.from_mzid; new is private.
class ProteinGroup
  attr_accessor :group_number, :group_probability, :proteins

  # Builds a ProteinGroup from a <ProteinAmbiguityGroup> node such as:
  #
  #   <ProteinAmbiguityGroup id="PAG_0">
  #     <ProteinDetectionHypothesis id="PAG_0_1" dBSequence_ref="JEMP01000193.1_rev_g3500.t1 280755" passThreshold="false">
  #       <PeptideHypothesis peptideEvidence_ref="PepEv_1">
  #         <SpectrumIdentificationItemRef spectrumIdentificationItem_ref="SII_1_1"/>
  #       </PeptideHypothesis>
  #       <cvParam cvRef="PSI-MS" accession="MS:1002403" name="group representative"/>
  #       <cvParam cvRef="PSI-MS" accession="MS:1002401" name="leading protein"/>
  #       <cvParam cvRef="PSI-MS" accession="MS:1001093" name="sequence coverage" value="0.0"/>
  #     </ProteinDetectionHypothesis>
  #     <cvParam cvRef="PSI-MS" accession="MS:1002470" name="PeptideShaker protein group score" value="0.0"/>
  #     <cvParam cvRef="PSI-MS" accession="MS:1002471" name="PeptideShaker protein group confidence" value="0.0"/>
  #     <cvParam cvRef="PSI-MS" accession="MS:1002545" name="PeptideShaker protein confidence type" value="Not Validated"/>
  #     <cvParam cvRef="PSI-MS" accession="MS:1002415" name="protein group passes threshold" value="false"/>
  #   </ProteinAmbiguityGroup>
  #
  # Note: written against one specific PeptideShaker output layout. A real
  # conversion should be refactored to respect cvParams properly.
  #
  # groupnode - the ProteinAmbiguityGroup node.
  #
  # Returns the populated ProteinGroup.
  def self.from_mzid(groupnode)
    new.tap do |group|
      # Ids look like "PAG_0"; the numeric suffix is shifted to start at 1.
      id_suffix = groupnode.attributes['id'].split("_").last
      group.group_number = id_suffix.to_i + 1

      score_param = MzIdentMLDoc.get_cvParam(groupnode, "MS:1002470")
      group.group_probability = score_param.attributes['value'].to_f

      group.proteins = MzIdentMLDoc.get_proteins_for_group(groupnode).map do |protein_node|
        Protein.from_mzid(protein_node)
      end
    end
  end

  private_class_method :new

  # Takes no arguments; all fields are filled in afterwards by from_mzid.
  def initialize
  end

  # Renders the group as a <protein_group> node carrying group_number and
  # group_probability attributes, with one child per protein produced by
  # that protein's as_protxml.
  #
  # Returns the new XML::Node.
  def as_protxml
    group_node = XML::Node.new('protein_group')
    group_node["group_number"] = group_number.to_s
    group_node["group_probability"] = group_probability.to_s
    proteins.each do |protein|
      group_node << protein.as_protxml
    end
    group_node
  end
end
@@ -0,0 +1,27 @@
1
+ include LibXML
2
+
3
# Collects protein group nodes under the root of a template protXML document
# and writes the resulting document to a file.
class ProtXMLWriter
  PROTXML_NS_PREFIX = "protxml"
  PROTXML_NS = "http://regis-web.systemsbiology.net/protXML"

  # template_doc         - the document parsed from the bundled template.
  # protein_summary_node - the root node of that document.
  attr_reader :template_doc, :protein_summary_node

  # Parses data/template_prot.xml (located relative to this source file) and
  # keeps both the document and its root node.
  def initialize
    template_path = File.dirname(__FILE__) + "/data/template_prot.xml"
    @template_doc = XML::Parser.file(template_path).parse
    @protein_summary_node = @template_doc.root
  end

  # Appends pg_node as a child of the template's root node.
  #
  # Returns whatever the root node's << returns.
  def append_protein_group(pg_node)
    protein_summary_node << pg_node
  end

  # Saves the document to file_path, indented and UTF-8 encoded.
  def save(file_path)
    save_options = { :indent => true, :encoding => XML::Encoding::UTF_8 }
    template_doc.save(file_path, save_options)
  end
end
data/lib/protk/psm.rb ADDED
@@ -0,0 +1,222 @@
1
+
2
+ require 'protk/mzidentml_doc'
3
+ require 'libxml'
4
+
5
+ include LibXML
6
+
7
+
8
class String
  # Interprets the string as a boolean flag, ignoring case.
  #
  # "true", "t", "yes", "y" and "1" give true; "false", "f", "no", "n" and
  # "0" give false. The whole string must be one of these words: anchoring
  # with \A...\Z (rather than ^...$, which match per line) rejects multi-line
  # input such as "junk\ntrue" while still accepting a single trailing newline.
  #
  # Returns true or false.
  # Raises ArgumentError for any other content, including the empty string.
  def to_bool
    return true if self =~ /\A(true|t|yes|y|1)\Z/i
    return false if self =~ /\A(false|f|no|n|0)\Z/i

    raise ArgumentError, "invalid value for Boolean: \"#{self}\""
  end
end
15
+
16
# One placement of a peptide within a protein, read from an mzIdentML
# <PeptideEvidence> element and written to pepXML as an <alternative_protein>.
#
# Instances are created through PeptideEvidence.from_mzid; new is private.
class PeptideEvidence
  attr_accessor :peptide_prev_aa
  attr_accessor :peptide_next_aa
  attr_accessor :protein
  attr_accessor :protein_descr
  attr_accessor :is_decoy

  class << self
    # Builds a PeptideEvidence from a node such as:
    #
    #   <PeptideEvidence isDecoy="false" pre="K" post="G" start="712"
    #       end="722" peptide_ref="KSPVYKVHFTR"
    #       dBSequence_ref="JEMP01000193.1_rev_g3500.t1" id="PepEv_1" />
    #
    # The referenced protein is looked up as a DBSequence by id:
    #
    #   <DBSequence id="JEMP01000193.1_rev_g3500.t1"
    #       accession="JEMP01000193.1_rev_g3500.t1"
    #       searchDatabase_ref="SearchDB_1">
    #     <cvParam cvRef="PSI-MS" accession="MS:1001088"
    #         name="protein description" value="280755|283436" />
    #   </DBSequence>
    #
    # pe_node - the PeptideEvidence node.
    #
    # A missing isDecoy attribute is treated as false, and a DBSequence with
    # no MS:1001088 cvParam leaves protein_descr as nil; previously either
    # case failed with NoMethodError on nil.
    #
    # Returns the populated PeptideEvidence.
    def from_mzid(pe_node)
      pe = new
      pe.peptide_prev_aa = pe_node.attributes['pre']
      pe.peptide_next_aa = pe_node.attributes['post']

      decoy_flag = pe_node.attributes['isDecoy']
      pe.is_decoy = decoy_flag.nil? ? false : decoy_flag.to_bool

      prot_ref = pe_node.attributes['dBSequence_ref']
      prot_node = MzIdentMLDoc.find(pe_node, "DBSequence[@id='#{prot_ref}']", true)[0]
      pe.protein = prot_node.attributes['accession']

      descr_param = MzIdentMLDoc.get_cvParam(prot_node, "MS:1001088")
      pe.protein_descr = descr_param['value'] if descr_param

      pe
    end

    private :new
  end

  # Takes no arguments; fields are filled in afterwards by from_mzid.
  def initialize
  end

  # Renders this evidence as a pepXML node such as:
  #
  #   <alternative_protein protein="lcl|JEMP01000005.1_rev_g4624.t1"
  #       protein_descr="652491|654142" num_tol_term="2"
  #       peptide_prev_aa="K" peptide_next_aa="Y"/>
  #
  # Used only for alternative proteins: the first evidence item of a PSM is
  # written into the attributes of the search hit itself instead.
  #
  # Attributes whose value is nil are omitted rather than written.
  #
  # Returns the new XML::Node.
  def as_pepxml
    alt_node = XML::Node.new('alternative_protein')
    {
      'protein' => protein,
      'protein_descr' => protein_descr,
      'peptide_prev_aa' => peptide_prev_aa,
      'peptide_next_aa' => peptide_next_aa
    }.each do |name, value|
      alt_node[name] = value unless value.nil?
    end
    alt_node
  end
end
80
+
81
+ # <spectrum_query spectrum="mr176-BSA100fmole_BA3_01_8167.00003.00003.2" start_scan="3" end_scan="3"
82
+ #precursor_neutral_mass="1398.7082" assumed_charge="2" index="2" experiment_label="mr176">
83
+ # <search_result>
84
+ # <search_hit hit_rank="1" peptide="SQVFQLESTFDV" peptide_prev_aa="R" peptide_next_aa="K" protein="tr|Q90853|Q90853_CHICK"
85
+ # protein_descr="Homeobox protein OS=Gallus gallus GN=GH6 PE=2 SV=1" num_tot_proteins="1"
86
+ # num_matched_ions="9" tot_num_ions="22" calc_neutral_pep_mass="1380.6557" massdiff="18.053" num_tol_term="1"
87
+ # num_missed_cleavages="0" is_rejected="0">
88
+ # <search_score name="hyperscore" value="23.9"/>
89
+ # <search_score name="nextscore" value="19.3"/>
90
+ # <search_score name="bscore" value="9.6"/>
91
+ # <search_score name="yscore" value="7.6"/>
92
+ # <search_score name="cscore" value="0"/>
93
+ # <search_score name="zscore" value="0"/>
94
+ # <search_score name="ascore" value="0"/>
95
+ # <search_score name="xscore" value="0"/>
96
+ # <search_score name="expect" value="0.099"/>
97
+ # <analysis_result analysis="peptideprophet">
98
+ # <peptideprophet_result probability="0.9997" all_ntt_prob="(0.0000,0.9997,0.9999)">
99
+ # <search_score_summary>
100
+ # <parameter name="fval" value="2.3571"/>
101
+ # <parameter name="ntt" value="1"/>
102
+ # <parameter name="nmc" value="0"/>
103
+ # <parameter name="massd" value="18.053"/>
104
+ # </search_score_summary>
105
+ # </peptideprophet_result>
106
+ # </analysis_result>
107
+ # </search_hit>
108
+ # </search_result>
109
+ # </spectrum_query>
110
+
111
# A peptide-spectrum match read from an mzIdentML
# <SpectrumIdentificationItem> and written to pepXML as a <search_result>
# wrapping a single <search_hit>.
#
# Instances are created through PSM.from_mzid; new is private.
class PSM
  attr_accessor :peptide, :calculated_mz, :experimental_mz, :charge
  attr_accessor :scores, :peptide_evidence

  class << self
    # Builds a PSM from a <SpectrumIdentificationItem> node, shown here inside
    # its parent <SpectrumIdentificationResult>:
    #
    #   <SpectrumIdentificationResult spectraData_ref="ma201_Vp_1-10.mzML.mgf"
    #       spectrumID="index=3152" id="SIR_1">
    #     <SpectrumIdentificationItem passThreshold="false"
    #         rank="1" peptide_ref="KSPVYKVHFTR"
    #         calculatedMassToCharge="1360.7615466836999"
    #         experimentalMassToCharge="1362.805053710938"
    #         chargeState="1" id="SII_1_1">
    #       <PeptideEvidenceRef peptideEvidence_ref="PepEv_1" />
    #       <Fragmentation>
    #         <IonType charge="1" index="1 4">
    #           <FragmentArray measure_ref="Measure_MZ"
    #               values="175.2081208 560.3388993" />
    #           <FragmentArray measure_ref="Measure_Int"
    #               values="94.0459823608 116.2766723633" />
    #           <FragmentArray measure_ref="Measure_Error"
    #               values="0.08916864948798775 0.0449421494880653" />
    #           <cvParam cvRef="PSI-MS" accession="MS:1001220"
    #               name="frag: y ion" />
    #         </IonType>
    #       </Fragmentation>
    #       <cvParam cvRef="PSI-MS" accession="MS:1002466"
    #           name="PeptideShaker PSM score" value="0.0" />
    #       <cvParam cvRef="PSI-MS" accession="MS:1002467"
    #           name="PeptideShaker PSM confidence" value="0.0" />
    #       <cvParam cvRef="PSI-MS" accession="MS:1002052"
    #           name="MS-GF:SpecEValue" value="1.4757611E-6" />
    #       <cvParam cvRef="PSI-MS" accession="MS:1001117"
    #           name="theoretical mass" value="1360.7615466836999" />
    #       <cvParam cvRef="PSI-MS" accession="MS:1002543"
    #           name="PeptideShaker PSM confidence type"
    #           value="Not Validated" />
    #     </SpectrumIdentificationItem>
    #     <cvParam cvRef="PSI-MS" accession="MS:1000796"
    #         name="spectrum title"
    #         value="Suresh Vp 1 to 10_BAF.3535.3535.1" />
    #     <cvParam cvRef="PSI-MS" accession="MS:1000894"
    #         name="retention time" value="6855.00001" unitCvRef="UO"
    #         unitAccession="UO:0000010" unitName="seconds" />
    #   </SpectrumIdentificationResult>
    #
    # psm_node - the node whose calculatedMassToCharge,
    #            experimentalMassToCharge and chargeState attributes are read.
    #
    # The scores accessor is not populated here.
    #
    # Returns the populated PSM.
    def from_mzid(psm_node)
      new.tap do |psm|
        psm.peptide = MzIdentMLDoc.get_sequence_for_psm(psm_node)

        evidence_nodes = MzIdentMLDoc.get_peptide_evidence_from_psm(psm_node)
        psm.peptide_evidence = evidence_nodes.map do |evidence_node|
          PeptideEvidence.from_mzid(evidence_node)
        end

        psm.calculated_mz = psm_node.attributes['calculatedMassToCharge'].to_f
        psm.experimental_mz = psm_node.attributes['experimentalMassToCharge'].to_f
        psm.charge = psm_node.attributes['chargeState'].to_i
      end
    end

    private :new
  end

  # Takes no arguments; fields are filled in afterwards by from_mzid.
  def initialize
  end

  # Renders the match as a pepXML <search_result> containing one <search_hit>.
  # Target shape of the hit:
  #
  #   <search_hit hit_rank="1" peptide="GGYNQDGGSGGGYQGGGGYSGGGGGYQGGQR"
  #       peptide_prev_aa="R" peptide_next_aa="N"
  #       protein="lcl|JEMP01000008.1_fwd_g5144.t1"
  #       num_tot_proteins="1"
  #       calc_neutral_pep_mass="2768.11967665812"
  #       massdiff="0.120361328125"
  #       protein_descr="4860|5785"
  #       num_tol_term="2"
  #       num_missed_cleavages="0">
  #
  # Of these, only peptide, peptide_prev_aa, peptide_next_aa, protein,
  # protein_descr and num_tot_proteins are written by this method. The first
  # evidence item supplies the hit's own protein attributes; every remaining
  # item is appended as a child via its as_pepxml.
  #
  # search_hit appears to always be wrapped 1:1 in a search_result, so the
  # wrapper is what gets returned.
  def as_pepxml
    hit = XML::Node.new('search_hit')
    hit['peptide'] = peptide.to_s

    lead = peptide_evidence.first
    %w[peptide_prev_aa peptide_next_aa protein protein_descr].each do |field|
      hit[field] = lead.public_send(field)
    end

    hit['num_tot_proteins'] = peptide_evidence.length.to_s

    peptide_evidence.drop(1).each do |alternative|
      hit << alternative.as_pepxml
    end

    XML::Node.new('search_result').tap { |result| result << hit }
  end
end
@@ -5,8 +5,7 @@
5
5
  # Provides common functionality used by all msms search tools.
6
6
  #
7
7
  # It allows;
8
- # 1. Specification of the search database using a simple name ... this class provides the necessary search for the actual file
9
- # 2. Output files to be specified via a prefix or suffix to be added to the name of the corresponding input file
8
+ # 1. Output files to be specified via a prefix or suffix to be added to the name of the corresponding input file
10
9
  #
11
10
 
12
11
  require 'optparse'
@@ -21,10 +20,6 @@ class SearchTool < Tool
21
20
  def initialize(option_support=[])
22
21
  super(option_support)
23
22
 
24
- # if (option_support.include? :database)
25
- # add_value_option(:database,"sphuman",['-d', '--database dbname', 'Specify the database to use for this search. Can be a named protk database or the path to a fasta file'])
26
- # end
27
-
28
23
  if ( option_support.include? :enzyme )
29
24
  add_value_option(:enzyme,"Trypsin",['--enzyme enz', 'Enzyme'])
30
25
  end
@@ -0,0 +1,35 @@
1
+
2
# Guesses the format of a mass-spectrometry data file from its first lines.
class Sniffer
  # Number of leading lines inspected when sniffing a file.
  @sniff_lines = 100

  # Detects the format of the file at filepath.
  #
  # Returns "mgf" or "mzML", or nil when neither format is recognised.
  def self.sniff_format(filepath)
    if is_mgf_format(filepath)
      return "mgf"
    elsif is_mzml_format(filepath)
      return "mzML"
    end
    nil
  end

  # Returns true when an <mzML element followed by the PSI mzML namespace URI
  # appears within the inspected head of the file, false otherwise.
  #
  # The match is multi-line (/m) so that a root element whose xmlns attribute
  # sits on a later line than the tag name is still recognised.
  def self.is_mzml_format(filepath)
    !!(head_of(filepath) =~ /\<mzML.*http\:\/\/psi\.hupo\.org\/ms\/mzml/m)
  end

  # Returns true when some line in the inspected head of the file starts with
  # "BEGIN IONS", false otherwise.
  def self.is_mgf_format(filepath)
    !!(head_of(filepath) =~ /^BEGIN IONS/)
  end

  # Reads the first @sniff_lines lines of filepath as one string.
  #
  # The file is opened in binary mode so that bytes which are not valid text
  # in the default encoding cannot make the pattern matches raise; such files
  # simply fail to match.
  def self.head_of(filepath)
    File.open(filepath, 'rb') { |io| io.each_line.first(@sniff_lines).join }
  end
  private_class_method :head_of
end