protk 1.4.1 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
# Mass of hydrogen used by the library; presumably the average atomic mass in unified atomic mass units (u) — TODO confirm against callers.
+ HYDROGEN_MASS=1.00794
data/lib/protk/protein.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require 'protk/peptide'
2
+ require 'protk/mzidentml_doc'
2
3
 
3
4
  include LibXML
4
5
 
@@ -14,6 +15,21 @@ class Protein
14
15
  attr_accessor :percent_coverage
15
16
  attr_accessor :peptides
16
17
 
18
# Serialises this protein as a protXML <protein> element.
#
# The protein-level values are written as string attributes, and every
# peptide contributes one child element produced by its own as_protxml.
#
# @return [XML::Node] the populated <protein> node
def as_protxml
  protein_node = XML::Node.new('protein')

  # Insertion order of the hash fixes the order the attributes are set in.
  {
    'protein_name' => protein_name,
    'n_indistinguishable_proteins' => n_indistinguishable_proteins,
    'probability' => probability,
    'percent_coverage' => percent_coverage
  }.each { |attr_name, attr_value| protein_node[attr_name] = attr_value.to_s }

  # Peptide sequences are concatenated with "+" into a single attribute.
  protein_node['unique_stripped_peptides'] = peptides.map { |pep| pep.sequence }.join("+")
  protein_node['total_number_peptides'] = peptides.length.to_s

  peptides.each { |pep| protein_node << pep.as_protxml }
  protein_node
end
31
+
32
+
17
33
  class << self
18
34
 
19
35
  # <protein_group group_number="1" probability="1.0000">
@@ -46,6 +62,52 @@ class Protein
46
62
  prot.peptides = peptide_nodes.collect { |e| Peptide.from_protxml(e) }
47
63
  prot
48
64
  end
65
+
66
+
67
+ # <ProteinAmbiguityGroup id="PAG_0">
68
+ # <ProteinDetectionHypothesis id="PAG_0_1" dBSequence_ref="JEMP01000193.1_rev_g3500.t1 280755" passThreshold="false">
69
+ # <PeptideHypothesis peptideEvidence_ref="PepEv_1">
70
+ # <SpectrumIdentificationItemRef spectrumIdentificationItem_ref="SII_1_1"/>
71
+ # </PeptideHypothesis>
72
+ # <cvParam cvRef="PSI-MS" accession="MS:1002403" name="group representative"/>
73
+ # <cvParam cvRef="PSI-MS" accession="MS:1002401" name="leading protein"/>
74
+ # <cvParam cvRef="PSI-MS" accession="MS:1001093" name="sequence coverage" value="0.0"/>
75
+ # </ProteinDetectionHypothesis>
76
+ # <cvParam cvRef="PSI-MS" accession="MS:1002470" name="PeptideShaker protein group score" value="0.0"/>
77
+ # <cvParam cvRef="PSI-MS" accession="MS:1002471" name="PeptideShaker protein group confidence" value="0.0"/>
78
+ # <cvParam cvRef="PSI-MS" accession="MS:1002545" name="PeptideShaker protein confidence type" value="Not Validated"/>
79
+ # <cvParam cvRef="PSI-MS" accession="MS:1002415" name="protein group passes threshold" value="false"/>
80
+ # </ProteinAmbiguityGroup>
81
+
82
+
83
+ # Note:
84
+ # This is hacked together to work for a specific PeptideShaker output type
85
+ # Refactor and properly respect cvParams for real conversion
86
+ #
87
# Builds a protein from an mzIdentML <ProteinDetectionHypothesis> node.
#
# Group-level values (group number, number of proteins in the group and the
# group score) are read from the node's parent, which in the example above
# is the enclosing <ProteinAmbiguityGroup>.
#
# @param xmlnode [XML::Node] the ProteinDetectionHypothesis node
# @return [Protein] the populated protein; percent_coverage is left unset
#   when the node has no MS:1001093 ("sequence coverage") cvParam
def from_mzid(xmlnode)
  prot = new
  groupnode = xmlnode.parent

  # Group ids look like "PAG_0": the trailing index is zero-based, while
  # group numbers are one-based.
  prot.group_number = groupnode.attributes['id'].split("_").last.to_i + 1

  dbsequence = MzIdentMLDoc.get_dbsequence(xmlnode, xmlnode.attributes['dBSequence_ref'])
  prot.protein_name = dbsequence.attributes['accession']
  prot.n_indistinguishable_proteins = MzIdentMLDoc.get_proteins_for_group(groupnode).length

  # MS:1002470 is the "PeptideShaker protein group score" cvParam.
  prot.group_probability = MzIdentMLDoc.get_cvParam(groupnode, "MS:1002470").attributes['value'].to_f

  # MS:1001093 is "sequence coverage"; only assigned when the cvParam is present.
  coverage_node = MzIdentMLDoc.get_cvParam(xmlnode, "MS:1001093")
  prot.percent_coverage = coverage_node.attributes['value'].to_f if coverage_node

  prot.probability = MzIdentMLDoc.get_protein_probability(xmlnode)

  peptide_nodes = MzIdentMLDoc.get_peptides_for_protein(xmlnode)
  prot.peptides = peptide_nodes.collect { |e| Peptide.from_mzid(e) }
  prot
end
109
+
110
+
49
111
  private :new
50
112
  end
51
113
 
@@ -62,11 +124,12 @@ class Protein
62
124
  if best_peptides[seq].nil?
63
125
  best_peptides[seq]=peptide
64
126
  else
65
- best_peptides[seq]=peptide if peptide.nsp_adjusted_probability > best_peptides[seq].nsp_adjusted_probability
127
+ best_peptides[seq]=peptide if peptide.probability > best_peptides[seq].probability
66
128
  end
67
129
  end
68
130
 
69
131
  best_peptides.values
70
132
  end
71
133
 
134
+
72
135
  end
@@ -0,0 +1,70 @@
1
+
2
+ require 'protk/peptide'
3
+ require 'protk/protein'
4
+ require 'protk/mzidentml_doc'
5
+ require 'protk/protxml_writer'
6
+
7
+ include LibXML
8
+
9
+
10
# A group of proteins read from an mzIdentML <ProteinAmbiguityGroup> that can
# be written out as a protXML <protein_group> element.
class ProteinGroup

  attr_accessor :group_number
  attr_accessor :group_probability
  attr_accessor :proteins

  class << self

    # <ProteinAmbiguityGroup id="PAG_0">
    #   <ProteinDetectionHypothesis id="PAG_0_1" dBSequence_ref="JEMP01000193.1_rev_g3500.t1 280755" passThreshold="false">
    #     <PeptideHypothesis peptideEvidence_ref="PepEv_1">
    #       <SpectrumIdentificationItemRef spectrumIdentificationItem_ref="SII_1_1"/>
    #     </PeptideHypothesis>
    #     <cvParam cvRef="PSI-MS" accession="MS:1002403" name="group representative"/>
    #     <cvParam cvRef="PSI-MS" accession="MS:1002401" name="leading protein"/>
    #     <cvParam cvRef="PSI-MS" accession="MS:1001093" name="sequence coverage" value="0.0"/>
    #   </ProteinDetectionHypothesis>
    #   <cvParam cvRef="PSI-MS" accession="MS:1002470" name="PeptideShaker protein group score" value="0.0"/>
    #   <cvParam cvRef="PSI-MS" accession="MS:1002471" name="PeptideShaker protein group confidence" value="0.0"/>
    #   <cvParam cvRef="PSI-MS" accession="MS:1002545" name="PeptideShaker protein confidence type" value="Not Validated"/>
    #   <cvParam cvRef="PSI-MS" accession="MS:1002415" name="protein group passes threshold" value="false"/>
    # </ProteinAmbiguityGroup>

    # Builds a group from a <ProteinAmbiguityGroup> node.
    #
    # Note:
    # This is hacked together to work for a specific PeptideShaker output type
    # Refactor and properly respect cvParams for real conversion
    #
    # @param groupnode [XML::Node] the ProteinAmbiguityGroup node
    # @return [ProteinGroup] group with number, probability and proteins set
    def from_mzid(groupnode)
      group = new

      # Ids look like "PAG_0": the trailing index is zero-based, while group
      # numbers are one-based.
      group.group_number = groupnode.attributes['id'].split("_").last.to_i + 1

      # MS:1002470 is the "PeptideShaker protein group score" cvParam.
      group.group_probability = MzIdentMLDoc.get_cvParam(groupnode, "MS:1002470").attributes['value'].to_f

      protein_nodes = MzIdentMLDoc.get_proteins_for_group(groupnode)
      group.proteins = protein_nodes.collect { |e| Protein.from_mzid(e) }
      group
    end

    # Instances are only created through from_mzid.
    private :new
  end

  def initialize
  end

  # Serialises the group as a protXML <protein_group> element with one child
  # per protein.
  #
  # protXML names the group probability attribute "probability"
  # (<protein_group group_number="1" probability="1.0000">), so that name is
  # written. "group_probability" was the only name emitted previously and is
  # kept so that anything reading the older output keeps working.
  #
  # @return [XML::Node] the populated <protein_group> node
  def as_protxml
    node = XML::Node.new('protein_group')
    probability_text = group_probability.to_s

    node["group_number"] = group_number.to_s
    node["probability"] = probability_text
    node["group_probability"] = probability_text

    proteins.each { |prot| node << prot.as_protxml }
    node
  end

end
@@ -0,0 +1,27 @@
1
+ include LibXML
2
+
3
# Collects protXML <protein_group> nodes under the root element of a template
# document and writes the resulting document to a file.
class ProtXMLWriter

  PROTXML_NS_PREFIX = "protxml"
  PROTXML_NS = "http://regis-web.systemsbiology.net/protXML"

  # The parsed template document and its root element.
  attr_reader :template_doc, :protein_summary_node

  # Parses data/template_prot.xml, located relative to this source file, and
  # remembers the document together with its root node.
  def initialize
    @template_doc = XML::Parser.file("#{File.dirname(__FILE__)}/data/template_prot.xml").parse
    @protein_summary_node = @template_doc.root
  end

  # Appends a protein group node to the root of the template document.
  #
  # @param pg_node [XML::Node] node to append
  # @return whatever the root node's << returns
  def append_protein_group(pg_node)
    @protein_summary_node << pg_node
  end

  # Saves the document, indented and UTF-8 encoded.
  #
  # @param file_path [String] destination path
  def save(file_path)
    save_options = { :indent => true, :encoding => XML::Encoding::UTF_8 }
    @template_doc.save(file_path, save_options)
  end

end
data/lib/protk/psm.rb ADDED
@@ -0,0 +1,222 @@
1
+
2
+ require 'protk/mzidentml_doc'
3
+ require 'libxml'
4
+
5
+ include LibXML
6
+
7
+
8
class String
  # Interprets the whole string as a boolean, ignoring case.
  #
  # "true", "t", "yes", "y" and "1" give true; "false", "f", "no", "n" and "0"
  # give false. The match is anchored to the entire string (\A ... \Z), so a
  # multi-line value such as "junk\ntrue" is rejected instead of matching on
  # one of its lines. A single trailing newline is still tolerated.
  #
  # @return [Boolean]
  # @raise [ArgumentError] when the string is not one of the accepted values
  def to_bool
    return true if self =~ /\A(true|t|yes|y|1)\Z/i
    return false if self =~ /\A(false|f|no|n|0)\Z/i
    raise ArgumentError, "invalid value for Boolean: \"#{self}\""
  end
end
15
+
16
# One mzIdentML <PeptideEvidence> entry: the link between a peptide and a
# protein it was found in, with the flanking residues and the decoy flag.
class PeptideEvidence
  attr_accessor :peptide_prev_aa
  attr_accessor :peptide_next_aa
  attr_accessor :protein
  attr_accessor :protein_descr
  attr_accessor :is_decoy

  # <PeptideEvidence isDecoy="false" pre="K" post="G" start="712"
  #   end="722" peptide_ref="KSPVYKVHFTR"
  #   dBSequence_ref="JEMP01000193.1_rev_g3500.t1" id="PepEv_1" />
  class << self

    # Builds an evidence object from a <PeptideEvidence> node.
    #
    # The protein accession and description come from the <DBSequence> the
    # node refers to:
    #
    # <DBSequence id="JEMP01000193.1_rev_g3500.t1"
    #   accession="JEMP01000193.1_rev_g3500.t1"
    #   searchDatabase_ref="SearchDB_1">
    #   <cvParam cvRef="PSI-MS" accession="MS:1001088"
    #     name="protein description" value="280755|283436" />
    # </DBSequence>
    #
    # @param pe_node [XML::Node] the PeptideEvidence node
    # @return [PeptideEvidence] is_decoy is false when the node carries no
    #   isDecoy attribute; protein_descr stays nil when the DBSequence has no
    #   MS:1001088 ("protein description") cvParam
    # @raise [ArgumentError] when isDecoy is present but not a boolean value
    def from_mzid(pe_node)
      pe = new
      pe.peptide_prev_aa = pe_node.attributes['pre']
      pe.peptide_next_aa = pe_node.attributes['post']

      # isDecoy is optional in mzIdentML and defaults to false, so an absent
      # attribute must not be sent to_bool (nil has no such method).
      decoy_flag = pe_node.attributes['isDecoy']
      pe.is_decoy = decoy_flag.nil? ? false : decoy_flag.to_bool

      prot_ref = pe_node.attributes['dBSequence_ref']
      prot_node = MzIdentMLDoc.find(pe_node, "DBSequence[@id='#{prot_ref}']", true)[0]
      pe.protein = prot_node.attributes['accession']

      # get_cvParam yields nothing usable when the cvParam is missing, so the
      # description is only read when it was found.
      descr_param = MzIdentMLDoc.get_cvParam(prot_node, "MS:1001088")
      pe.protein_descr = descr_param['value'] if descr_param

      pe
    end

    # Instances are only created through from_mzid.
    private :new
  end

  def initialize
  end

  # <alternative_protein protein="lcl|JEMP01000005.1_rev_g4624.t1"
  #   protein_descr="652491|654142" num_tol_term="2" peptide_prev_aa="K" peptide_next_aa="Y"/>
  # We use this only for alternative_proteins
  # The first peptide_evidence item is baked into the attributes of a spectrum_query
  #
  # Values that are nil (for example a missing protein description) are left
  # out rather than handed to the XML node.
  #
  # @return [XML::Node] the <alternative_protein> node
  def as_pepxml
    alt_node = XML::Node.new('alternative_protein')

    # Insertion order of the hash fixes the order the attributes are set in.
    {
      'protein' => protein,
      'protein_descr' => protein_descr,
      'peptide_prev_aa' => peptide_prev_aa,
      'peptide_next_aa' => peptide_next_aa
    }.each do |attr_name, attr_value|
      alt_node[attr_name] = attr_value unless attr_value.nil?
    end

    alt_node
  end

end
80
+
81
+ # <spectrum_query spectrum="mr176-BSA100fmole_BA3_01_8167.00003.00003.2" start_scan="3" end_scan="3"
82
+ #precursor_neutral_mass="1398.7082" assumed_charge="2" index="2" experiment_label="mr176">
83
+ # <search_result>
84
+ # <search_hit hit_rank="1" peptide="SQVFQLESTFDV" peptide_prev_aa="R" peptide_next_aa="K" protein="tr|Q90853|Q90853_CHICK"
85
+ # protein_descr="Homeobox protein OS=Gallus gallus GN=GH6 PE=2 SV=1" num_tot_proteins="1"
86
+ # num_matched_ions="9" tot_num_ions="22" calc_neutral_pep_mass="1380.6557" massdiff="18.053" num_tol_term="1"
87
+ # num_missed_cleavages="0" is_rejected="0">
88
+ # <search_score name="hyperscore" value="23.9"/>
89
+ # <search_score name="nextscore" value="19.3"/>
90
+ # <search_score name="bscore" value="9.6"/>
91
+ # <search_score name="yscore" value="7.6"/>
92
+ # <search_score name="cscore" value="0"/>
93
+ # <search_score name="zscore" value="0"/>
94
+ # <search_score name="ascore" value="0"/>
95
+ # <search_score name="xscore" value="0"/>
96
+ # <search_score name="expect" value="0.099"/>
97
+ # <analysis_result analysis="peptideprophet">
98
+ # <peptideprophet_result probability="0.9997" all_ntt_prob="(0.0000,0.9997,0.9999)">
99
+ # <search_score_summary>
100
+ # <parameter name="fval" value="2.3571"/>
101
+ # <parameter name="ntt" value="1"/>
102
+ # <parameter name="nmc" value="0"/>
103
+ # <parameter name="massd" value="18.053"/>
104
+ # </search_score_summary>
105
+ # </peptideprophet_result>
106
+ # </analysis_result>
107
+ # </search_hit>
108
+ # </search_result>
109
+ # </spectrum_query>
110
+
111
# A peptide-spectrum match read from mzIdentML that can be written out as a
# pepXML <search_result>.
class PSM

  attr_accessor :peptide, :calculated_mz, :experimental_mz, :charge

  # scores is declared but not assigned anywhere in this class.
  attr_accessor :scores, :peptide_evidence

  class << self

    # <SpectrumIdentificationResult spectraData_ref="ma201_Vp_1-10.mzML.mgf"
    #   spectrumID="index=3152" id="SIR_1">
    #   <SpectrumIdentificationItem passThreshold="false"
    #     rank="1" peptide_ref="KSPVYKVHFTR"
    #     calculatedMassToCharge="1360.7615466836999"
    #     experimentalMassToCharge="1362.805053710938"
    #     chargeState="1" id="SII_1_1">
    #     <PeptideEvidenceRef peptideEvidence_ref="PepEv_1" />
    #     <Fragmentation>
    #       <IonType charge="1" index="1 4">
    #         <FragmentArray measure_ref="Measure_MZ"
    #           values="175.2081208 560.3388993" />
    #         <FragmentArray measure_ref="Measure_Int"
    #           values="94.0459823608 116.2766723633" />
    #         <FragmentArray measure_ref="Measure_Error"
    #           values="0.08916864948798775 0.0449421494880653" />
    #         <cvParam cvRef="PSI-MS" accession="MS:1001220"
    #           name="frag: y ion" />
    #       </IonType>
    #     </Fragmentation>
    #     <cvParam cvRef="PSI-MS" accession="MS:1002466"
    #       name="PeptideShaker PSM score" value="0.0" />
    #     <cvParam cvRef="PSI-MS" accession="MS:1002467"
    #       name="PeptideShaker PSM confidence" value="0.0" />
    #     <cvParam cvRef="PSI-MS" accession="MS:1002052"
    #       name="MS-GF:SpecEValue" value="1.4757611E-6" />
    #     <cvParam cvRef="PSI-MS" accession="MS:1001117"
    #       name="theoretical mass" value="1360.7615466836999" />
    #     <cvParam cvRef="PSI-MS" accession="MS:1002543"
    #       name="PeptideShaker PSM confidence type"
    #       value="Not Validated" />
    #   </SpectrumIdentificationItem>
    #   <cvParam cvRef="PSI-MS" accession="MS:1000796"
    #     name="spectrum title"
    #     value="Suresh Vp 1 to 10_BAF.3535.3535.1" />
    #   <cvParam cvRef="PSI-MS" accession="MS:1000894"
    #     name="retention time" value="6855.00001" unitCvRef="UO"
    #     unitAccession="UO:0000010" unitName="seconds" />
    # </SpectrumIdentificationResult>

    # Builds a PSM from an mzIdentML node. The attributes read here
    # (calculatedMassToCharge, experimentalMassToCharge, chargeState) sit on
    # <SpectrumIdentificationItem> in the example above.
    #
    # @param psm_node [XML::Node] node describing the match
    # @return [PSM] match with peptide, evidence, m/z values and charge set
    def from_mzid(psm_node)
      match = new

      match.peptide = MzIdentMLDoc.get_sequence_for_psm(psm_node)
      match.peptide_evidence = MzIdentMLDoc.get_peptide_evidence_from_psm(psm_node).collect do |evidence_node|
        PeptideEvidence.from_mzid(evidence_node)
      end

      match.calculated_mz = psm_node.attributes['calculatedMassToCharge'].to_f
      match.experimental_mz = psm_node.attributes['experimentalMassToCharge'].to_f
      match.charge = psm_node.attributes['chargeState'].to_i

      match
    end

    # Instances are only created through from_mzid.
    private :new
  end

  def initialize
  end

  # <search_hit hit_rank="1" peptide="GGYNQDGGSGGGYQGGGGYSGGGGGYQGGQR"
  #   peptide_prev_aa="R" peptide_next_aa="N"
  #   protein="lcl|JEMP01000008.1_fwd_g5144.t1"
  #   num_tot_proteins="1"
  #   calc_neutral_pep_mass="2768.11967665812"
  #   massdiff="0.120361328125"
  #   protein_descr="4860|5785"
  #   num_tol_term="2"
  #   num_missed_cleavages="0">

  # Serialises the match as a pepXML <search_hit> wrapped in a <search_result>.
  #
  # From what I can tell, search_hit is always trivially wrapped in search_result 1:1
  #
  # The first evidence item supplies the hit's own protein attributes; every
  # further item is appended as a child via its as_pepxml.
  #
  # @return [XML::Node] the <search_result> node containing the hit
  def as_pepxml
    hit_node = XML::Node.new('search_hit')
    hit_node['peptide'] = peptide.to_s

    primary_evidence = peptide_evidence.first

    # Attribute names equal the evidence accessor names; the list order is
    # the order the attributes are set in.
    %w[peptide_prev_aa peptide_next_aa protein protein_descr].each do |field|
      hit_node[field] = primary_evidence.public_send(field)
    end

    hit_node['num_tot_proteins'] = peptide_evidence.length.to_s

    peptide_evidence.drop(1).each do |alternative|
      hit_node << alternative.as_pepxml
    end

    wrapper = XML::Node.new('search_result')
    wrapper << hit_node
    wrapper
  end

end
@@ -5,8 +5,7 @@
5
5
  # Provides common functionality used by all msms search tools.
6
6
  #
7
7
  # It allows;
8
- # 1. Specification of the search database using a simple name ... this class provides the necessary search for the actual file
9
- # 2. Output files to be specified via a prefix or suffix to be added to the name of the corresponding input file
8
+ # 1. Output files to be specified via a prefix or suffix to be added to the name of the corresponding input file
10
9
  #
11
10
 
12
11
  require 'optparse'
@@ -21,10 +20,6 @@ class SearchTool < Tool
21
20
  def initialize(option_support=[])
22
21
  super(option_support)
23
22
 
24
- # if (option_support.include? :database)
25
- # add_value_option(:database,"sphuman",['-d', '--database dbname', 'Specify the database to use for this search. Can be a named protk database or the path to a fasta file'])
26
- # end
27
-
28
23
  if ( option_support.include? :enzyme )
29
24
  add_value_option(:enzyme,"Trypsin",['--enzyme enz', 'Enzyme'])
30
25
  end
@@ -0,0 +1,35 @@
1
+
2
# Guesses the format of a spectrum file from its first lines.
class Sniffer

  # Number of leading lines inspected. This is a class-level instance
  # variable, read by the class methods below.
  @sniff_lines = 100

  # An MGF file has a line starting with "BEGIN IONS".
  MGF_PATTERN = /^BEGIN IONS/

  # An mzML file has an <mzML tag followed by the mzML namespace URI, either
  # later on the same line or, when the start tag is wrapped over several
  # lines, anywhere before the tag's closing ">".
  MZML_PATTERN = /<mzML(?:.*|[^>]*)http:\/\/psi\.hupo\.org\/ms\/mzml/

  # Return nil if undetectable
  # Return detected format otherwise
  #
  # @param filepath [String] path of the file to inspect
  # @return [String, nil] "mgf", "mzML" or nil; MGF is checked first
  def self.sniff_format(filepath)
    return "mgf" if is_mgf_format(filepath)
    return "mzML" if is_mzml_format(filepath)
    nil
  end

  # @param filepath [String] path of the file to inspect
  # @return [Boolean] true when the inspected lines look like mzML
  def self.is_mzml_format(filepath)
    !(read_head(filepath) =~ MZML_PATTERN).nil?
  end

  # @param filepath [String] path of the file to inspect
  # @return [Boolean] true when the inspected lines look like MGF
  def self.is_mgf_format(filepath)
    !(read_head(filepath) =~ MGF_PATTERN).nil?
  end

  # Reads the leading lines of the file as one string. The lines keep their
  # own terminators, so they are joined without an extra separator.
  def self.read_head(filepath)
    File.foreach(filepath).first(@sniff_lines).join
  end
  private_class_method :read_head

end