protk 1.4.3 → 1.4.4.beta1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 31df514a2203236ea9ac25f8d5cc9c282378e04d
4
- data.tar.gz: f1bb438ef01003afc166eb5b0342dbd8f6ecd09b
3
+ metadata.gz: 6b769808b35e55e6f9c74b11704e57153e6276cc
4
+ data.tar.gz: ba8003573127b2912a64995a76e2204af3213851
5
5
  SHA512:
6
- metadata.gz: 7f7f0fe81411f17b89037162ad7bf5374be69309888bda2f84b058d69773dd1790469d40dd5797e0306ce49ecc95426cfb80b65bfcb95ac0a052be90db40ea42
7
- data.tar.gz: f2db6018ac90079e925f7c5c071be3fbedd77bc3787cfc58a91aa38048f43f60426020a3bff3a5a58c4bea6d71aaad7c0ab10437e050eaf1998792ca6ab2e1dd
6
+ metadata.gz: 7de51d2b7a77625abd3f0042057a7fc689f986e8fbf30b257b4d48930983445a26c6dac23fe8b1f7e3e96855fa47152f3731a2c29443c72cac84aaf56ae751c5
7
+ data.tar.gz: 9a4d1fd1644cbb2e6a067173fa59a8bb99570eb4b1d4878272267b36140a01c4893568ca12a0afef77db1b1c626745f162f183ba1cd9afcaef78a0802072f605
@@ -12,6 +12,7 @@ require 'protk/constants'
12
12
  require 'protk/command_runner'
13
13
  require 'protk/mzidentml_doc'
14
14
  require 'protk/protein_group'
15
+ require 'protk/protein_summary'
15
16
  require 'protk/tool'
16
17
 
17
18
  include LibXML
@@ -45,6 +46,10 @@ prot_xml_writer = ProtXMLWriter.new
45
46
  $protk.log "Parsing MzIdentML input file" , :info
46
47
  mzid_doc = MzIdentMLDoc.new(input_file)
47
48
 
49
+ header = ProteinSummary.from_mzid(mzid_doc)
50
+
51
+ prot_xml_writer.append_header(header)
52
+
48
53
  protein_groups = mzid_doc.protein_groups
49
54
 
50
55
  n_prots = protein_groups.length
@@ -77,4 +82,6 @@ end
77
82
 
78
83
  $protk.log "Writing #{n_written} proteins to #{output_file_name}", :info
79
84
 
85
+ prot_xml_writer.append_dataset_derivation
86
+
80
87
  prot_xml_writer.save(output_file_name)
@@ -1,39 +1,3 @@
1
1
  <?xml version="1.0" encoding="UTF-8"?>
2
2
  <protein_summary xmlns="http://regis-web.systemsbiology.net/protXML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://sashimi.sourceforge.net/schema_revision/protXML/protXML_v6.xsd" summary_xml="">
3
- <protein_summary_header reference_database="FULLPATH_TO_REFERENCE_DB" residue_substitution_list="I -&gt; L" source_files="FULLPATH_TO_SOURCE_PEPXML" source_files_alt="FULLPATH_TO_SOURCE_PEPXML" min_peptide_probability="" min_peptide_weight="" num_predicted_correct_prots="" num_input_1_spectra="" num_input_2_spectra="" num_input_3_spectra="" num_input_4_spectra="" num_input_5_spectra="" initial_min_peptide_prob="" total_no_spectrum_ids="" sample_enzyme="trypsin">
4
- <program_details analysis="proteinprophet" time="2014-01-20T14:17:37" version=" Insilicos_LabKey_C++ (TPP v0.0 Development trunk rev 0, Build 201307090846 (linux))">
5
- <proteinprophet_details occam_flag="Y" groups_flag="Y" degen_flag="Y" nsp_flag="Y" initial_peptide_wt_iters="2" nsp_distribution_iters="2" final_peptide_wt_iters="3">
6
- <nsp_information neighboring_bin_smoothing="Y">
7
- <nsp_distribution bin_no="0" nsp_lower_bound_incl="0.00" nsp_upper_bound_incl="0.00" pos_freq="0.057" neg_freq="0.625" pos_to_neg_ratio="0.09"/>
8
- <nsp_distribution bin_no="1" nsp_lower_bound_excl="0.00" nsp_upper_bound_incl="0.31" pos_freq="0.037" neg_freq="0.152" pos_to_neg_ratio="0.24"/>
9
- <nsp_distribution bin_no="2" nsp_lower_bound_excl="0.31" nsp_upper_bound_incl="1.00" pos_freq="0.077" neg_freq="0.032" pos_to_neg_ratio="2.42"/>
10
- <nsp_distribution bin_no="3" nsp_lower_bound_excl="1.00" nsp_upper_bound_incl="2.50" pos_freq="0.113" neg_freq="0.033" pos_to_neg_ratio="3.39"/>
11
- <nsp_distribution bin_no="4" nsp_lower_bound_excl="2.50" nsp_upper_bound_incl="4.63" pos_freq="0.123" neg_freq="0.032" pos_to_neg_ratio="3.91"/>
12
- <nsp_distribution bin_no="5" nsp_lower_bound_excl="4.63" nsp_upper_bound_incl="7.90" pos_freq="0.143" neg_freq="0.032" pos_to_neg_ratio="4.50"/>
13
- <nsp_distribution bin_no="6" nsp_lower_bound_excl="7.90" nsp_upper_bound_incl="14.92" pos_freq="0.196" neg_freq="0.041" pos_to_neg_ratio="4.78"/>
14
- <nsp_distribution bin_no="7" nsp_lower_bound_excl="14.92" nsp_upper_bound_excl="inf" pos_freq="0.254" neg_freq="0.054" pos_to_neg_ratio="4.72" alt_pos_to_neg_ratio="4.78"/>
15
- </nsp_information>
16
- <ni_information>
17
- </ni_information>
18
- <protein_summary_data_filter min_probability="0.00" sensitivity="1.000" false_positive_error_rate="0.835" predicted_num_correct="1787" predicted_num_incorrect="9044"/>
19
- <protein_summary_data_filter min_probability="0.10" sensitivity="1.000" false_positive_error_rate="0.235" predicted_num_correct="1787" predicted_num_incorrect="548"/>
20
- <protein_summary_data_filter min_probability="0.20" sensitivity="1.000" false_positive_error_rate="0.235" predicted_num_correct="1787" predicted_num_incorrect="548"/>
21
- <protein_summary_data_filter min_probability="0.30" sensitivity="0.956" false_positive_error_rate="0.151" predicted_num_correct="1709" predicted_num_incorrect="305"/>
22
- <protein_summary_data_filter min_probability="0.40" sensitivity="0.916" false_positive_error_rate="0.095" predicted_num_correct="1638" predicted_num_incorrect="171"/>
23
- <protein_summary_data_filter min_probability="0.50" sensitivity="0.887" false_positive_error_rate="0.063" predicted_num_correct="1585" predicted_num_incorrect="106"/>
24
- <protein_summary_data_filter min_probability="0.60" sensitivity="0.853" false_positive_error_rate="0.036" predicted_num_correct="1525" predicted_num_incorrect="58"/>
25
- <protein_summary_data_filter min_probability="0.70" sensitivity="0.826" false_positive_error_rate="0.020" predicted_num_correct="1477" predicted_num_incorrect="31"/>
26
- <protein_summary_data_filter min_probability="0.80" sensitivity="0.805" false_positive_error_rate="0.012" predicted_num_correct="1438" predicted_num_incorrect="18"/>
27
- <protein_summary_data_filter min_probability="0.90" sensitivity="0.773" false_positive_error_rate="0.006" predicted_num_correct="1381" predicted_num_incorrect="8"/>
28
- <protein_summary_data_filter min_probability="0.95" sensitivity="0.749" false_positive_error_rate="0.004" predicted_num_correct="1339" predicted_num_incorrect="5"/>
29
- <protein_summary_data_filter min_probability="0.96" sensitivity="0.738" false_positive_error_rate="0.003" predicted_num_correct="1318" predicted_num_incorrect="4"/>
30
- <protein_summary_data_filter min_probability="0.97" sensitivity="0.728" false_positive_error_rate="0.002" predicted_num_correct="1302" predicted_num_incorrect="3"/>
31
- <protein_summary_data_filter min_probability="0.98" sensitivity="0.711" false_positive_error_rate="0.002" predicted_num_correct="1272" predicted_num_incorrect="2"/>
32
- <protein_summary_data_filter min_probability="0.99" sensitivity="0.609" false_positive_error_rate="0.000" predicted_num_correct="1088" predicted_num_incorrect="0"/>
33
- <protein_summary_data_filter min_probability="1.00" sensitivity="0.164" false_positive_error_rate="0.000" predicted_num_correct="294" predicted_num_incorrect="0"/>
34
- </proteinprophet_details>
35
- </program_details>
36
- </protein_summary_header>
37
- <dataset_derivation generation_no="0">
38
- </dataset_derivation>
39
3
  </protein_summary>
@@ -37,6 +37,21 @@ class MzIdentMLDoc < Object
37
37
  @document=parser.parse
38
38
  end
39
39
 
40
+ def source_files
41
+ @document.find("//#{MZID_NS_PREFIX}:SourceFile","#{MZID_NS_PREFIX}:#{MZID_NS}")
42
+ end
43
+
44
+ def search_databases
45
+ @document.find("//#{MZID_NS_PREFIX}:SearchDatabase","#{MZID_NS_PREFIX}:#{MZID_NS}")
46
+ end
47
+
48
+ def enzymes
49
+ @document.find("//#{MZID_NS_PREFIX}:Enzyme","#{MZID_NS_PREFIX}:#{MZID_NS}")
50
+ end
51
+
52
+ def analysis_software
53
+ @document.find("//#{MZID_NS_PREFIX}:AnalysisSoftware","#{MZID_NS_PREFIX}:#{MZID_NS}")
54
+ end
40
55
 
41
56
  def spectrum_queries
42
57
  @document.find("//#{MZID_NS_PREFIX}:SpectrumIdentificationResult","#{MZID_NS_PREFIX}:#{MZID_NS}")
@@ -130,10 +145,7 @@ class MzIdentMLDoc < Object
130
145
  # <SpectrumIdentificationItemRef spectrumIdentificationItem_ref="SII_1_1"/>
131
146
  # </PeptideHypothesis>
132
147
  def get_best_psm_for_peptide(peptide_node)
133
-
134
-
135
-
136
- best_score=-1
148
+ best_score=nil
137
149
  best_psm=nil
138
150
  spectrumidrefs = self.find(peptide_node,"SpectrumIdentificationItemRef")
139
151
  Constants.instance.log "Searching from among #{spectrumidrefs.length} for best psm" , :debug
@@ -143,7 +155,7 @@ class MzIdentMLDoc < Object
143
155
  # psm_node = self.find(peptide_node,"SpectrumIdentificationItem[@id=\'#{id_ref}\']",true)[0]
144
156
  psm_node = self.psms_cache[id_ref]
145
157
  score = self.get_cvParam(psm_node,"MS:1002466")['value'].to_f
146
- if score>best_score
158
+ if ( best_score == nil ) || ( score > best_score )
147
159
  best_psm=psm_node
148
160
  best_score=score
149
161
  end
@@ -49,7 +49,7 @@ class Peptide
49
49
  pep=new()
50
50
  pep.sequence=mzid_doc.get_sequence_for_peptide(xmlnode)
51
51
  best_psm = mzid_doc.get_best_psm_for_peptide(xmlnode)
52
- # require 'byebug';byebug
52
+ # require 'byebug';byebug if !best_psm
53
53
  pep.probability = mzid_doc.get_cvParam(best_psm,"MS:1002466")['value'].to_f
54
54
  pep.theoretical_neutral_mass = mzid_doc.get_cvParam(best_psm,"MS:1001117")['value'].to_f
55
55
  pep.charge = best_psm.attributes['chargeState'].to_i
@@ -0,0 +1,113 @@
1
+
2
+ require 'protk/mzidentml_doc'
3
+ require 'protk/protxml_writer'
4
+
5
+ include LibXML
6
+
7
+ # Represents the protein_summary node of a protXML document
8
+ # This is the root of the document
9
+ #
10
+ class ProteinSummary
11
+
12
+ attr_accessor :reference_database
13
+ attr_accessor :residue_substitution_list
14
+ attr_accessor :source_files
15
+ attr_accessor :source_files_alt
16
+ attr_accessor :min_peptide_probability
17
+ attr_accessor :min_peptide_weight
18
+ attr_accessor :num_predicted_correct_prots
19
+ attr_accessor :num_input_1_spectra
20
+ attr_accessor :num_input_2_spectra
21
+ attr_accessor :num_input_3_spectra
22
+ attr_accessor :num_input_4_spectra
23
+ attr_accessor :num_input_5_spectra
24
+ attr_accessor :initial_min_peptide_prob
25
+ attr_accessor :total_no_spectrum_ids
26
+ attr_accessor :sample_enzyme
27
+
28
+ attr_accessor :program_name
29
+ attr_accessor :analysis_time
30
+ attr_accessor :program_version
31
+
32
+
33
+ class << self
34
+
35
+ def from_mzid(mzid_doc)
36
+
37
+ summary = new()
38
+ # Things we cant retrieve
39
+ summary.residue_substitution_list = ""
40
+ summary.min_peptide_probability = ""
41
+ summary.min_peptide_weight = ""
42
+ summary.num_predicted_correct_prots = ""
43
+ summary.num_input_1_spectra = ""
44
+ summary.num_input_2_spectra = ""
45
+ summary.num_input_3_spectra = ""
46
+ summary.num_input_4_spectra = ""
47
+ summary.num_input_5_spectra = ""
48
+ summary.initial_min_peptide_prob = ""
49
+ summary.total_no_spectrum_ids = ""
50
+ summary.analysis_time = ""
51
+
52
+ db = mzid_doc.search_databases.first
53
+ summary.reference_database = db.attributes['location']
54
+
55
+ summary.source_files = mzid_doc.source_files.collect { |sf| sf.attributes['location'] }
56
+ summary.source_files_alt = summary.source_files
57
+
58
+ summary.sample_enzyme = mzid_doc.enzymes.first.attributes['name']
59
+ if mzid_doc.enzymes.first.attributes['semiSpecific']=="true"
60
+ summary.sample_enzyme = "semi#{summary.sample_enzyme}"
61
+ end
62
+
63
+ analysis_software = mzid_doc.analysis_software.first
64
+ summary.program_name = analysis_software.attributes['name']
65
+ summary.program_version = analysis_software.attributes['version']
66
+
67
+ summary
68
+ end
69
+
70
+
71
+ private :new
72
+ end
73
+
74
+ def initialize()
75
+
76
+ end
77
+
78
+ def as_protxml()
79
+ node = XML::Node.new('protein_summary_header')
80
+ # node.space_preserve=true
81
+ node["reference_database"] = self.reference_database
82
+ node["min_peptide_probability"] = self.min_peptide_probability
83
+ node["min_peptide_weight"] = self.min_peptide_weight
84
+ node["num_predicted_correct_prots"] = self.num_predicted_correct_prots
85
+ node["num_input_1_spectra"] = self.num_input_1_spectra
86
+ node["num_input_2_spectra"] = self.num_input_2_spectra
87
+ node["num_input_3_spectra"] = self.num_input_3_spectra
88
+ node["num_input_4_spectra"] = self.num_input_4_spectra
89
+ node["num_input_5_spectra"] = self.num_input_5_spectra
90
+ node["initial_min_peptide_prob"] = self.initial_min_peptide_prob
91
+ node["total_no_spectrum_ids"] = self.total_no_spectrum_ids
92
+ node["sample_enzyme"] = self.sample_enzyme
93
+
94
+
95
+ cnode = XML::Node.new('program_details')
96
+ # node.space_preserve=true
97
+ cnode["program_name"] = self.program_name
98
+ cnode["analysis_time"] = self.analysis_time
99
+ cnode["program_version"] = self.program_version
100
+ # require 'byebug';byebug
101
+
102
+ node << cnode
103
+
104
+ # ddnode = XML::Node.new('dataset_derivation')
105
+ # ddnode["generation_no"]="0"
106
+
107
+ # node << ddnode
108
+
109
+ node
110
+ end
111
+
112
+
113
+ end
@@ -7,12 +7,22 @@ class ProtXMLWriter < Object
7
7
 
8
8
  attr :template_doc
9
9
  attr :protein_summary_node
10
+ XML.indent_tree_output = true
10
11
 
11
12
  def initialize
12
13
  template_path="#{File.dirname(__FILE__)}/data/template_prot.xml"
13
- template_parser=XML::Parser.file(template_path)
14
+ template_parser=XML::Parser.file(template_path)#,:options => XML::Parser::Options::NOBLANKS)
14
15
  @template_doc=template_parser.parse
15
16
  @protein_summary_node=@template_doc.root
17
+ # @protein_summary_node.space_preserve=true
18
+ @protein_summary_node.content=""
19
+ puts @template_doc
20
+
21
+ end
22
+
23
+ def append_header(header_node)
24
+ # require 'byebug';byebug
25
+ @protein_summary_node << header_node.as_protxml
16
26
  end
17
27
 
18
28
  def append_protein_group(pg_node)
@@ -20,7 +30,17 @@ class ProtXMLWriter < Object
20
30
  @protein_summary_node << pg_node
21
31
  end
22
32
 
33
+ def append_dataset_derivation()
34
+ ddnode = XML::Node.new('dataset_derivation')
35
+ ddnode["generation_no"]="0"
36
+ @protein_summary_node << ddnode
37
+ end
38
+
23
39
  def save(file_path)
40
+ # puts XML.indent_tree_output
41
+ # puts "|#{XML.default_tree_indent_string}|"
42
+ XML.indent_tree_output = true
43
+ # puts @template_doc.to_s
24
44
  @template_doc.save(file_path,:indent=>true,:encoding => XML::Encoding::UTF_8)
25
45
  end
26
46
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: protk
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.3
4
+ version: 1.4.4.beta1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ira Cooke
@@ -299,6 +299,7 @@ files:
299
299
  - lib/protk/prophet_tool.rb
300
300
  - lib/protk/protein.rb
301
301
  - lib/protk/protein_group.rb
302
+ - lib/protk/protein_summary.rb
302
303
  - lib/protk/protein_to_genome_mapper.rb
303
304
  - lib/protk/protxml_to_gff_tool.rb
304
305
  - lib/protk/protxml_writer.rb
@@ -328,9 +329,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
328
329
  version: '0'
329
330
  required_rubygems_version: !ruby/object:Gem::Requirement
330
331
  requirements:
331
- - - '>='
332
+ - - '>'
332
333
  - !ruby/object:Gem::Version
333
- version: '0'
334
+ version: 1.3.1
334
335
  requirements: []
335
336
  rubyforge_project:
336
337
  rubygems_version: 2.2.1