protk 1.4.3 → 1.4.4.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 31df514a2203236ea9ac25f8d5cc9c282378e04d
4
- data.tar.gz: f1bb438ef01003afc166eb5b0342dbd8f6ecd09b
3
+ metadata.gz: 6b769808b35e55e6f9c74b11704e57153e6276cc
4
+ data.tar.gz: ba8003573127b2912a64995a76e2204af3213851
5
5
  SHA512:
6
- metadata.gz: 7f7f0fe81411f17b89037162ad7bf5374be69309888bda2f84b058d69773dd1790469d40dd5797e0306ce49ecc95426cfb80b65bfcb95ac0a052be90db40ea42
7
- data.tar.gz: f2db6018ac90079e925f7c5c071be3fbedd77bc3787cfc58a91aa38048f43f60426020a3bff3a5a58c4bea6d71aaad7c0ab10437e050eaf1998792ca6ab2e1dd
6
+ metadata.gz: 7de51d2b7a77625abd3f0042057a7fc689f986e8fbf30b257b4d48930983445a26c6dac23fe8b1f7e3e96855fa47152f3731a2c29443c72cac84aaf56ae751c5
7
+ data.tar.gz: 9a4d1fd1644cbb2e6a067173fa59a8bb99570eb4b1d4878272267b36140a01c4893568ca12a0afef77db1b1c626745f162f183ba1cd9afcaef78a0802072f605
@@ -12,6 +12,7 @@ require 'protk/constants'
12
12
  require 'protk/command_runner'
13
13
  require 'protk/mzidentml_doc'
14
14
  require 'protk/protein_group'
15
+ require 'protk/protein_summary'
15
16
  require 'protk/tool'
16
17
 
17
18
  include LibXML
@@ -45,6 +46,10 @@ prot_xml_writer = ProtXMLWriter.new
45
46
  $protk.log "Parsing MzIdentML input file" , :info
46
47
  mzid_doc = MzIdentMLDoc.new(input_file)
47
48
 
49
+ header = ProteinSummary.from_mzid(mzid_doc)
50
+
51
+ prot_xml_writer.append_header(header)
52
+
48
53
  protein_groups = mzid_doc.protein_groups
49
54
 
50
55
  n_prots = protein_groups.length
@@ -77,4 +82,6 @@ end
77
82
 
78
83
  $protk.log "Writing #{n_written} proteins to #{output_file_name}", :info
79
84
 
85
+ prot_xml_writer.append_dataset_derivation
86
+
80
87
  prot_xml_writer.save(output_file_name)
@@ -1,39 +1,3 @@
1
1
  <?xml version="1.0" encoding="UTF-8"?>
2
2
  <protein_summary xmlns="http://regis-web.systemsbiology.net/protXML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://sashimi.sourceforge.net/schema_revision/protXML/protXML_v6.xsd" summary_xml="">
3
- <protein_summary_header reference_database="FULLPATH_TO_REFERENCE_DB" residue_substitution_list="I -&gt; L" source_files="FULLPATH_TO_SOURCE_PEPXML" source_files_alt="FULLPATH_TO_SOURCE_PEPXML" min_peptide_probability="" min_peptide_weight="" num_predicted_correct_prots="" num_input_1_spectra="" num_input_2_spectra="" num_input_3_spectra="" num_input_4_spectra="" num_input_5_spectra="" initial_min_peptide_prob="" total_no_spectrum_ids="" sample_enzyme="trypsin">
4
- <program_details analysis="proteinprophet" time="2014-01-20T14:17:37" version=" Insilicos_LabKey_C++ (TPP v0.0 Development trunk rev 0, Build 201307090846 (linux))">
5
- <proteinprophet_details occam_flag="Y" groups_flag="Y" degen_flag="Y" nsp_flag="Y" initial_peptide_wt_iters="2" nsp_distribution_iters="2" final_peptide_wt_iters="3">
6
- <nsp_information neighboring_bin_smoothing="Y">
7
- <nsp_distribution bin_no="0" nsp_lower_bound_incl="0.00" nsp_upper_bound_incl="0.00" pos_freq="0.057" neg_freq="0.625" pos_to_neg_ratio="0.09"/>
8
- <nsp_distribution bin_no="1" nsp_lower_bound_excl="0.00" nsp_upper_bound_incl="0.31" pos_freq="0.037" neg_freq="0.152" pos_to_neg_ratio="0.24"/>
9
- <nsp_distribution bin_no="2" nsp_lower_bound_excl="0.31" nsp_upper_bound_incl="1.00" pos_freq="0.077" neg_freq="0.032" pos_to_neg_ratio="2.42"/>
10
- <nsp_distribution bin_no="3" nsp_lower_bound_excl="1.00" nsp_upper_bound_incl="2.50" pos_freq="0.113" neg_freq="0.033" pos_to_neg_ratio="3.39"/>
11
- <nsp_distribution bin_no="4" nsp_lower_bound_excl="2.50" nsp_upper_bound_incl="4.63" pos_freq="0.123" neg_freq="0.032" pos_to_neg_ratio="3.91"/>
12
- <nsp_distribution bin_no="5" nsp_lower_bound_excl="4.63" nsp_upper_bound_incl="7.90" pos_freq="0.143" neg_freq="0.032" pos_to_neg_ratio="4.50"/>
13
- <nsp_distribution bin_no="6" nsp_lower_bound_excl="7.90" nsp_upper_bound_incl="14.92" pos_freq="0.196" neg_freq="0.041" pos_to_neg_ratio="4.78"/>
14
- <nsp_distribution bin_no="7" nsp_lower_bound_excl="14.92" nsp_upper_bound_excl="inf" pos_freq="0.254" neg_freq="0.054" pos_to_neg_ratio="4.72" alt_pos_to_neg_ratio="4.78"/>
15
- </nsp_information>
16
- <ni_information>
17
- </ni_information>
18
- <protein_summary_data_filter min_probability="0.00" sensitivity="1.000" false_positive_error_rate="0.835" predicted_num_correct="1787" predicted_num_incorrect="9044"/>
19
- <protein_summary_data_filter min_probability="0.10" sensitivity="1.000" false_positive_error_rate="0.235" predicted_num_correct="1787" predicted_num_incorrect="548"/>
20
- <protein_summary_data_filter min_probability="0.20" sensitivity="1.000" false_positive_error_rate="0.235" predicted_num_correct="1787" predicted_num_incorrect="548"/>
21
- <protein_summary_data_filter min_probability="0.30" sensitivity="0.956" false_positive_error_rate="0.151" predicted_num_correct="1709" predicted_num_incorrect="305"/>
22
- <protein_summary_data_filter min_probability="0.40" sensitivity="0.916" false_positive_error_rate="0.095" predicted_num_correct="1638" predicted_num_incorrect="171"/>
23
- <protein_summary_data_filter min_probability="0.50" sensitivity="0.887" false_positive_error_rate="0.063" predicted_num_correct="1585" predicted_num_incorrect="106"/>
24
- <protein_summary_data_filter min_probability="0.60" sensitivity="0.853" false_positive_error_rate="0.036" predicted_num_correct="1525" predicted_num_incorrect="58"/>
25
- <protein_summary_data_filter min_probability="0.70" sensitivity="0.826" false_positive_error_rate="0.020" predicted_num_correct="1477" predicted_num_incorrect="31"/>
26
- <protein_summary_data_filter min_probability="0.80" sensitivity="0.805" false_positive_error_rate="0.012" predicted_num_correct="1438" predicted_num_incorrect="18"/>
27
- <protein_summary_data_filter min_probability="0.90" sensitivity="0.773" false_positive_error_rate="0.006" predicted_num_correct="1381" predicted_num_incorrect="8"/>
28
- <protein_summary_data_filter min_probability="0.95" sensitivity="0.749" false_positive_error_rate="0.004" predicted_num_correct="1339" predicted_num_incorrect="5"/>
29
- <protein_summary_data_filter min_probability="0.96" sensitivity="0.738" false_positive_error_rate="0.003" predicted_num_correct="1318" predicted_num_incorrect="4"/>
30
- <protein_summary_data_filter min_probability="0.97" sensitivity="0.728" false_positive_error_rate="0.002" predicted_num_correct="1302" predicted_num_incorrect="3"/>
31
- <protein_summary_data_filter min_probability="0.98" sensitivity="0.711" false_positive_error_rate="0.002" predicted_num_correct="1272" predicted_num_incorrect="2"/>
32
- <protein_summary_data_filter min_probability="0.99" sensitivity="0.609" false_positive_error_rate="0.000" predicted_num_correct="1088" predicted_num_incorrect="0"/>
33
- <protein_summary_data_filter min_probability="1.00" sensitivity="0.164" false_positive_error_rate="0.000" predicted_num_correct="294" predicted_num_incorrect="0"/>
34
- </proteinprophet_details>
35
- </program_details>
36
- </protein_summary_header>
37
- <dataset_derivation generation_no="0">
38
- </dataset_derivation>
39
3
  </protein_summary>
@@ -37,6 +37,21 @@ class MzIdentMLDoc < Object
37
37
  @document=parser.parse
38
38
  end
39
39
 
40
+ def source_files
41
+ @document.find("//#{MZID_NS_PREFIX}:SourceFile","#{MZID_NS_PREFIX}:#{MZID_NS}")
42
+ end
43
+
44
+ def search_databases
45
+ @document.find("//#{MZID_NS_PREFIX}:SearchDatabase","#{MZID_NS_PREFIX}:#{MZID_NS}")
46
+ end
47
+
48
+ def enzymes
49
+ @document.find("//#{MZID_NS_PREFIX}:Enzyme","#{MZID_NS_PREFIX}:#{MZID_NS}")
50
+ end
51
+
52
+ def analysis_software
53
+ @document.find("//#{MZID_NS_PREFIX}:AnalysisSoftware","#{MZID_NS_PREFIX}:#{MZID_NS}")
54
+ end
40
55
 
41
56
  def spectrum_queries
42
57
  @document.find("//#{MZID_NS_PREFIX}:SpectrumIdentificationResult","#{MZID_NS_PREFIX}:#{MZID_NS}")
@@ -130,10 +145,7 @@ class MzIdentMLDoc < Object
130
145
  # <SpectrumIdentificationItemRef spectrumIdentificationItem_ref="SII_1_1"/>
131
146
  # </PeptideHypothesis>
132
147
  def get_best_psm_for_peptide(peptide_node)
133
-
134
-
135
-
136
- best_score=-1
148
+ best_score=nil
137
149
  best_psm=nil
138
150
  spectrumidrefs = self.find(peptide_node,"SpectrumIdentificationItemRef")
139
151
  Constants.instance.log "Searching from among #{spectrumidrefs.length} for best psm" , :debug
@@ -143,7 +155,7 @@ class MzIdentMLDoc < Object
143
155
  # psm_node = self.find(peptide_node,"SpectrumIdentificationItem[@id=\'#{id_ref}\']",true)[0]
144
156
  psm_node = self.psms_cache[id_ref]
145
157
  score = self.get_cvParam(psm_node,"MS:1002466")['value'].to_f
146
- if score>best_score
158
+ if ( best_score == nil ) || ( score > best_score )
147
159
  best_psm=psm_node
148
160
  best_score=score
149
161
  end
@@ -49,7 +49,7 @@ class Peptide
49
49
  pep=new()
50
50
  pep.sequence=mzid_doc.get_sequence_for_peptide(xmlnode)
51
51
  best_psm = mzid_doc.get_best_psm_for_peptide(xmlnode)
52
- # require 'byebug';byebug
52
+ # require 'byebug';byebug if !best_psm
53
53
  pep.probability = mzid_doc.get_cvParam(best_psm,"MS:1002466")['value'].to_f
54
54
  pep.theoretical_neutral_mass = mzid_doc.get_cvParam(best_psm,"MS:1001117")['value'].to_f
55
55
  pep.charge = best_psm.attributes['chargeState'].to_i
@@ -0,0 +1,113 @@
1
+
2
+ require 'protk/mzidentml_doc'
3
+ require 'protk/protxml_writer'
4
+
5
+ include LibXML
6
+
7
+ # Represents the protein_summary node of a protXML document
8
+ # This is the root of the document
9
+ #
10
+ class ProteinSummary
11
+
12
+ attr_accessor :reference_database
13
+ attr_accessor :residue_substitution_list
14
+ attr_accessor :source_files
15
+ attr_accessor :source_files_alt
16
+ attr_accessor :min_peptide_probability
17
+ attr_accessor :min_peptide_weight
18
+ attr_accessor :num_predicted_correct_prots
19
+ attr_accessor :num_input_1_spectra
20
+ attr_accessor :num_input_2_spectra
21
+ attr_accessor :num_input_3_spectra
22
+ attr_accessor :num_input_4_spectra
23
+ attr_accessor :num_input_5_spectra
24
+ attr_accessor :initial_min_peptide_prob
25
+ attr_accessor :total_no_spectrum_ids
26
+ attr_accessor :sample_enzyme
27
+
28
+ attr_accessor :program_name
29
+ attr_accessor :analysis_time
30
+ attr_accessor :program_version
31
+
32
+
33
+ class << self
34
+
35
+ def from_mzid(mzid_doc)
36
+
37
+ summary = new()
38
+ # Things we cant retrieve
39
+ summary.residue_substitution_list = ""
40
+ summary.min_peptide_probability = ""
41
+ summary.min_peptide_weight = ""
42
+ summary.num_predicted_correct_prots = ""
43
+ summary.num_input_1_spectra = ""
44
+ summary.num_input_2_spectra = ""
45
+ summary.num_input_3_spectra = ""
46
+ summary.num_input_4_spectra = ""
47
+ summary.num_input_5_spectra = ""
48
+ summary.initial_min_peptide_prob = ""
49
+ summary.total_no_spectrum_ids = ""
50
+ summary.analysis_time = ""
51
+
52
+ db = mzid_doc.search_databases.first
53
+ summary.reference_database = db.attributes['location']
54
+
55
+ summary.source_files = mzid_doc.source_files.collect { |sf| sf.attributes['location'] }
56
+ summary.source_files_alt = summary.source_files
57
+
58
+ summary.sample_enzyme = mzid_doc.enzymes.first.attributes['name']
59
+ if mzid_doc.enzymes.first.attributes['semiSpecific']=="true"
60
+ summary.sample_enzyme = "semi#{summary.sample_enzyme}"
61
+ end
62
+
63
+ analysis_software = mzid_doc.analysis_software.first
64
+ summary.program_name = analysis_software.attributes['name']
65
+ summary.program_version = analysis_software.attributes['version']
66
+
67
+ summary
68
+ end
69
+
70
+
71
+ private :new
72
+ end
73
+
74
+ def initialize()
75
+
76
+ end
77
+
78
+ def as_protxml()
79
+ node = XML::Node.new('protein_summary_header')
80
+ # node.space_preserve=true
81
+ node["reference_database"] = self.reference_database
82
+ node["min_peptide_probability"] = self.min_peptide_probability
83
+ node["min_peptide_weight"] = self.min_peptide_weight
84
+ node["num_predicted_correct_prots"] = self.num_predicted_correct_prots
85
+ node["num_input_1_spectra"] = self.num_input_1_spectra
86
+ node["num_input_2_spectra"] = self.num_input_2_spectra
87
+ node["num_input_3_spectra"] = self.num_input_3_spectra
88
+ node["num_input_4_spectra"] = self.num_input_4_spectra
89
+ node["num_input_5_spectra"] = self.num_input_5_spectra
90
+ node["initial_min_peptide_prob"] = self.initial_min_peptide_prob
91
+ node["total_no_spectrum_ids"] = self.total_no_spectrum_ids
92
+ node["sample_enzyme"] = self.sample_enzyme
93
+
94
+
95
+ cnode = XML::Node.new('program_details')
96
+ # node.space_preserve=true
97
+ cnode["program_name"] = self.program_name
98
+ cnode["analysis_time"] = self.analysis_time
99
+ cnode["program_version"] = self.program_version
100
+ # require 'byebug';byebug
101
+
102
+ node << cnode
103
+
104
+ # ddnode = XML::Node.new('dataset_derivation')
105
+ # ddnode["generation_no"]="0"
106
+
107
+ # node << ddnode
108
+
109
+ node
110
+ end
111
+
112
+
113
+ end
@@ -7,12 +7,22 @@ class ProtXMLWriter < Object
7
7
 
8
8
  attr :template_doc
9
9
  attr :protein_summary_node
10
+ XML.indent_tree_output = true
10
11
 
11
12
  def initialize
12
13
  template_path="#{File.dirname(__FILE__)}/data/template_prot.xml"
13
- template_parser=XML::Parser.file(template_path)
14
+ template_parser=XML::Parser.file(template_path)#,:options => XML::Parser::Options::NOBLANKS)
14
15
  @template_doc=template_parser.parse
15
16
  @protein_summary_node=@template_doc.root
17
+ # @protein_summary_node.space_preserve=true
18
+ @protein_summary_node.content=""
19
+ puts @template_doc
20
+
21
+ end
22
+
23
+ def append_header(header_node)
24
+ # require 'byebug';byebug
25
+ @protein_summary_node << header_node.as_protxml
16
26
  end
17
27
 
18
28
  def append_protein_group(pg_node)
@@ -20,7 +30,17 @@ class ProtXMLWriter < Object
20
30
  @protein_summary_node << pg_node
21
31
  end
22
32
 
33
+ def append_dataset_derivation()
34
+ ddnode = XML::Node.new('dataset_derivation')
35
+ ddnode["generation_no"]="0"
36
+ @protein_summary_node << ddnode
37
+ end
38
+
23
39
  def save(file_path)
40
+ # puts XML.indent_tree_output
41
+ # puts "|#{XML.default_tree_indent_string}|"
42
+ XML.indent_tree_output = true
43
+ # puts @template_doc.to_s
24
44
  @template_doc.save(file_path,:indent=>true,:encoding => XML::Encoding::UTF_8)
25
45
  end
26
46
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: protk
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.3
4
+ version: 1.4.4.beta1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ira Cooke
@@ -299,6 +299,7 @@ files:
299
299
  - lib/protk/prophet_tool.rb
300
300
  - lib/protk/protein.rb
301
301
  - lib/protk/protein_group.rb
302
+ - lib/protk/protein_summary.rb
302
303
  - lib/protk/protein_to_genome_mapper.rb
303
304
  - lib/protk/protxml_to_gff_tool.rb
304
305
  - lib/protk/protxml_writer.rb
@@ -328,9 +329,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
328
329
  version: '0'
329
330
  required_rubygems_version: !ruby/object:Gem::Requirement
330
331
  requirements:
331
- - - '>='
332
+ - - '>'
332
333
  - !ruby/object:Gem::Version
333
- version: '0'
334
+ version: 1.3.1
334
335
  requirements: []
335
336
  rubyforge_project:
336
337
  rubygems_version: 2.2.1