protk 1.4.3 → 1.4.4.beta1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/mzid_to_protxml.rb +7 -0
- data/lib/protk/data/template_prot.xml +0 -36
- data/lib/protk/mzidentml_doc.rb +17 -5
- data/lib/protk/peptide.rb +1 -1
- data/lib/protk/protein_summary.rb +113 -0
- data/lib/protk/protxml_writer.rb +21 -1
- metadata +4 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 6b769808b35e55e6f9c74b11704e57153e6276cc
|
|
4
|
+
data.tar.gz: ba8003573127b2912a64995a76e2204af3213851
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7de51d2b7a77625abd3f0042057a7fc689f986e8fbf30b257b4d48930983445a26c6dac23fe8b1f7e3e96855fa47152f3731a2c29443c72cac84aaf56ae751c5
|
|
7
|
+
data.tar.gz: 9a4d1fd1644cbb2e6a067173fa59a8bb99570eb4b1d4878272267b36140a01c4893568ca12a0afef77db1b1c626745f162f183ba1cd9afcaef78a0802072f605
|
data/bin/mzid_to_protxml.rb
CHANGED
|
@@ -12,6 +12,7 @@ require 'protk/constants'
|
|
|
12
12
|
require 'protk/command_runner'
|
|
13
13
|
require 'protk/mzidentml_doc'
|
|
14
14
|
require 'protk/protein_group'
|
|
15
|
+
require 'protk/protein_summary'
|
|
15
16
|
require 'protk/tool'
|
|
16
17
|
|
|
17
18
|
include LibXML
|
|
@@ -45,6 +46,10 @@ prot_xml_writer = ProtXMLWriter.new
|
|
|
45
46
|
$protk.log "Parsing MzIdentML input file" , :info
|
|
46
47
|
mzid_doc = MzIdentMLDoc.new(input_file)
|
|
47
48
|
|
|
49
|
+
header = ProteinSummary.from_mzid(mzid_doc)
|
|
50
|
+
|
|
51
|
+
prot_xml_writer.append_header(header)
|
|
52
|
+
|
|
48
53
|
protein_groups = mzid_doc.protein_groups
|
|
49
54
|
|
|
50
55
|
n_prots = protein_groups.length
|
|
@@ -77,4 +82,6 @@ end
|
|
|
77
82
|
|
|
78
83
|
$protk.log "Writing #{n_written} proteins to #{output_file_name}", :info
|
|
79
84
|
|
|
85
|
+
prot_xml_writer.append_dataset_derivation
|
|
86
|
+
|
|
80
87
|
prot_xml_writer.save(output_file_name)
|
data/lib/protk/data/template_prot.xml
CHANGED
|
@@ -1,39 +1,3 @@
|
|
|
1
1
|
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
2
|
<protein_summary xmlns="http://regis-web.systemsbiology.net/protXML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://sashimi.sourceforge.net/schema_revision/protXML/protXML_v6.xsd" summary_xml="">
|
|
3
|
-
<protein_summary_header reference_database="FULLPATH_TO_REFERENCE_DB" residue_substitution_list="I -> L" source_files="FULLPATH_TO_SOURCE_PEPXML" source_files_alt="FULLPATH_TO_SOURCE_PEPXML" min_peptide_probability="" min_peptide_weight="" num_predicted_correct_prots="" num_input_1_spectra="" num_input_2_spectra="" num_input_3_spectra="" num_input_4_spectra="" num_input_5_spectra="" initial_min_peptide_prob="" total_no_spectrum_ids="" sample_enzyme="trypsin">
|
|
4
|
-
<program_details analysis="proteinprophet" time="2014-01-20T14:17:37" version=" Insilicos_LabKey_C++ (TPP v0.0 Development trunk rev 0, Build 201307090846 (linux))">
|
|
5
|
-
<proteinprophet_details occam_flag="Y" groups_flag="Y" degen_flag="Y" nsp_flag="Y" initial_peptide_wt_iters="2" nsp_distribution_iters="2" final_peptide_wt_iters="3">
|
|
6
|
-
<nsp_information neighboring_bin_smoothing="Y">
|
|
7
|
-
<nsp_distribution bin_no="0" nsp_lower_bound_incl="0.00" nsp_upper_bound_incl="0.00" pos_freq="0.057" neg_freq="0.625" pos_to_neg_ratio="0.09"/>
|
|
8
|
-
<nsp_distribution bin_no="1" nsp_lower_bound_excl="0.00" nsp_upper_bound_incl="0.31" pos_freq="0.037" neg_freq="0.152" pos_to_neg_ratio="0.24"/>
|
|
9
|
-
<nsp_distribution bin_no="2" nsp_lower_bound_excl="0.31" nsp_upper_bound_incl="1.00" pos_freq="0.077" neg_freq="0.032" pos_to_neg_ratio="2.42"/>
|
|
10
|
-
<nsp_distribution bin_no="3" nsp_lower_bound_excl="1.00" nsp_upper_bound_incl="2.50" pos_freq="0.113" neg_freq="0.033" pos_to_neg_ratio="3.39"/>
|
|
11
|
-
<nsp_distribution bin_no="4" nsp_lower_bound_excl="2.50" nsp_upper_bound_incl="4.63" pos_freq="0.123" neg_freq="0.032" pos_to_neg_ratio="3.91"/>
|
|
12
|
-
<nsp_distribution bin_no="5" nsp_lower_bound_excl="4.63" nsp_upper_bound_incl="7.90" pos_freq="0.143" neg_freq="0.032" pos_to_neg_ratio="4.50"/>
|
|
13
|
-
<nsp_distribution bin_no="6" nsp_lower_bound_excl="7.90" nsp_upper_bound_incl="14.92" pos_freq="0.196" neg_freq="0.041" pos_to_neg_ratio="4.78"/>
|
|
14
|
-
<nsp_distribution bin_no="7" nsp_lower_bound_excl="14.92" nsp_upper_bound_excl="inf" pos_freq="0.254" neg_freq="0.054" pos_to_neg_ratio="4.72" alt_pos_to_neg_ratio="4.78"/>
|
|
15
|
-
</nsp_information>
|
|
16
|
-
<ni_information>
|
|
17
|
-
</ni_information>
|
|
18
|
-
<protein_summary_data_filter min_probability="0.00" sensitivity="1.000" false_positive_error_rate="0.835" predicted_num_correct="1787" predicted_num_incorrect="9044"/>
|
|
19
|
-
<protein_summary_data_filter min_probability="0.10" sensitivity="1.000" false_positive_error_rate="0.235" predicted_num_correct="1787" predicted_num_incorrect="548"/>
|
|
20
|
-
<protein_summary_data_filter min_probability="0.20" sensitivity="1.000" false_positive_error_rate="0.235" predicted_num_correct="1787" predicted_num_incorrect="548"/>
|
|
21
|
-
<protein_summary_data_filter min_probability="0.30" sensitivity="0.956" false_positive_error_rate="0.151" predicted_num_correct="1709" predicted_num_incorrect="305"/>
|
|
22
|
-
<protein_summary_data_filter min_probability="0.40" sensitivity="0.916" false_positive_error_rate="0.095" predicted_num_correct="1638" predicted_num_incorrect="171"/>
|
|
23
|
-
<protein_summary_data_filter min_probability="0.50" sensitivity="0.887" false_positive_error_rate="0.063" predicted_num_correct="1585" predicted_num_incorrect="106"/>
|
|
24
|
-
<protein_summary_data_filter min_probability="0.60" sensitivity="0.853" false_positive_error_rate="0.036" predicted_num_correct="1525" predicted_num_incorrect="58"/>
|
|
25
|
-
<protein_summary_data_filter min_probability="0.70" sensitivity="0.826" false_positive_error_rate="0.020" predicted_num_correct="1477" predicted_num_incorrect="31"/>
|
|
26
|
-
<protein_summary_data_filter min_probability="0.80" sensitivity="0.805" false_positive_error_rate="0.012" predicted_num_correct="1438" predicted_num_incorrect="18"/>
|
|
27
|
-
<protein_summary_data_filter min_probability="0.90" sensitivity="0.773" false_positive_error_rate="0.006" predicted_num_correct="1381" predicted_num_incorrect="8"/>
|
|
28
|
-
<protein_summary_data_filter min_probability="0.95" sensitivity="0.749" false_positive_error_rate="0.004" predicted_num_correct="1339" predicted_num_incorrect="5"/>
|
|
29
|
-
<protein_summary_data_filter min_probability="0.96" sensitivity="0.738" false_positive_error_rate="0.003" predicted_num_correct="1318" predicted_num_incorrect="4"/>
|
|
30
|
-
<protein_summary_data_filter min_probability="0.97" sensitivity="0.728" false_positive_error_rate="0.002" predicted_num_correct="1302" predicted_num_incorrect="3"/>
|
|
31
|
-
<protein_summary_data_filter min_probability="0.98" sensitivity="0.711" false_positive_error_rate="0.002" predicted_num_correct="1272" predicted_num_incorrect="2"/>
|
|
32
|
-
<protein_summary_data_filter min_probability="0.99" sensitivity="0.609" false_positive_error_rate="0.000" predicted_num_correct="1088" predicted_num_incorrect="0"/>
|
|
33
|
-
<protein_summary_data_filter min_probability="1.00" sensitivity="0.164" false_positive_error_rate="0.000" predicted_num_correct="294" predicted_num_incorrect="0"/>
|
|
34
|
-
</proteinprophet_details>
|
|
35
|
-
</program_details>
|
|
36
|
-
</protein_summary_header>
|
|
37
|
-
<dataset_derivation generation_no="0">
|
|
38
|
-
</dataset_derivation>
|
|
39
3
|
</protein_summary>
|
data/lib/protk/mzidentml_doc.rb
CHANGED
|
@@ -37,6 +37,21 @@ class MzIdentMLDoc < Object
|
|
|
37
37
|
@document=parser.parse
|
|
38
38
|
end
|
|
39
39
|
|
|
40
|
+
def source_files
|
|
41
|
+
@document.find("//#{MZID_NS_PREFIX}:SourceFile","#{MZID_NS_PREFIX}:#{MZID_NS}")
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def search_databases
|
|
45
|
+
@document.find("//#{MZID_NS_PREFIX}:SearchDatabase","#{MZID_NS_PREFIX}:#{MZID_NS}")
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
def enzymes
|
|
49
|
+
@document.find("//#{MZID_NS_PREFIX}:Enzyme","#{MZID_NS_PREFIX}:#{MZID_NS}")
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
def analysis_software
|
|
53
|
+
@document.find("//#{MZID_NS_PREFIX}:AnalysisSoftware","#{MZID_NS_PREFIX}:#{MZID_NS}")
|
|
54
|
+
end
|
|
40
55
|
|
|
41
56
|
def spectrum_queries
|
|
42
57
|
@document.find("//#{MZID_NS_PREFIX}:SpectrumIdentificationResult","#{MZID_NS_PREFIX}:#{MZID_NS}")
|
|
@@ -130,10 +145,7 @@ class MzIdentMLDoc < Object
|
|
|
130
145
|
# <SpectrumIdentificationItemRef spectrumIdentificationItem_ref="SII_1_1"/>
|
|
131
146
|
# </PeptideHypothesis>
|
|
132
147
|
def get_best_psm_for_peptide(peptide_node)
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
best_score=-1
|
|
148
|
+
best_score=nil
|
|
137
149
|
best_psm=nil
|
|
138
150
|
spectrumidrefs = self.find(peptide_node,"SpectrumIdentificationItemRef")
|
|
139
151
|
Constants.instance.log "Searching from among #{spectrumidrefs.length} for best psm" , :debug
|
|
@@ -143,7 +155,7 @@ class MzIdentMLDoc < Object
|
|
|
143
155
|
# psm_node = self.find(peptide_node,"SpectrumIdentificationItem[@id=\'#{id_ref}\']",true)[0]
|
|
144
156
|
psm_node = self.psms_cache[id_ref]
|
|
145
157
|
score = self.get_cvParam(psm_node,"MS:1002466")['value'].to_f
|
|
146
|
-
if score>best_score
|
|
158
|
+
if ( best_score == nil ) || ( score > best_score )
|
|
147
159
|
best_psm=psm_node
|
|
148
160
|
best_score=score
|
|
149
161
|
end
|
data/lib/protk/peptide.rb
CHANGED
|
@@ -49,7 +49,7 @@ class Peptide
|
|
|
49
49
|
pep=new()
|
|
50
50
|
pep.sequence=mzid_doc.get_sequence_for_peptide(xmlnode)
|
|
51
51
|
best_psm = mzid_doc.get_best_psm_for_peptide(xmlnode)
|
|
52
|
-
# require 'byebug';byebug
|
|
52
|
+
# require 'byebug';byebug if !best_psm
|
|
53
53
|
pep.probability = mzid_doc.get_cvParam(best_psm,"MS:1002466")['value'].to_f
|
|
54
54
|
pep.theoretical_neutral_mass = mzid_doc.get_cvParam(best_psm,"MS:1001117")['value'].to_f
|
|
55
55
|
pep.charge = best_psm.attributes['chargeState'].to_i
|
data/lib/protk/protein_summary.rb
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
|
|
2
|
+
require 'protk/mzidentml_doc'
|
|
3
|
+
require 'protk/protxml_writer'
|
|
4
|
+
|
|
5
|
+
include LibXML
|
|
6
|
+
|
|
7
|
+
# Represents the protein_summary node of a protXML document
|
|
8
|
+
# This is the root of the document
|
|
9
|
+
#
|
|
10
|
+
class ProteinSummary
|
|
11
|
+
|
|
12
|
+
attr_accessor :reference_database
|
|
13
|
+
attr_accessor :residue_substitution_list
|
|
14
|
+
attr_accessor :source_files
|
|
15
|
+
attr_accessor :source_files_alt
|
|
16
|
+
attr_accessor :min_peptide_probability
|
|
17
|
+
attr_accessor :min_peptide_weight
|
|
18
|
+
attr_accessor :num_predicted_correct_prots
|
|
19
|
+
attr_accessor :num_input_1_spectra
|
|
20
|
+
attr_accessor :num_input_2_spectra
|
|
21
|
+
attr_accessor :num_input_3_spectra
|
|
22
|
+
attr_accessor :num_input_4_spectra
|
|
23
|
+
attr_accessor :num_input_5_spectra
|
|
24
|
+
attr_accessor :initial_min_peptide_prob
|
|
25
|
+
attr_accessor :total_no_spectrum_ids
|
|
26
|
+
attr_accessor :sample_enzyme
|
|
27
|
+
|
|
28
|
+
attr_accessor :program_name
|
|
29
|
+
attr_accessor :analysis_time
|
|
30
|
+
attr_accessor :program_version
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class << self
|
|
34
|
+
|
|
35
|
+
def from_mzid(mzid_doc)
|
|
36
|
+
|
|
37
|
+
summary = new()
|
|
38
|
+
# Things we cant retrieve
|
|
39
|
+
summary.residue_substitution_list = ""
|
|
40
|
+
summary.min_peptide_probability = ""
|
|
41
|
+
summary.min_peptide_weight = ""
|
|
42
|
+
summary.num_predicted_correct_prots = ""
|
|
43
|
+
summary.num_input_1_spectra = ""
|
|
44
|
+
summary.num_input_2_spectra = ""
|
|
45
|
+
summary.num_input_3_spectra = ""
|
|
46
|
+
summary.num_input_4_spectra = ""
|
|
47
|
+
summary.num_input_5_spectra = ""
|
|
48
|
+
summary.initial_min_peptide_prob = ""
|
|
49
|
+
summary.total_no_spectrum_ids = ""
|
|
50
|
+
summary.analysis_time = ""
|
|
51
|
+
|
|
52
|
+
db = mzid_doc.search_databases.first
|
|
53
|
+
summary.reference_database = db.attributes['location']
|
|
54
|
+
|
|
55
|
+
summary.source_files = mzid_doc.source_files.collect { |sf| sf.attributes['location'] }
|
|
56
|
+
summary.source_files_alt = summary.source_files
|
|
57
|
+
|
|
58
|
+
summary.sample_enzyme = mzid_doc.enzymes.first.attributes['name']
|
|
59
|
+
if mzid_doc.enzymes.first.attributes['semiSpecific']=="true"
|
|
60
|
+
summary.sample_enzyme = "semi#{summary.sample_enzyme}"
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
analysis_software = mzid_doc.analysis_software.first
|
|
64
|
+
summary.program_name = analysis_software.attributes['name']
|
|
65
|
+
summary.program_version = analysis_software.attributes['version']
|
|
66
|
+
|
|
67
|
+
summary
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
private :new
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
def initialize()
|
|
75
|
+
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
def as_protxml()
|
|
79
|
+
node = XML::Node.new('protein_summary_header')
|
|
80
|
+
# node.space_preserve=true
|
|
81
|
+
node["reference_database"] = self.reference_database
|
|
82
|
+
node["min_peptide_probability"] = self.min_peptide_probability
|
|
83
|
+
node["min_peptide_weight"] = self.min_peptide_weight
|
|
84
|
+
node["num_predicted_correct_prots"] = self.num_predicted_correct_prots
|
|
85
|
+
node["num_input_1_spectra"] = self.num_input_1_spectra
|
|
86
|
+
node["num_input_2_spectra"] = self.num_input_2_spectra
|
|
87
|
+
node["num_input_3_spectra"] = self.num_input_3_spectra
|
|
88
|
+
node["num_input_4_spectra"] = self.num_input_4_spectra
|
|
89
|
+
node["num_input_5_spectra"] = self.num_input_5_spectra
|
|
90
|
+
node["initial_min_peptide_prob"] = self.initial_min_peptide_prob
|
|
91
|
+
node["total_no_spectrum_ids"] = self.total_no_spectrum_ids
|
|
92
|
+
node["sample_enzyme"] = self.sample_enzyme
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
cnode = XML::Node.new('program_details')
|
|
96
|
+
# node.space_preserve=true
|
|
97
|
+
cnode["program_name"] = self.program_name
|
|
98
|
+
cnode["analysis_time"] = self.analysis_time
|
|
99
|
+
cnode["program_version"] = self.program_version
|
|
100
|
+
# require 'byebug';byebug
|
|
101
|
+
|
|
102
|
+
node << cnode
|
|
103
|
+
|
|
104
|
+
# ddnode = XML::Node.new('dataset_derivation')
|
|
105
|
+
# ddnode["generation_no"]="0"
|
|
106
|
+
|
|
107
|
+
# node << ddnode
|
|
108
|
+
|
|
109
|
+
node
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
end
|
data/lib/protk/protxml_writer.rb
CHANGED
|
@@ -7,12 +7,22 @@ class ProtXMLWriter < Object
|
|
|
7
7
|
|
|
8
8
|
attr :template_doc
|
|
9
9
|
attr :protein_summary_node
|
|
10
|
+
XML.indent_tree_output = true
|
|
10
11
|
|
|
11
12
|
def initialize
|
|
12
13
|
template_path="#{File.dirname(__FILE__)}/data/template_prot.xml"
|
|
13
|
-
template_parser=XML::Parser.file(template_path)
|
|
14
|
+
template_parser=XML::Parser.file(template_path)#,:options => XML::Parser::Options::NOBLANKS)
|
|
14
15
|
@template_doc=template_parser.parse
|
|
15
16
|
@protein_summary_node=@template_doc.root
|
|
17
|
+
# @protein_summary_node.space_preserve=true
|
|
18
|
+
@protein_summary_node.content=""
|
|
19
|
+
puts @template_doc
|
|
20
|
+
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def append_header(header_node)
|
|
24
|
+
# require 'byebug';byebug
|
|
25
|
+
@protein_summary_node << header_node.as_protxml
|
|
16
26
|
end
|
|
17
27
|
|
|
18
28
|
def append_protein_group(pg_node)
|
|
@@ -20,7 +30,17 @@ class ProtXMLWriter < Object
|
|
|
20
30
|
@protein_summary_node << pg_node
|
|
21
31
|
end
|
|
22
32
|
|
|
33
|
+
def append_dataset_derivation()
|
|
34
|
+
ddnode = XML::Node.new('dataset_derivation')
|
|
35
|
+
ddnode["generation_no"]="0"
|
|
36
|
+
@protein_summary_node << ddnode
|
|
37
|
+
end
|
|
38
|
+
|
|
23
39
|
def save(file_path)
|
|
40
|
+
# puts XML.indent_tree_output
|
|
41
|
+
# puts "|#{XML.default_tree_indent_string}|"
|
|
42
|
+
XML.indent_tree_output = true
|
|
43
|
+
# puts @template_doc.to_s
|
|
24
44
|
@template_doc.save(file_path,:indent=>true,:encoding => XML::Encoding::UTF_8)
|
|
25
45
|
end
|
|
26
46
|
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: protk
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.4.3
|
|
4
|
+
version: 1.4.4.beta1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ira Cooke
|
|
@@ -299,6 +299,7 @@ files:
|
|
|
299
299
|
- lib/protk/prophet_tool.rb
|
|
300
300
|
- lib/protk/protein.rb
|
|
301
301
|
- lib/protk/protein_group.rb
|
|
302
|
+
- lib/protk/protein_summary.rb
|
|
302
303
|
- lib/protk/protein_to_genome_mapper.rb
|
|
303
304
|
- lib/protk/protxml_to_gff_tool.rb
|
|
304
305
|
- lib/protk/protxml_writer.rb
|
|
@@ -328,9 +329,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
328
329
|
version: '0'
|
|
329
330
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
330
331
|
requirements:
|
|
331
|
-
- - '>='
|
|
332
|
+
- - '>'
|
|
332
333
|
- !ruby/object:Gem::Version
|
|
333
|
-
version: '0'
|
|
334
|
+
version: 1.3.1
|
|
334
335
|
requirements: []
|
|
335
336
|
rubyforge_project:
|
|
336
337
|
rubygems_version: 2.2.1
|