protk 1.4.3 → 1.4.4.beta1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/mzid_to_protxml.rb +7 -0
- data/lib/protk/data/template_prot.xml +0 -36
- data/lib/protk/mzidentml_doc.rb +17 -5
- data/lib/protk/peptide.rb +1 -1
- data/lib/protk/protein_summary.rb +113 -0
- data/lib/protk/protxml_writer.rb +21 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6b769808b35e55e6f9c74b11704e57153e6276cc
|
4
|
+
data.tar.gz: ba8003573127b2912a64995a76e2204af3213851
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7de51d2b7a77625abd3f0042057a7fc689f986e8fbf30b257b4d48930983445a26c6dac23fe8b1f7e3e96855fa47152f3731a2c29443c72cac84aaf56ae751c5
|
7
|
+
data.tar.gz: 9a4d1fd1644cbb2e6a067173fa59a8bb99570eb4b1d4878272267b36140a01c4893568ca12a0afef77db1b1c626745f162f183ba1cd9afcaef78a0802072f605
|
data/bin/mzid_to_protxml.rb
CHANGED
@@ -12,6 +12,7 @@ require 'protk/constants'
|
|
12
12
|
require 'protk/command_runner'
|
13
13
|
require 'protk/mzidentml_doc'
|
14
14
|
require 'protk/protein_group'
|
15
|
+
require 'protk/protein_summary'
|
15
16
|
require 'protk/tool'
|
16
17
|
|
17
18
|
include LibXML
|
@@ -45,6 +46,10 @@ prot_xml_writer = ProtXMLWriter.new
|
|
45
46
|
$protk.log "Parsing MzIdentML input file" , :info
|
46
47
|
mzid_doc = MzIdentMLDoc.new(input_file)
|
47
48
|
|
49
|
+
header = ProteinSummary.from_mzid(mzid_doc)
|
50
|
+
|
51
|
+
prot_xml_writer.append_header(header)
|
52
|
+
|
48
53
|
protein_groups = mzid_doc.protein_groups
|
49
54
|
|
50
55
|
n_prots = protein_groups.length
|
@@ -77,4 +82,6 @@ end
|
|
77
82
|
|
78
83
|
$protk.log "Writing #{n_written} proteins to #{output_file_name}", :info
|
79
84
|
|
85
|
+
prot_xml_writer.append_dataset_derivation
|
86
|
+
|
80
87
|
prot_xml_writer.save(output_file_name)
|
@@ -1,39 +1,3 @@
|
|
1
1
|
<?xml version="1.0" encoding="UTF-8"?>
|
2
2
|
<protein_summary xmlns="http://regis-web.systemsbiology.net/protXML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://sashimi.sourceforge.net/schema_revision/protXML/protXML_v6.xsd" summary_xml="">
|
3
|
-
<protein_summary_header reference_database="FULLPATH_TO_REFERENCE_DB" residue_substitution_list="I -> L" source_files="FULLPATH_TO_SOURCE_PEPXML" source_files_alt="FULLPATH_TO_SOURCE_PEPXML" min_peptide_probability="" min_peptide_weight="" num_predicted_correct_prots="" num_input_1_spectra="" num_input_2_spectra="" num_input_3_spectra="" num_input_4_spectra="" num_input_5_spectra="" initial_min_peptide_prob="" total_no_spectrum_ids="" sample_enzyme="trypsin">
|
4
|
-
<program_details analysis="proteinprophet" time="2014-01-20T14:17:37" version=" Insilicos_LabKey_C++ (TPP v0.0 Development trunk rev 0, Build 201307090846 (linux))">
|
5
|
-
<proteinprophet_details occam_flag="Y" groups_flag="Y" degen_flag="Y" nsp_flag="Y" initial_peptide_wt_iters="2" nsp_distribution_iters="2" final_peptide_wt_iters="3">
|
6
|
-
<nsp_information neighboring_bin_smoothing="Y">
|
7
|
-
<nsp_distribution bin_no="0" nsp_lower_bound_incl="0.00" nsp_upper_bound_incl="0.00" pos_freq="0.057" neg_freq="0.625" pos_to_neg_ratio="0.09"/>
|
8
|
-
<nsp_distribution bin_no="1" nsp_lower_bound_excl="0.00" nsp_upper_bound_incl="0.31" pos_freq="0.037" neg_freq="0.152" pos_to_neg_ratio="0.24"/>
|
9
|
-
<nsp_distribution bin_no="2" nsp_lower_bound_excl="0.31" nsp_upper_bound_incl="1.00" pos_freq="0.077" neg_freq="0.032" pos_to_neg_ratio="2.42"/>
|
10
|
-
<nsp_distribution bin_no="3" nsp_lower_bound_excl="1.00" nsp_upper_bound_incl="2.50" pos_freq="0.113" neg_freq="0.033" pos_to_neg_ratio="3.39"/>
|
11
|
-
<nsp_distribution bin_no="4" nsp_lower_bound_excl="2.50" nsp_upper_bound_incl="4.63" pos_freq="0.123" neg_freq="0.032" pos_to_neg_ratio="3.91"/>
|
12
|
-
<nsp_distribution bin_no="5" nsp_lower_bound_excl="4.63" nsp_upper_bound_incl="7.90" pos_freq="0.143" neg_freq="0.032" pos_to_neg_ratio="4.50"/>
|
13
|
-
<nsp_distribution bin_no="6" nsp_lower_bound_excl="7.90" nsp_upper_bound_incl="14.92" pos_freq="0.196" neg_freq="0.041" pos_to_neg_ratio="4.78"/>
|
14
|
-
<nsp_distribution bin_no="7" nsp_lower_bound_excl="14.92" nsp_upper_bound_excl="inf" pos_freq="0.254" neg_freq="0.054" pos_to_neg_ratio="4.72" alt_pos_to_neg_ratio="4.78"/>
|
15
|
-
</nsp_information>
|
16
|
-
<ni_information>
|
17
|
-
</ni_information>
|
18
|
-
<protein_summary_data_filter min_probability="0.00" sensitivity="1.000" false_positive_error_rate="0.835" predicted_num_correct="1787" predicted_num_incorrect="9044"/>
|
19
|
-
<protein_summary_data_filter min_probability="0.10" sensitivity="1.000" false_positive_error_rate="0.235" predicted_num_correct="1787" predicted_num_incorrect="548"/>
|
20
|
-
<protein_summary_data_filter min_probability="0.20" sensitivity="1.000" false_positive_error_rate="0.235" predicted_num_correct="1787" predicted_num_incorrect="548"/>
|
21
|
-
<protein_summary_data_filter min_probability="0.30" sensitivity="0.956" false_positive_error_rate="0.151" predicted_num_correct="1709" predicted_num_incorrect="305"/>
|
22
|
-
<protein_summary_data_filter min_probability="0.40" sensitivity="0.916" false_positive_error_rate="0.095" predicted_num_correct="1638" predicted_num_incorrect="171"/>
|
23
|
-
<protein_summary_data_filter min_probability="0.50" sensitivity="0.887" false_positive_error_rate="0.063" predicted_num_correct="1585" predicted_num_incorrect="106"/>
|
24
|
-
<protein_summary_data_filter min_probability="0.60" sensitivity="0.853" false_positive_error_rate="0.036" predicted_num_correct="1525" predicted_num_incorrect="58"/>
|
25
|
-
<protein_summary_data_filter min_probability="0.70" sensitivity="0.826" false_positive_error_rate="0.020" predicted_num_correct="1477" predicted_num_incorrect="31"/>
|
26
|
-
<protein_summary_data_filter min_probability="0.80" sensitivity="0.805" false_positive_error_rate="0.012" predicted_num_correct="1438" predicted_num_incorrect="18"/>
|
27
|
-
<protein_summary_data_filter min_probability="0.90" sensitivity="0.773" false_positive_error_rate="0.006" predicted_num_correct="1381" predicted_num_incorrect="8"/>
|
28
|
-
<protein_summary_data_filter min_probability="0.95" sensitivity="0.749" false_positive_error_rate="0.004" predicted_num_correct="1339" predicted_num_incorrect="5"/>
|
29
|
-
<protein_summary_data_filter min_probability="0.96" sensitivity="0.738" false_positive_error_rate="0.003" predicted_num_correct="1318" predicted_num_incorrect="4"/>
|
30
|
-
<protein_summary_data_filter min_probability="0.97" sensitivity="0.728" false_positive_error_rate="0.002" predicted_num_correct="1302" predicted_num_incorrect="3"/>
|
31
|
-
<protein_summary_data_filter min_probability="0.98" sensitivity="0.711" false_positive_error_rate="0.002" predicted_num_correct="1272" predicted_num_incorrect="2"/>
|
32
|
-
<protein_summary_data_filter min_probability="0.99" sensitivity="0.609" false_positive_error_rate="0.000" predicted_num_correct="1088" predicted_num_incorrect="0"/>
|
33
|
-
<protein_summary_data_filter min_probability="1.00" sensitivity="0.164" false_positive_error_rate="0.000" predicted_num_correct="294" predicted_num_incorrect="0"/>
|
34
|
-
</proteinprophet_details>
|
35
|
-
</program_details>
|
36
|
-
</protein_summary_header>
|
37
|
-
<dataset_derivation generation_no="0">
|
38
|
-
</dataset_derivation>
|
39
3
|
</protein_summary>
|
data/lib/protk/mzidentml_doc.rb
CHANGED
@@ -37,6 +37,21 @@ class MzIdentMLDoc < Object
|
|
37
37
|
@document=parser.parse
|
38
38
|
end
|
39
39
|
|
40
|
+
def source_files
|
41
|
+
@document.find("//#{MZID_NS_PREFIX}:SourceFile","#{MZID_NS_PREFIX}:#{MZID_NS}")
|
42
|
+
end
|
43
|
+
|
44
|
+
def search_databases
|
45
|
+
@document.find("//#{MZID_NS_PREFIX}:SearchDatabase","#{MZID_NS_PREFIX}:#{MZID_NS}")
|
46
|
+
end
|
47
|
+
|
48
|
+
def enzymes
|
49
|
+
@document.find("//#{MZID_NS_PREFIX}:Enzyme","#{MZID_NS_PREFIX}:#{MZID_NS}")
|
50
|
+
end
|
51
|
+
|
52
|
+
def analysis_software
|
53
|
+
@document.find("//#{MZID_NS_PREFIX}:AnalysisSoftware","#{MZID_NS_PREFIX}:#{MZID_NS}")
|
54
|
+
end
|
40
55
|
|
41
56
|
def spectrum_queries
|
42
57
|
@document.find("//#{MZID_NS_PREFIX}:SpectrumIdentificationResult","#{MZID_NS_PREFIX}:#{MZID_NS}")
|
@@ -130,10 +145,7 @@ class MzIdentMLDoc < Object
|
|
130
145
|
# <SpectrumIdentificationItemRef spectrumIdentificationItem_ref="SII_1_1"/>
|
131
146
|
# </PeptideHypothesis>
|
132
147
|
def get_best_psm_for_peptide(peptide_node)
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
best_score=-1
|
148
|
+
best_score=nil
|
137
149
|
best_psm=nil
|
138
150
|
spectrumidrefs = self.find(peptide_node,"SpectrumIdentificationItemRef")
|
139
151
|
Constants.instance.log "Searching from among #{spectrumidrefs.length} for best psm" , :debug
|
@@ -143,7 +155,7 @@ class MzIdentMLDoc < Object
|
|
143
155
|
# psm_node = self.find(peptide_node,"SpectrumIdentificationItem[@id=\'#{id_ref}\']",true)[0]
|
144
156
|
psm_node = self.psms_cache[id_ref]
|
145
157
|
score = self.get_cvParam(psm_node,"MS:1002466")['value'].to_f
|
146
|
-
if score>best_score
|
158
|
+
if ( best_score == nil ) || ( score > best_score )
|
147
159
|
best_psm=psm_node
|
148
160
|
best_score=score
|
149
161
|
end
|
data/lib/protk/peptide.rb
CHANGED
@@ -49,7 +49,7 @@ class Peptide
|
|
49
49
|
pep=new()
|
50
50
|
pep.sequence=mzid_doc.get_sequence_for_peptide(xmlnode)
|
51
51
|
best_psm = mzid_doc.get_best_psm_for_peptide(xmlnode)
|
52
|
-
# require 'byebug';byebug
|
52
|
+
# require 'byebug';byebug if !best_psm
|
53
53
|
pep.probability = mzid_doc.get_cvParam(best_psm,"MS:1002466")['value'].to_f
|
54
54
|
pep.theoretical_neutral_mass = mzid_doc.get_cvParam(best_psm,"MS:1001117")['value'].to_f
|
55
55
|
pep.charge = best_psm.attributes['chargeState'].to_i
|
@@ -0,0 +1,113 @@
|
|
1
|
+
|
2
|
+
require 'protk/mzidentml_doc'
|
3
|
+
require 'protk/protxml_writer'
|
4
|
+
|
5
|
+
include LibXML
|
6
|
+
|
7
|
+
# Represents the protein_summary node of a protXML document
|
8
|
+
# This is the root of the document
|
9
|
+
#
|
10
|
+
class ProteinSummary
|
11
|
+
|
12
|
+
attr_accessor :reference_database
|
13
|
+
attr_accessor :residue_substitution_list
|
14
|
+
attr_accessor :source_files
|
15
|
+
attr_accessor :source_files_alt
|
16
|
+
attr_accessor :min_peptide_probability
|
17
|
+
attr_accessor :min_peptide_weight
|
18
|
+
attr_accessor :num_predicted_correct_prots
|
19
|
+
attr_accessor :num_input_1_spectra
|
20
|
+
attr_accessor :num_input_2_spectra
|
21
|
+
attr_accessor :num_input_3_spectra
|
22
|
+
attr_accessor :num_input_4_spectra
|
23
|
+
attr_accessor :num_input_5_spectra
|
24
|
+
attr_accessor :initial_min_peptide_prob
|
25
|
+
attr_accessor :total_no_spectrum_ids
|
26
|
+
attr_accessor :sample_enzyme
|
27
|
+
|
28
|
+
attr_accessor :program_name
|
29
|
+
attr_accessor :analysis_time
|
30
|
+
attr_accessor :program_version
|
31
|
+
|
32
|
+
|
33
|
+
class << self
|
34
|
+
|
35
|
+
def from_mzid(mzid_doc)
|
36
|
+
|
37
|
+
summary = new()
|
38
|
+
# Things we cant retrieve
|
39
|
+
summary.residue_substitution_list = ""
|
40
|
+
summary.min_peptide_probability = ""
|
41
|
+
summary.min_peptide_weight = ""
|
42
|
+
summary.num_predicted_correct_prots = ""
|
43
|
+
summary.num_input_1_spectra = ""
|
44
|
+
summary.num_input_2_spectra = ""
|
45
|
+
summary.num_input_3_spectra = ""
|
46
|
+
summary.num_input_4_spectra = ""
|
47
|
+
summary.num_input_5_spectra = ""
|
48
|
+
summary.initial_min_peptide_prob = ""
|
49
|
+
summary.total_no_spectrum_ids = ""
|
50
|
+
summary.analysis_time = ""
|
51
|
+
|
52
|
+
db = mzid_doc.search_databases.first
|
53
|
+
summary.reference_database = db.attributes['location']
|
54
|
+
|
55
|
+
summary.source_files = mzid_doc.source_files.collect { |sf| sf.attributes['location'] }
|
56
|
+
summary.source_files_alt = summary.source_files
|
57
|
+
|
58
|
+
summary.sample_enzyme = mzid_doc.enzymes.first.attributes['name']
|
59
|
+
if mzid_doc.enzymes.first.attributes['semiSpecific']=="true"
|
60
|
+
summary.sample_enzyme = "semi#{summary.sample_enzyme}"
|
61
|
+
end
|
62
|
+
|
63
|
+
analysis_software = mzid_doc.analysis_software.first
|
64
|
+
summary.program_name = analysis_software.attributes['name']
|
65
|
+
summary.program_version = analysis_software.attributes['version']
|
66
|
+
|
67
|
+
summary
|
68
|
+
end
|
69
|
+
|
70
|
+
|
71
|
+
private :new
|
72
|
+
end
|
73
|
+
|
74
|
+
def initialize()
|
75
|
+
|
76
|
+
end
|
77
|
+
|
78
|
+
def as_protxml()
|
79
|
+
node = XML::Node.new('protein_summary_header')
|
80
|
+
# node.space_preserve=true
|
81
|
+
node["reference_database"] = self.reference_database
|
82
|
+
node["min_peptide_probability"] = self.min_peptide_probability
|
83
|
+
node["min_peptide_weight"] = self.min_peptide_weight
|
84
|
+
node["num_predicted_correct_prots"] = self.num_predicted_correct_prots
|
85
|
+
node["num_input_1_spectra"] = self.num_input_1_spectra
|
86
|
+
node["num_input_2_spectra"] = self.num_input_2_spectra
|
87
|
+
node["num_input_3_spectra"] = self.num_input_3_spectra
|
88
|
+
node["num_input_4_spectra"] = self.num_input_4_spectra
|
89
|
+
node["num_input_5_spectra"] = self.num_input_5_spectra
|
90
|
+
node["initial_min_peptide_prob"] = self.initial_min_peptide_prob
|
91
|
+
node["total_no_spectrum_ids"] = self.total_no_spectrum_ids
|
92
|
+
node["sample_enzyme"] = self.sample_enzyme
|
93
|
+
|
94
|
+
|
95
|
+
cnode = XML::Node.new('program_details')
|
96
|
+
# node.space_preserve=true
|
97
|
+
cnode["program_name"] = self.program_name
|
98
|
+
cnode["analysis_time"] = self.analysis_time
|
99
|
+
cnode["program_version"] = self.program_version
|
100
|
+
# require 'byebug';byebug
|
101
|
+
|
102
|
+
node << cnode
|
103
|
+
|
104
|
+
# ddnode = XML::Node.new('dataset_derivation')
|
105
|
+
# ddnode["generation_no"]="0"
|
106
|
+
|
107
|
+
# node << ddnode
|
108
|
+
|
109
|
+
node
|
110
|
+
end
|
111
|
+
|
112
|
+
|
113
|
+
end
|
data/lib/protk/protxml_writer.rb
CHANGED
@@ -7,12 +7,22 @@ class ProtXMLWriter < Object
|
|
7
7
|
|
8
8
|
attr :template_doc
|
9
9
|
attr :protein_summary_node
|
10
|
+
XML.indent_tree_output = true
|
10
11
|
|
11
12
|
def initialize
|
12
13
|
template_path="#{File.dirname(__FILE__)}/data/template_prot.xml"
|
13
|
-
template_parser=XML::Parser.file(template_path)
|
14
|
+
template_parser=XML::Parser.file(template_path)#,:options => XML::Parser::Options::NOBLANKS)
|
14
15
|
@template_doc=template_parser.parse
|
15
16
|
@protein_summary_node=@template_doc.root
|
17
|
+
# @protein_summary_node.space_preserve=true
|
18
|
+
@protein_summary_node.content=""
|
19
|
+
puts @template_doc
|
20
|
+
|
21
|
+
end
|
22
|
+
|
23
|
+
def append_header(header_node)
|
24
|
+
# require 'byebug';byebug
|
25
|
+
@protein_summary_node << header_node.as_protxml
|
16
26
|
end
|
17
27
|
|
18
28
|
def append_protein_group(pg_node)
|
@@ -20,7 +30,17 @@ class ProtXMLWriter < Object
|
|
20
30
|
@protein_summary_node << pg_node
|
21
31
|
end
|
22
32
|
|
33
|
+
def append_dataset_derivation()
|
34
|
+
ddnode = XML::Node.new('dataset_derivation')
|
35
|
+
ddnode["generation_no"]="0"
|
36
|
+
@protein_summary_node << ddnode
|
37
|
+
end
|
38
|
+
|
23
39
|
def save(file_path)
|
40
|
+
# puts XML.indent_tree_output
|
41
|
+
# puts "|#{XML.default_tree_indent_string}|"
|
42
|
+
XML.indent_tree_output = true
|
43
|
+
# puts @template_doc.to_s
|
24
44
|
@template_doc.save(file_path,:indent=>true,:encoding => XML::Encoding::UTF_8)
|
25
45
|
end
|
26
46
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: protk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.4.3
|
4
|
+
version: 1.4.4.beta1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ira Cooke
|
@@ -299,6 +299,7 @@ files:
|
|
299
299
|
- lib/protk/prophet_tool.rb
|
300
300
|
- lib/protk/protein.rb
|
301
301
|
- lib/protk/protein_group.rb
|
302
|
+
- lib/protk/protein_summary.rb
|
302
303
|
- lib/protk/protein_to_genome_mapper.rb
|
303
304
|
- lib/protk/protxml_to_gff_tool.rb
|
304
305
|
- lib/protk/protxml_writer.rb
|
@@ -328,9 +329,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
328
329
|
version: '0'
|
329
330
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
330
331
|
requirements:
|
331
|
-
- - '
|
332
|
+
- - '>'
|
332
333
|
- !ruby/object:Gem::Version
|
333
|
-
version:
|
334
|
+
version: 1.3.1
|
334
335
|
requirements: []
|
335
336
|
rubyforge_project:
|
336
337
|
rubygems_version: 2.2.1
|