divvy_proteomics 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +6 -6
- data/README.md +14 -4
- data/VERSION +1 -1
- data/bin/divvy_spectra +28 -15
- data/divvy_proteomics.gemspec +30 -22
- data/lib/divvy_proteomics.rb +16 -0
- data/lib/divvyable_protein.rb +40 -0
- data/lib/dta_select_output.rb +12 -50
- data/lib/pep_xml.rb +130 -0
- data/spec/data/contaminant.pep.xml +13 -0
- data/spec/data/minimal.pep.xml +14 -0
- data/spec/data/minimal2.pep.xml +12 -0
- data/spec/data/minimal3.pep.xml +21 -0
- data/spec/divvy_proteomics_spec.rb +0 -2
- data/spec/pep_xml_spec.rb +99 -0
- data/spec/spec_helper.rb +3 -1
- metadata +36 -29
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 57c4b7c64ec34ec42b4b28f6a31537a8901d5a7c
|
4
|
+
data.tar.gz: c216e563aa04c13e7b935852862994579e45d61f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7a1e7dd2be6565d9503ce1e5fc43cb2362920fa8146f66d7c179c0a812de5e1ea2be9f42d70f63bb2bf3811e44f5800f01e4bbbb8731a4365d129c01c138edae
|
7
|
+
data.tar.gz: 846cf30c524054c62205de65aba81d37bdd1f2857cec1e623730d167efa6eacc71491a999806e73547b50d0c36a4b87993245d20217d47090c8e3da6eecf31b3
|
data/Gemfile
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
source "http://rubygems.org"
|
2
2
|
|
3
|
-
gem 'bio-logger', "
|
3
|
+
gem 'bio-logger', "~> 1.0"
|
4
4
|
|
5
5
|
# Add dependencies to develop your gem here.
|
6
6
|
# Include everything needed to run rake, tests, features, etc.
|
7
7
|
group :development do
|
8
|
-
gem 'systemu', "
|
9
|
-
gem "rspec", "
|
10
|
-
gem "rdoc", "
|
11
|
-
gem "bundler", "
|
12
|
-
gem "jeweler", "
|
8
|
+
gem 'systemu', "~> 2.6"
|
9
|
+
gem "rspec", "~> 2.14"
|
10
|
+
gem "rdoc", "~> 3.12"
|
11
|
+
gem "bundler", "~> 1.5"
|
12
|
+
gem "jeweler", "~> 2.0"
|
13
13
|
end
|
data/README.md
CHANGED
@@ -13,20 +13,30 @@ $ gem install divvy_spectra
|
|
13
13
|
$ divvy_spectra <DTASelectFile>
|
14
14
|
```
|
15
15
|
Output is a table, with a row for each protein with a few columns, including number of unique spectra and the
|
16
|
-
estimated number of spectral counts after sorting out the non-uniqueness.
|
16
|
+
estimated number of spectral counts after sorting out the non-uniqueness. Using the ```--pep-xml``` flag, PepXML files
|
17
|
+
can also be used as input:
|
18
|
+
|
19
|
+
```
|
20
|
+
$ divvy_spectra --pep-xml <PepXML_file>
|
21
|
+
```
|
17
22
|
|
18
23
|
Full usage information:
|
19
24
|
```
|
20
|
-
$ divvy_spectra -h
|
21
25
|
|
22
|
-
Usage: divvy_spectra [options] <
|
26
|
+
Usage: divvy_spectra [options] <input_file>
|
23
27
|
|
24
|
-
Takes a tab separated file containing a (possibly modified) output from a DTAselect run, and use some algorithm to divy up the spectra that match multiple peptides.
|
28
|
+
Takes a tab separated file containing a (possibly modified) output from a DTAselect run (or a pepXML file and add the flag --pep-xml), and use some algorithm to divy up the spectra that match multiple peptides.
|
25
29
|
|
26
30
|
--merge-proteins FILE_OF_IDENTIFIERS
|
27
31
|
Provide a space/tab separated file where the identifiers on each row should be treated as one protein
|
28
32
|
--whitelist FILE_OF_PROTEINS_TO_REPORT
|
29
33
|
Only report proteins that are in this whitelist, after divvying with everything
|
34
|
+
--contaminant-regexes REGEXES
|
35
|
+
Comma-separated list of regular expressions to apply to protein names. If the protein name matches then all spectra assigned to that protein are considered contaminants. [default: ]
|
36
|
+
|
37
|
+
Optional arguments:
|
38
|
+
|
39
|
+
--pep-xml Input file is pep XML, rather than a DTA select output file [default: false]
|
30
40
|
|
31
41
|
Verbosity:
|
32
42
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.3.0
|
data/bin/divvy_spectra
CHANGED
@@ -5,23 +5,24 @@ require 'bio-logger'
|
|
5
5
|
require 'pp'
|
6
6
|
require 'set'
|
7
7
|
|
8
|
-
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME =
|
8
|
+
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = 'divvy_proteomics'
|
9
9
|
|
10
10
|
rootpath = File.dirname(File.dirname(__FILE__))
|
11
11
|
$: << File.join(rootpath,'lib')
|
12
|
-
require '
|
12
|
+
require 'divvy_proteomics'
|
13
13
|
|
14
14
|
# Parse command line options into the options hash
|
15
15
|
options = {
|
16
16
|
:logger => 'stderr',
|
17
17
|
:log_level => 'info',
|
18
18
|
:contaminant_regexes => [/^CNTM:/],
|
19
|
+
:input_is_pep_xml => false,
|
19
20
|
}
|
20
21
|
o = OptionParser.new do |opts|
|
21
22
|
opts.banner = "
|
22
|
-
Usage: #{SCRIPT_NAME} [options] <
|
23
|
+
Usage: #{SCRIPT_NAME} [options] <input_file>
|
23
24
|
|
24
|
-
Takes a tab separated file containing a (possibly modified) output from a DTAselect run, and use some algorithm to divy up the spectra that match multiple peptides.\n\n"
|
25
|
+
Takes a tab separated file containing a (possibly modified) output from a DTAselect run (or a pepXML file and add the flag --pep-xml), and use some algorithm to divy up the spectra that match multiple peptides.\n\n"
|
25
26
|
|
26
27
|
opts.on("--merge-proteins FILE_OF_IDENTIFIERS", "Provide a space/tab separated file where the identifiers on each row should be treated as one protein") do |file|
|
27
28
|
options[:merge_proteins_file] = file
|
@@ -32,6 +33,10 @@ o = OptionParser.new do |opts|
|
|
32
33
|
opts.on("--contaminant-regexes REGEXES", "Comma-separated list of regular expressions to apply to protein names. If the protein name matches then all spectra assigned to that protein are considered contaminants. [default: #{options[:contaminant_prefixes]}]") do |str|
|
33
34
|
options[:contaminant_regexes] = str.split(/,/).collect{|s| /#{s}/}
|
34
35
|
end
|
36
|
+
opts.separator "\nOptional arguments:\n\n"
|
37
|
+
opts.on("--pep-xml", "Input file is pep XML, rather than a DTA select output file [default: #{options[:input_is_pep_xml]}]") do |arg|
|
38
|
+
options[:input_is_pep_xml] = true
|
39
|
+
end
|
35
40
|
|
36
41
|
|
37
42
|
# logger options
|
@@ -74,7 +79,12 @@ if options[:whitelist_file]
|
|
74
79
|
end
|
75
80
|
|
76
81
|
# Parse the csv file
|
77
|
-
parsed =
|
82
|
+
parsed = nil
|
83
|
+
if options[:input_is_pep_xml]
|
84
|
+
parsed = Bio::PepXML.parse(ARGF)
|
85
|
+
else
|
86
|
+
parsed = Bio::DTASelect::OutputFile.parse(ARGF)
|
87
|
+
end
|
78
88
|
|
79
89
|
# Hashes of identifiers to objects
|
80
90
|
proteins = parsed.protein_name_to_object
|
@@ -90,12 +100,13 @@ mergers.each do |secondary_id, primary_id|
|
|
90
100
|
|
91
101
|
# Invalidate some things about the primary ID because they are no longer valid
|
92
102
|
current_protein = proteins[primary_id]
|
93
|
-
|
94
|
-
current_protein.
|
95
|
-
current_protein.
|
96
|
-
current_protein.
|
97
|
-
current_protein.
|
98
|
-
current_protein.
|
103
|
+
# These variables are not used and are not present in pepXML files, so don't mess with them.
|
104
|
+
# current_protein.sequence_count = nil
|
105
|
+
# current_protein.sequence_coverage = nil
|
106
|
+
# current_protein.length = nil
|
107
|
+
# current_protein.molwt = nil
|
108
|
+
# current_protein.pi = nil
|
109
|
+
# current_protein.validation_status = nil
|
99
110
|
# Keep the primary proteins' description, I reckon
|
100
111
|
|
101
112
|
# When there is spectra that are in the secondary but not the primary, add them to the primary's repertoire.
|
@@ -172,11 +183,13 @@ number_non_shared_peptides = all_peptides.select{|pep| pep.parent_proteins.lengt
|
|
172
183
|
total_peptides = number_shared_peptides+number_non_shared_peptides
|
173
184
|
log.info "Found #{number_shared_peptides} (#{number_shared_peptides.to_f/total_peptides*100}%) shared peptides and #{number_non_shared_peptides} (#{number_non_shared_peptides.to_f/total_peptides*100}%) non-shared peptides"
|
174
185
|
|
175
|
-
|
176
|
-
|
177
|
-
|
186
|
+
unless options[:input_is_pep_xml]
|
187
|
+
# Find non-starred peptides that occur only once in the file - maybe not possible given a correctly formatted file?
|
188
|
+
non_starred_but_uniquely_identified_peptides = hits.values.select do |peptide|
|
189
|
+
peptide.dtaselect_attributes['Unique'] == nil and peptide.parent_proteins.length == 1
|
190
|
+
end
|
191
|
+
log.debug "Found #{non_starred_but_uniquely_identified_peptides.length} different peptides that weren't starred or 2'd but the identifier is only found one time."
|
178
192
|
end
|
179
|
-
log.debug "Found #{non_starred_but_uniquely_identified_peptides.length} different peptides that weren't starred or 2'd but the identifier is only found one time."
|
180
193
|
|
181
194
|
# OK, finished parsing the file. Now output the score for each protein
|
182
195
|
puts [
|
data/divvy_proteomics.gemspec
CHANGED
@@ -2,14 +2,16 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
+
# stub: divvy_proteomics 0.3.0 ruby lib
|
5
6
|
|
6
7
|
Gem::Specification.new do |s|
|
7
8
|
s.name = "divvy_proteomics"
|
8
|
-
s.version = "0.
|
9
|
+
s.version = "0.3.0"
|
9
10
|
|
10
11
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
|
+
s.require_paths = ["lib"]
|
11
13
|
s.authors = ["Ben J Woodcroft"]
|
12
|
-
s.date = "
|
14
|
+
s.date = "2014-01-07"
|
13
15
|
s.description = "divvy up spectra from DTASelect files in a somewhat parsimonious way"
|
14
16
|
s.email = "donttrustben@gmail.com"
|
15
17
|
s.executables = ["divvy_spectra"]
|
@@ -28,8 +30,14 @@ Gem::Specification.new do |s|
|
|
28
30
|
"bin/divvy_spectra",
|
29
31
|
"divvy_proteomics.gemspec",
|
30
32
|
"lib/divvy_proteomics.rb",
|
33
|
+
"lib/divvyable_protein.rb",
|
31
34
|
"lib/dta_select_output.rb",
|
35
|
+
"lib/pep_xml.rb",
|
36
|
+
"spec/data/contaminant.pep.xml",
|
32
37
|
"spec/data/merge_definition.csv",
|
38
|
+
"spec/data/minimal.pep.xml",
|
39
|
+
"spec/data/minimal2.pep.xml",
|
40
|
+
"spec/data/minimal3.pep.xml",
|
33
41
|
"spec/data/multiply_mapped_spectra.csv",
|
34
42
|
"spec/data/new_format.csv",
|
35
43
|
"spec/data/new_format_some_all_shared_spectra.csv",
|
@@ -39,39 +47,39 @@ Gem::Specification.new do |s|
|
|
39
47
|
"spec/data/three_proteins_meant_for_merge.csv",
|
40
48
|
"spec/data/three_proteins_with_contaminant.csv",
|
41
49
|
"spec/divvy_proteomics_spec.rb",
|
50
|
+
"spec/pep_xml_spec.rb",
|
42
51
|
"spec/spec_helper.rb"
|
43
52
|
]
|
44
53
|
s.homepage = "http://github.com/wwood/divvy_proteomics"
|
45
54
|
s.licenses = ["MIT"]
|
46
|
-
s.
|
47
|
-
s.rubygems_version = "2.0.3"
|
55
|
+
s.rubygems_version = "2.2.0"
|
48
56
|
s.summary = "divvy up spectra from DTASelect files in a parsimonious way"
|
49
57
|
|
50
58
|
if s.respond_to? :specification_version then
|
51
59
|
s.specification_version = 4
|
52
60
|
|
53
61
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
54
|
-
s.add_runtime_dependency(%q<bio-logger>, ["
|
55
|
-
s.add_development_dependency(%q<systemu>, ["
|
56
|
-
s.add_development_dependency(%q<rspec>, ["
|
57
|
-
s.add_development_dependency(%q<rdoc>, ["
|
58
|
-
s.add_development_dependency(%q<bundler>, ["
|
59
|
-
s.add_development_dependency(%q<jeweler>, ["
|
62
|
+
s.add_runtime_dependency(%q<bio-logger>, ["~> 1.0"])
|
63
|
+
s.add_development_dependency(%q<systemu>, ["~> 2.6"])
|
64
|
+
s.add_development_dependency(%q<rspec>, ["~> 2.14"])
|
65
|
+
s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
|
66
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.5"])
|
67
|
+
s.add_development_dependency(%q<jeweler>, ["~> 2.0"])
|
60
68
|
else
|
61
|
-
s.add_dependency(%q<bio-logger>, ["
|
62
|
-
s.add_dependency(%q<systemu>, ["
|
63
|
-
s.add_dependency(%q<rspec>, ["
|
64
|
-
s.add_dependency(%q<rdoc>, ["
|
65
|
-
s.add_dependency(%q<bundler>, ["
|
66
|
-
s.add_dependency(%q<jeweler>, ["
|
69
|
+
s.add_dependency(%q<bio-logger>, ["~> 1.0"])
|
70
|
+
s.add_dependency(%q<systemu>, ["~> 2.6"])
|
71
|
+
s.add_dependency(%q<rspec>, ["~> 2.14"])
|
72
|
+
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
73
|
+
s.add_dependency(%q<bundler>, ["~> 1.5"])
|
74
|
+
s.add_dependency(%q<jeweler>, ["~> 2.0"])
|
67
75
|
end
|
68
76
|
else
|
69
|
-
s.add_dependency(%q<bio-logger>, ["
|
70
|
-
s.add_dependency(%q<systemu>, ["
|
71
|
-
s.add_dependency(%q<rspec>, ["
|
72
|
-
s.add_dependency(%q<rdoc>, ["
|
73
|
-
s.add_dependency(%q<bundler>, ["
|
74
|
-
s.add_dependency(%q<jeweler>, ["
|
77
|
+
s.add_dependency(%q<bio-logger>, ["~> 1.0"])
|
78
|
+
s.add_dependency(%q<systemu>, ["~> 2.6"])
|
79
|
+
s.add_dependency(%q<rspec>, ["~> 2.14"])
|
80
|
+
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
81
|
+
s.add_dependency(%q<bundler>, ["~> 1.5"])
|
82
|
+
s.add_dependency(%q<jeweler>, ["~> 2.0"])
|
75
83
|
end
|
76
84
|
end
|
77
85
|
|
data/lib/divvy_proteomics.rb
CHANGED
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'bio-logger'
|
2
|
+
Bio::Log::LoggerPlus.new('divvy_proteomics')
|
3
|
+
module Bio
|
4
|
+
module DivvyProteomics
|
5
|
+
module Logging
|
6
|
+
def log
|
7
|
+
Bio::Log::LoggerPlus['divvy_proteomics']
|
8
|
+
end
|
9
|
+
end
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
require 'divvyable_protein'
|
14
|
+
require 'dta_select_output'
|
15
|
+
require 'pep_xml'
|
16
|
+
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Bio::DivvyProteomics::DivvyableProtein
|
2
|
+
def unique_spectra
|
3
|
+
return 0 if @peptides.nil? or @peptides.empty?
|
4
|
+
num = @peptides.select{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
|
5
|
+
num ||= 0
|
6
|
+
return num
|
7
|
+
end
|
8
|
+
|
9
|
+
def non_unique_spectra
|
10
|
+
return 0 if @peptides.nil? or @peptides.empty?
|
11
|
+
num = @peptides.reject{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
|
12
|
+
num ||= 0
|
13
|
+
|
14
|
+
return num
|
15
|
+
end
|
16
|
+
|
17
|
+
# Are there any peptides that are assigned exclusively to this protein?
|
18
|
+
def uniquely_identified_by_any_peptides?
|
19
|
+
unique_spectra > 0
|
20
|
+
end
|
21
|
+
|
22
|
+
def estimated_spectral_count
|
23
|
+
# How many unique spectra are there for each protein that shares a peptide with the current protein
|
24
|
+
return 0 if @peptides.nil? or @peptides.empty?
|
25
|
+
peptide_shares = []
|
26
|
+
# If all peptides are non-unique and shared with some number of other proteins, then output 0 rather than trying to divvy up the shared spectra
|
27
|
+
if !uniquely_identified_by_any_peptides?
|
28
|
+
# Don't attempt to divvy these up, because there are too many assumptions involved
|
29
|
+
return 0
|
30
|
+
else
|
31
|
+
peptides.each do |peptide|
|
32
|
+
log.debug "Tallying peptide #{peptide.identifier}, which is has #{peptide.redundancy} spectra shared among #{peptide.parent_proteins.length} proteins"
|
33
|
+
log.debug "These proteins have #{peptide.parent_proteins.collect{|pro| pro.unique_spectra}.inspect} unique spectra each"
|
34
|
+
total_linked_unique_spectra = peptide.parent_proteins.collect{|pro| pro.unique_spectra}.reduce(:+)
|
35
|
+
peptide_shares.push unique_spectra.to_f/total_linked_unique_spectra*peptide.redundancy
|
36
|
+
end
|
37
|
+
return peptide_shares.reduce(:+)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
data/lib/dta_select_output.rb
CHANGED
@@ -1,12 +1,9 @@
|
|
1
1
|
|
2
2
|
|
3
3
|
|
4
|
+
|
5
|
+
|
4
6
|
module Bio::DTASelect
|
5
|
-
module Logging
|
6
|
-
def log
|
7
|
-
Bio::Log::LoggerPlus['divvy_spectra']
|
8
|
-
end
|
9
|
-
end
|
10
7
|
|
11
8
|
class OutputFile
|
12
9
|
def self.log
|
@@ -14,7 +11,8 @@ module Bio::DTASelect
|
|
14
11
|
end
|
15
12
|
|
16
13
|
class SelectedProtein
|
17
|
-
include Bio::
|
14
|
+
include Bio::DivvyProteomics::Logging
|
15
|
+
include Bio::DivvyProteomics::DivvyableProtein
|
18
16
|
|
19
17
|
attr_accessor :identifier
|
20
18
|
|
@@ -26,43 +24,7 @@ module Bio::DTASelect
|
|
26
24
|
@peptides = []
|
27
25
|
end
|
28
26
|
|
29
|
-
def unique_spectra
|
30
|
-
return 0 if @peptides.nil? or @peptides.empty?
|
31
|
-
num = @peptides.select{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
|
32
|
-
num ||= 0
|
33
|
-
return num
|
34
|
-
end
|
35
|
-
|
36
|
-
def non_unique_spectra
|
37
|
-
return 0 if @peptides.nil? or @peptides.empty?
|
38
|
-
num = @peptides.reject{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
|
39
|
-
num ||= 0
|
40
|
-
return num
|
41
|
-
end
|
42
27
|
|
43
|
-
# Are there any peptides that are assigned exclusively to this protein?
|
44
|
-
def uniquely_identified_by_any_peptides?
|
45
|
-
unique_spectra > 0
|
46
|
-
end
|
47
|
-
|
48
|
-
def estimated_spectral_count
|
49
|
-
# How many unique spectra are there for each protein that shares a peptide with the current peptide
|
50
|
-
return 0 if @peptides.nil? or @peptides.empty?
|
51
|
-
peptide_shares = []
|
52
|
-
# If all peptides are non-unique and shared with some number of other proteins, then output a negative number num shared spectra divided by the number of proteins
|
53
|
-
if !uniquely_identified_by_any_peptides?
|
54
|
-
# Don't attempt to divvy these up, because there are too many assumptions involved
|
55
|
-
return 0
|
56
|
-
else
|
57
|
-
peptides.each do |peptide|
|
58
|
-
log.debug "Tallying peptide #{peptide.identifier}, which is has #{peptide.redundancy} spectra shared among #{peptide.parent_proteins.length} proteins"
|
59
|
-
log.debug "These proteins have #{peptide.parent_proteins.collect{|pro| pro.unique_spectra}.inspect} unique spectra each"
|
60
|
-
total_linked_unique_spectra = peptide.parent_proteins.collect{|pro| pro.unique_spectra}.reduce(:+)
|
61
|
-
peptide_shares.push unique_spectra.to_f/total_linked_unique_spectra*peptide.redundancy
|
62
|
-
end
|
63
|
-
return peptide_shares.reduce(:+)
|
64
|
-
end
|
65
|
-
end
|
66
28
|
|
67
29
|
def log
|
68
30
|
Bio::Log::LoggerPlus[LOG_NAME]
|
@@ -70,7 +32,7 @@ module Bio::DTASelect
|
|
70
32
|
end
|
71
33
|
|
72
34
|
class Peptide
|
73
|
-
include Bio::
|
35
|
+
include Bio::DivvyProteomics::Logging
|
74
36
|
|
75
37
|
attr_accessor :identifier
|
76
38
|
|
@@ -98,7 +60,7 @@ module Bio::DTASelect
|
|
98
60
|
end
|
99
61
|
|
100
62
|
class Result
|
101
|
-
include Bio::
|
63
|
+
include Bio::DivvyProteomics::Logging
|
102
64
|
|
103
65
|
# hash of protein identifier to Protein object
|
104
66
|
attr_accessor :protein_name_to_object
|
@@ -123,7 +85,7 @@ module Bio::DTASelect
|
|
123
85
|
# Parse each line of the DTAselect file
|
124
86
|
io.each_line do |line|
|
125
87
|
splits = line.chomp.split("\t")
|
126
|
-
log.debug "Parsing line `#{line.chomp}'"
|
88
|
+
log.debug "Parsing line `#{line.chomp}'" if log.debug?
|
127
89
|
|
128
90
|
if reading_header
|
129
91
|
log.debug "reading header"
|
@@ -146,7 +108,7 @@ module Bio::DTASelect
|
|
146
108
|
if !last_line_was_protein_name
|
147
109
|
# Sometimes several proteins are given all in the one header line
|
148
110
|
# start a new protein
|
149
|
-
log.debug "New protein now being parsed"
|
111
|
+
log.debug "New protein now being parsed" if log.debug?
|
150
112
|
current_proteins = []
|
151
113
|
end
|
152
114
|
|
@@ -174,13 +136,13 @@ module Bio::DTASelect
|
|
174
136
|
|
175
137
|
|
176
138
|
elsif splits[1] == 'Proteins'
|
177
|
-
# Done processing, except for the bits down the bottom which aren't parsed (yet)
|
139
|
+
# Done processing, except for the bits down the bottom which aren't parsed (yet, at least)
|
178
140
|
break
|
179
141
|
|
180
142
|
|
181
143
|
|
182
144
|
else
|
183
|
-
log.debug "New spectra now being parsed"
|
145
|
+
log.debug "New spectra now being parsed" if log.debug?
|
184
146
|
last_line_was_protein_name = false
|
185
147
|
|
186
148
|
# Record a spectra
|
@@ -204,11 +166,11 @@ module Bio::DTASelect
|
|
204
166
|
pep.parent_proteins.push current_protein
|
205
167
|
current_protein.peptides.push pep
|
206
168
|
end
|
207
|
-
log.debug "Parsed this peptide #{pep.inspect}"
|
169
|
+
log.debug "Parsed this peptide #{pep.inspect}" if log.debug?
|
208
170
|
end
|
209
171
|
end
|
210
172
|
|
211
|
-
log.debug "Proteins parsed: #{result.protein_name_to_object.inspect}"
|
173
|
+
log.debug "Proteins parsed: #{result.protein_name_to_object.inspect}" if log.debug?
|
212
174
|
return result
|
213
175
|
end
|
214
176
|
end
|
data/lib/pep_xml.rb
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
require 'rexml/document'
|
2
|
+
|
3
|
+
class Bio::PepXML
|
4
|
+
include Bio::DivvyProteomics::Logging
|
5
|
+
|
6
|
+
attr_accessor :protein_name_to_object, :peptide_name_to_object
|
7
|
+
|
8
|
+
class Protein
|
9
|
+
include Bio::DivvyProteomics::Logging
|
10
|
+
include Bio::DivvyProteomics::DivvyableProtein
|
11
|
+
|
12
|
+
# Array of peptide objects that have been assigned to this protein
|
13
|
+
attr_accessor :peptides
|
14
|
+
|
15
|
+
attr_accessor :identifier, :descriptive_name
|
16
|
+
end
|
17
|
+
|
18
|
+
# Named 'Peptide' but really means a spectrum. Just too hard to change
|
19
|
+
class Peptide
|
20
|
+
attr_accessor :parent_proteins
|
21
|
+
|
22
|
+
# Name of the spectra
|
23
|
+
attr_accessor :identifier
|
24
|
+
|
25
|
+
def initialize
|
26
|
+
@parent_proteins = []
|
27
|
+
end
|
28
|
+
|
29
|
+
#TODO: right now this just always returns 1. It should really be working out redundancy
|
30
|
+
#properly by comparison of peptide sequences, but this info isn't parsed yet
|
31
|
+
def redundancy
|
32
|
+
1
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
def self.log
|
37
|
+
Bio::PepXML.new.log
|
38
|
+
end
|
39
|
+
|
40
|
+
def self.parse(io)
|
41
|
+
protein_name_to_object = {}
|
42
|
+
peptide_name_to_object = {}
|
43
|
+
|
44
|
+
#pep.elements.each('msms_pipeline_analysis/msms_run_summary/spectrum_query/search_result/search_hit'){|e|
|
45
|
+
# c+=1; p e.attributes['protein_descr'].strip;
|
46
|
+
# e.elements.each{|e|
|
47
|
+
# p e.name, e.attributes['protein_descr'].strip};break}
|
48
|
+
xml = REXML::Document.new(io)
|
49
|
+
|
50
|
+
parse_name_and_description = lambda do |e|
|
51
|
+
name = e.attributes['protein'].strip
|
52
|
+
description = e.attributes['protein_descr'].strip
|
53
|
+
if name.nil? or name == ''
|
54
|
+
name = e.attributes['protein_descr'].strip
|
55
|
+
else
|
56
|
+
description = name+' '+description
|
57
|
+
end
|
58
|
+
name.gsub!(/\t.*/,'')
|
59
|
+
description.gsub!(/[\t\n]/,' ')
|
60
|
+
|
61
|
+
[name, description]
|
62
|
+
end
|
63
|
+
|
64
|
+
#TODO: some better sanity checking here would be ideal.
|
65
|
+
num_hits_parsed = 0
|
66
|
+
xml.elements.each('msms_pipeline_analysis/msms_run_summary/spectrum_query/search_result/search_hit') do |hit|
|
67
|
+
hit_number = hit.attributes['hit_rank']
|
68
|
+
raise "Parsing error on #{hit}" if hit_number.nil?
|
69
|
+
next if hit_number != "1"
|
70
|
+
|
71
|
+
# Parse the primary hit
|
72
|
+
name1, description1 = parse_name_and_description.call(hit)
|
73
|
+
raise "No protein name found in this xml fragment: #{hit.to_s}" if name1.nil?
|
74
|
+
spectrum_name = hit.parent.parent.attributes['spectrum'].strip
|
75
|
+
raise "Parsing error (couldn't find spectrum name) with spectra #{hit.inspect}" if spectrum_name.nil?
|
76
|
+
|
77
|
+
# It is possible to have multiple peptides both hit the spectra with hit_rank="1"
|
78
|
+
# This happens when e.g. leucine and isoleucine are both possible.
|
79
|
+
spectrum = peptide_name_to_object[spectrum_name]
|
80
|
+
if spectrum.nil?
|
81
|
+
spectrum = Peptide.new
|
82
|
+
spectrum.identifier = spectrum_name
|
83
|
+
peptide_name_to_object[spectrum_name] = spectrum
|
84
|
+
end
|
85
|
+
|
86
|
+
|
87
|
+
protein1 = protein_name_to_object[name1]
|
88
|
+
if protein1.nil?
|
89
|
+
protein1 = Protein.new
|
90
|
+
protein1.identifier = name1
|
91
|
+
protein1.descriptive_name = description1
|
92
|
+
protein1.peptides = []
|
93
|
+
protein_name_to_object[name1] = protein1
|
94
|
+
end
|
95
|
+
protein1.peptides.push spectrum
|
96
|
+
spectrum.parent_proteins ||= []
|
97
|
+
spectrum.parent_proteins.push protein1
|
98
|
+
|
99
|
+
|
100
|
+
# Parse the alternate hits. Only look at children with protein_descr attributes - these are
|
101
|
+
# the alternate proteins
|
102
|
+
hit.each_element_with_attribute('protein_descr') do |e|
|
103
|
+
name, description = parse_name_and_description.call(e)
|
104
|
+
|
105
|
+
alternate = protein_name_to_object[name]
|
106
|
+
if alternate.nil?
|
107
|
+
alternate = Protein.new
|
108
|
+
alternate.identifier = name
|
109
|
+
alternate.descriptive_name = description
|
110
|
+
alternate.peptides = []
|
111
|
+
protein_name_to_object[name] = alternate
|
112
|
+
end
|
113
|
+
alternate.peptides.push spectrum
|
114
|
+
spectrum.parent_proteins.push alternate
|
115
|
+
end
|
116
|
+
|
117
|
+
# Don't count the same protein multiple times - might happen when a spectrum's hits name the same protein more than once
|
118
|
+
spectrum.parent_proteins.uniq!
|
119
|
+
|
120
|
+
num_hits_parsed += 1
|
121
|
+
end
|
122
|
+
log.info "Parsed #{num_hits_parsed} search hits"
|
123
|
+
|
124
|
+
pepxml = Bio::PepXML.new
|
125
|
+
pepxml.protein_name_to_object = protein_name_to_object
|
126
|
+
pepxml.peptide_name_to_object = peptide_name_to_object
|
127
|
+
|
128
|
+
return pepxml
|
129
|
+
end
|
130
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
<?xml version="1.0" encoding="utf-8"?>
|
2
|
+
<msms_pipeline_analysis date="2013-12-06T09:32:51.2000705-07:00" name="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT" summary_xml="" xmlns="http://regis-web.systemsbiology.net/pepXML">
|
3
|
+
<msms_run_summary base_name="D:\Proteome_Discoverer\RawFiles\FASP1\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT.msf" raw_data_type=".msf" raw_data=".msf" msManufacturer="" msModel="" msIonization="" msMassAnalyzer="" msDetector="">
|
4
|
+
<spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_03.1121.1121.2" start_scan="1121" end_scan="1121" retention_time_sec="5.4199816666666667" activation_method="CID" precursor_intensity="388495.5625" precursor_neutral_mass="1329.7252673153125" assumed_charge="2" index="221">
|
5
|
+
<search_result search_id="1">
|
6
|
+
<search_hit hit_rank="1" peptide="NLDLDSIIAEVK" protein="CNTM:cont_sp" num_tot_proteins="2" num_matched_ions="0" calc_neutral_pep_mass="1329.7252673153125" massdiff="0" protein_descr="P13647 K2C5_HUMAN Keratin, type II cytoskeletal 5 (Cytokeratin 5) (K5) (CK 5) (58 kDa cytokeratin) - Homo sapiens (Human). # pI:8.14 MW:62462" protein_mw="62.423064734660052" calc_pI="8.06005859375">
|
7
|
+
<alternative_protein protein="CNTM:cont_sp" protein_descr="P48668 K2CE_HUMAN Keratin, type II cytoskeletal 6E (Cytokeratin 6E) (CK 6E) (K6e keratin) - Homo sapiens (Human). # pI:8.14 MW:60092" protein_mw="60.05537958466001" />
|
8
|
+
<search_score name="XCorr" value="4.5027022361755371" />
|
9
|
+
</search_hit>
|
10
|
+
</search_result>
|
11
|
+
</spectrum_query>
|
12
|
+
</msms_run_summary>
|
13
|
+
</msms_pipeline_analysis>
|
@@ -0,0 +1,14 @@
|
|
1
|
+
<?xml version="1.0" encoding="utf-8"?>
|
2
|
+
<msms_pipeline_analysis date="2013-12-06T09:32:51.2000705-07:00" name="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT" summary_xml="" xmlns="http://regis-web.systemsbiology.net/pepXML">
|
3
|
+
<msms_run_summary base_name="D:\Proteome_Discoverer\RawFiles\FASP1\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT.msf" raw_data_type=".msf" raw_data=".msf" msManufacturer="" msModel="" msIonization="" msMassAnalyzer="" msDetector="">
|
4
|
+
<spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
|
5
|
+
<search_result search_id="1">
|
6
|
+
<search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr=">38SUR_2379_1524213_2	" protein_mw="43.185399974660044" calc_pI="5.63037109375">
|
7
|
+
<alternative_protein protein="" protein_descr=">38SUR_6350_1528184_1	" protein_mw="24.663561404659987" />
|
8
|
+
<alternative_protein protein="" protein_descr=">38SUR_80622_1602456_1	" protein_mw="30.364007294659981" />
|
9
|
+
<search_score name="XCorr" value="4.7916374206542969" />
|
10
|
+
</search_hit>
|
11
|
+
</search_result>
|
12
|
+
</spectrum_query>
|
13
|
+
</msms_run_summary>
|
14
|
+
</msms_pipeline_analysis>
|
@@ -0,0 +1,12 @@
|
|
1
|
+
<?xml version="1.0" encoding="utf-8"?>
|
2
|
+
<msms_pipeline_analysis date="2013-12-06T09:32:51.2000705-07:00" name="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT" summary_xml="" xmlns="http://regis-web.systemsbiology.net/pepXML">
|
3
|
+
<msms_run_summary base_name="D:\Proteome_Discoverer\RawFiles\FASP1\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT.msf" raw_data_type=".msf" raw_data=".msf" msManufacturer="" msModel="" msIonization="" msMassAnalyzer="" msDetector="">
|
4
|
+
<spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
|
5
|
+
<search_result search_id="1">
|
6
|
+
<search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr=">38SUR_2379_1524213_2	" protein_mw="43.185399974660044" calc_pI="5.63037109375">
|
7
|
+
<search_score name="XCorr" value="4.7916374206542969" />
|
8
|
+
</search_hit>
|
9
|
+
</search_result>
|
10
|
+
</spectrum_query>
|
11
|
+
</msms_run_summary>
|
12
|
+
</msms_pipeline_analysis>
|
@@ -0,0 +1,21 @@
|
|
1
|
+
<?xml version="1.0" encoding="utf-8"?>
|
2
|
+
<msms_pipeline_analysis date="2013-12-06T09:32:51.2000705-07:00" name="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT" summary_xml="" xmlns="http://regis-web.systemsbiology.net/pepXML">
|
3
|
+
<msms_run_summary base_name="D:\Proteome_Discoverer\RawFiles\FASP1\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT.msf" raw_data_type=".msf" raw_data=".msf" msManufacturer="" msModel="" msIonization="" msMassAnalyzer="" msDetector="">
|
4
|
+
<spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
|
5
|
+
<search_result search_id="1">
|
6
|
+
<search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr=">38SUR_2379_1524213_2	" protein_mw="43.185399974660044" calc_pI="5.63037109375">
|
7
|
+
<alternative_protein protein="" protein_descr=">38SUR_6350_1528184_1	" protein_mw="24.663561404659987" />
|
8
|
+
<alternative_protein protein="" protein_descr=">38SUR_80622_1602456_1	" protein_mw="30.364007294659981" />
|
9
|
+
<search_score name="XCorr" value="4.7916374206542969" />
|
10
|
+
</search_hit>
|
11
|
+
</search_result>
|
12
|
+
</spectrum_query>
|
13
|
+
<spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2_3" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
|
14
|
+
<search_result search_id="1">
|
15
|
+
<search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr=">38SUR_2379_1524213_2	" protein_mw="43.185399974660044" calc_pI="5.63037109375">
|
16
|
+
<search_score name="XCorr" value="4.7916374206542969" />
|
17
|
+
</search_hit>
|
18
|
+
</search_result>
|
19
|
+
</spectrum_query>
|
20
|
+
</msms_run_summary>
|
21
|
+
</msms_pipeline_analysis>
|
@@ -11,8 +11,6 @@ $:.unshift File.join(File.dirname(__FILE__),'..')
|
|
11
11
|
script_under_test = File.basename(__FILE__).gsub(/^test_/,'')
|
12
12
|
path_to_script = File.join(File.dirname(__FILE__),'..','bin','divvy_spectra')
|
13
13
|
|
14
|
-
TEST_DATA_DIR = File.join(File.dirname(__FILE__),'data')
|
15
|
-
|
16
14
|
describe script_under_test do
|
17
15
|
let(:header){"ID\tUnique spectra\tNon-unique spectra\tEstimated total spectra\tNormalised spectral count\tDescription\tProteins sharing spectra\n"}
|
18
16
|
it 'should do 1 protein hit' do
|
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'systemu'
|
2
|
+
require 'pp'
|
3
|
+
require 'open3'
|
4
|
+
require 'tempfile'
|
5
|
+
|
6
|
+
require 'spec_helper'
|
7
|
+
|
8
|
+
|
9
|
+
|
10
|
+
describe 'pepxml parsing' do
|
11
|
+
let(:header){"ID\tUnique spectra\tNon-unique spectra\tEstimated total spectra\tNormalised spectral count\tDescription\tProteins sharing spectra\n"}
|
12
|
+
it 'should parse decently' do
|
13
|
+
pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'minimal.pep.xml')))
|
14
|
+
|
15
|
+
# <spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
|
16
|
+
# <search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr=">38SUR_2379_1524213_2	" protein_mw="43.185399974660044" calc_pI="5.63037109375">
|
17
|
+
# <alternative_protein protein="" protein_descr=">38SUR_6350_1528184_1	" protein_mw="24.663561404659987" />
|
18
|
+
# <alternative_protein protein="" protein_descr=">38SUR_80622_1602456_1	" protein_mw="30.364007294659981" />
|
19
|
+
# <search_score name="XCorr" value="4.7916374206542969" />
|
20
|
+
|
21
|
+
pepxml.kind_of?(Bio::PepXML).should == true
|
22
|
+
|
23
|
+
pepxml.protein_name_to_object.keys.sort.should == [
|
24
|
+
'>38SUR_2379_1524213_2',
|
25
|
+
'>38SUR_6350_1528184_1',
|
26
|
+
'>38SUR_80622_1602456_1',
|
27
|
+
].sort
|
28
|
+
pepxml.peptide_name_to_object.keys.sort.should == [
|
29
|
+
'Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2'
|
30
|
+
]
|
31
|
+
pepxml.protein_name_to_object.values.each do |prot|
|
32
|
+
prot.kind_of?(Bio::PepXML::Protein).should == true
|
33
|
+
end
|
34
|
+
pepxml.peptide_name_to_object.values.each do |prot|
|
35
|
+
prot.kind_of?(Bio::PepXML::Peptide).should == true
|
36
|
+
end
|
37
|
+
|
38
|
+
prot1 = pepxml.protein_name_to_object['>38SUR_2379_1524213_2']
|
39
|
+
prot1.identifier.should == '>38SUR_2379_1524213_2'
|
40
|
+
prot1.descriptive_name.should == '>38SUR_2379_1524213_2'
|
41
|
+
end
|
42
|
+
|
43
|
+
it 'should respond to divvy proteomics module things' do
|
44
|
+
pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'minimal.pep.xml')))
|
45
|
+
|
46
|
+
# <spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
|
47
|
+
# <search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr=">38SUR_2379_1524213_2	" protein_mw="43.185399974660044" calc_pI="5.63037109375">
|
48
|
+
# <alternative_protein protein="" protein_descr=">38SUR_6350_1528184_1	" protein_mw="24.663561404659987" />
|
49
|
+
# <alternative_protein protein="" protein_descr=">38SUR_80622_1602456_1	" protein_mw="30.364007294659981" />
|
50
|
+
# <search_score name="XCorr" value="4.7916374206542969" />
|
51
|
+
|
52
|
+
pepxml.kind_of?(Bio::PepXML).should == true
|
53
|
+
|
54
|
+
prot1 = pepxml.protein_name_to_object['>38SUR_2379_1524213_2']
|
55
|
+
prot1.peptides.length.should == 1
|
56
|
+
prot1.unique_spectra.should == 0
|
57
|
+
prot1.non_unique_spectra.should == 1
|
58
|
+
prot1.estimated_spectral_count.should == 0.0
|
59
|
+
|
60
|
+
|
61
|
+
prot1 = pepxml.protein_name_to_object['>38SUR_6350_1528184_1']
|
62
|
+
prot1.peptides.length.should == 1
|
63
|
+
prot1.unique_spectra.should == 0
|
64
|
+
prot1.non_unique_spectra.should == 1
|
65
|
+
prot1.estimated_spectral_count.should == 0.0
|
66
|
+
end
|
67
|
+
|
68
|
+
it 'should respond to divvy proteomics module things with 1 unique hit' do
|
69
|
+
pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'minimal2.pep.xml')))
|
70
|
+
pepxml.kind_of?(Bio::PepXML).should == true
|
71
|
+
|
72
|
+
prot1 = pepxml.protein_name_to_object['>38SUR_2379_1524213_2']
|
73
|
+
prot1.peptides.length.should == 1
|
74
|
+
prot1.unique_spectra.should == 1
|
75
|
+
prot1.non_unique_spectra.should == 0
|
76
|
+
prot1.estimated_spectral_count.should == 1.0
|
77
|
+
end
|
78
|
+
|
79
|
+
it 'should respond to divvy proteomics module things with 2 hits, where 1 is unique' do
|
80
|
+
pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'minimal3.pep.xml')))
|
81
|
+
pepxml.kind_of?(Bio::PepXML).should == true
|
82
|
+
|
83
|
+
prot1 = pepxml.protein_name_to_object['>38SUR_2379_1524213_2']
|
84
|
+
prot1.peptides.length.should == 2
|
85
|
+
prot1.unique_spectra.should == 1
|
86
|
+
prot1.non_unique_spectra.should == 1
|
87
|
+
prot1.estimated_spectral_count.should == 2.0
|
88
|
+
end
|
89
|
+
|
90
|
+
it 'should parse when the protein and protein_desc attributes are both defined' do
|
91
|
+
pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'contaminant.pep.xml')))
|
92
|
+
pepxml.kind_of?(Bio::PepXML).should == true
|
93
|
+
|
94
|
+
prot1 = pepxml.protein_name_to_object['CNTM:cont_sp']
|
95
|
+
prot1.nil?.should == false
|
96
|
+
prot1.identifier.should == 'CNTM:cont_sp'
|
97
|
+
prot1.descriptive_name.should == 'CNTM:cont_sp P13647 K2C5_HUMAN Keratin, type II cytoskeletal 5 (Cytokeratin 5) (K5) (CK 5) (58 kDa cytokeratin) - Homo sapiens (Human). # pI:8.14 MW:62462'
|
98
|
+
end
|
99
|
+
end
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,99 +1,99 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: divvy_proteomics
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben J Woodcroft
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2014-01-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio-logger
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
19
|
+
version: '1.0'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- -
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
26
|
+
version: '1.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: systemu
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '2.6'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '2.6'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rspec
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- -
|
45
|
+
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: 2.
|
47
|
+
version: '2.14'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- -
|
52
|
+
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: 2.
|
54
|
+
version: '2.14'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: rdoc
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- -
|
59
|
+
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
61
|
version: '3.12'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- -
|
66
|
+
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '3.12'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: bundler
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- -
|
73
|
+
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: 1.
|
75
|
+
version: '1.5'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- -
|
80
|
+
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: 1.
|
82
|
+
version: '1.5'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: jeweler
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- -
|
87
|
+
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version:
|
89
|
+
version: '2.0'
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
|
-
- -
|
94
|
+
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version:
|
96
|
+
version: '2.0'
|
97
97
|
description: divvy up spectra from DTASelect files in a somewhat parsimonious way
|
98
98
|
email: donttrustben@gmail.com
|
99
99
|
executables:
|
@@ -103,8 +103,8 @@ extra_rdoc_files:
|
|
103
103
|
- LICENSE.txt
|
104
104
|
- README.md
|
105
105
|
files:
|
106
|
-
- .document
|
107
|
-
- .rspec
|
106
|
+
- ".document"
|
107
|
+
- ".rspec"
|
108
108
|
- Gemfile
|
109
109
|
- LICENSE.txt
|
110
110
|
- README.md
|
@@ -113,8 +113,14 @@ files:
|
|
113
113
|
- bin/divvy_spectra
|
114
114
|
- divvy_proteomics.gemspec
|
115
115
|
- lib/divvy_proteomics.rb
|
116
|
+
- lib/divvyable_protein.rb
|
116
117
|
- lib/dta_select_output.rb
|
118
|
+
- lib/pep_xml.rb
|
119
|
+
- spec/data/contaminant.pep.xml
|
117
120
|
- spec/data/merge_definition.csv
|
121
|
+
- spec/data/minimal.pep.xml
|
122
|
+
- spec/data/minimal2.pep.xml
|
123
|
+
- spec/data/minimal3.pep.xml
|
118
124
|
- spec/data/multiply_mapped_spectra.csv
|
119
125
|
- spec/data/new_format.csv
|
120
126
|
- spec/data/new_format_some_all_shared_spectra.csv
|
@@ -124,6 +130,7 @@ files:
|
|
124
130
|
- spec/data/three_proteins_meant_for_merge.csv
|
125
131
|
- spec/data/three_proteins_with_contaminant.csv
|
126
132
|
- spec/divvy_proteomics_spec.rb
|
133
|
+
- spec/pep_xml_spec.rb
|
127
134
|
- spec/spec_helper.rb
|
128
135
|
homepage: http://github.com/wwood/divvy_proteomics
|
129
136
|
licenses:
|
@@ -135,17 +142,17 @@ require_paths:
|
|
135
142
|
- lib
|
136
143
|
required_ruby_version: !ruby/object:Gem::Requirement
|
137
144
|
requirements:
|
138
|
-
- -
|
145
|
+
- - ">="
|
139
146
|
- !ruby/object:Gem::Version
|
140
147
|
version: '0'
|
141
148
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
142
149
|
requirements:
|
143
|
-
- -
|
150
|
+
- - ">="
|
144
151
|
- !ruby/object:Gem::Version
|
145
152
|
version: '0'
|
146
153
|
requirements: []
|
147
154
|
rubyforge_project:
|
148
|
-
rubygems_version: 2.0
|
155
|
+
rubygems_version: 2.2.0
|
149
156
|
signing_key:
|
150
157
|
specification_version: 4
|
151
158
|
summary: divvy up spectra from DTASelect files in a parsimonious way
|