divvy_proteomics 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +6 -6
- data/README.md +14 -4
- data/VERSION +1 -1
- data/bin/divvy_spectra +28 -15
- data/divvy_proteomics.gemspec +30 -22
- data/lib/divvy_proteomics.rb +16 -0
- data/lib/divvyable_protein.rb +40 -0
- data/lib/dta_select_output.rb +12 -50
- data/lib/pep_xml.rb +130 -0
- data/spec/data/contaminant.pep.xml +13 -0
- data/spec/data/minimal.pep.xml +14 -0
- data/spec/data/minimal2.pep.xml +12 -0
- data/spec/data/minimal3.pep.xml +21 -0
- data/spec/divvy_proteomics_spec.rb +0 -2
- data/spec/pep_xml_spec.rb +99 -0
- data/spec/spec_helper.rb +3 -1
- metadata +36 -29
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 57c4b7c64ec34ec42b4b28f6a31537a8901d5a7c
|
4
|
+
data.tar.gz: c216e563aa04c13e7b935852862994579e45d61f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7a1e7dd2be6565d9503ce1e5fc43cb2362920fa8146f66d7c179c0a812de5e1ea2be9f42d70f63bb2bf3811e44f5800f01e4bbbb8731a4365d129c01c138edae
|
7
|
+
data.tar.gz: 846cf30c524054c62205de65aba81d37bdd1f2857cec1e623730d167efa6eacc71491a999806e73547b50d0c36a4b87993245d20217d47090c8e3da6eecf31b3
|
data/Gemfile
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
source "http://rubygems.org"
|
2
2
|
|
3
|
-
gem 'bio-logger', "
|
3
|
+
gem 'bio-logger', "~> 1.0"
|
4
4
|
|
5
5
|
# Add dependencies to develop your gem here.
|
6
6
|
# Include everything needed to run rake, tests, features, etc.
|
7
7
|
group :development do
|
8
|
-
gem 'systemu', "
|
9
|
-
gem "rspec", "
|
10
|
-
gem "rdoc", "
|
11
|
-
gem "bundler", "
|
12
|
-
gem "jeweler", "
|
8
|
+
gem 'systemu', "~> 2.6"
|
9
|
+
gem "rspec", "~> 2.14"
|
10
|
+
gem "rdoc", "~> 3.12"
|
11
|
+
gem "bundler", "~> 1.5"
|
12
|
+
gem "jeweler", "~> 2.0"
|
13
13
|
end
|
data/README.md
CHANGED
@@ -13,20 +13,30 @@ $ gem install divvy_spectra
|
|
13
13
|
$ divvy_spectra <DTASelectFile>
|
14
14
|
```
|
15
15
|
Output is a table, with a row for each protein with a few columns, including number of unique spectra and the
|
16
|
-
estimated number of spectral counts after sorting out the non-uniqueness.
|
16
|
+
estimated number of spectral counts after sorting out the non-uniqueness. Using the ```--pep-xml``` flag, PepXML files
|
17
|
+
can also be used as input:
|
18
|
+
|
19
|
+
```
|
20
|
+
$ divvy_spectra --pep-xml <PepXML_file>
|
21
|
+
```
|
17
22
|
|
18
23
|
Full usage information:
|
19
24
|
```
|
20
|
-
$ divvy_spectra -h
|
21
25
|
|
22
|
-
Usage: divvy_spectra [options] <
|
26
|
+
Usage: divvy_spectra [options] <input_file>
|
23
27
|
|
24
|
-
Takes a tab separated file containing a (possibly modified) output from a DTAselect run, and use some algorithm to divy up the spectra that match multiple peptides.
|
28
|
+
Takes a tab separated file containing a (possibly modified) output from a DTAselect run (or a pepXML file and add the flag --pep-xml), and use some algorithm to divy up the spectra that match multiple peptides.
|
25
29
|
|
26
30
|
--merge-proteins FILE_OF_IDENTIFIERS
|
27
31
|
Provide a space/tab separated file where the identifiers on each row should be treated as one protein
|
28
32
|
--whitelist FILE_OF_PROTEINS_TO_REPORT
|
29
33
|
Only report proteins that are in this whitelist, after divvying with everything
|
34
|
+
--contaminant-regexes REGEXES
|
35
|
+
Comma-separated list of regular expressions to apply to protein names. If the protein name matches then all spectra assigned to that protein are considered contaminants. [default: ]
|
36
|
+
|
37
|
+
Optional arguments:
|
38
|
+
|
39
|
+
--pep-xml Input file is pep XML, rather than a DTA select output file [default: false]
|
30
40
|
|
31
41
|
Verbosity:
|
32
42
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.3.0
|
data/bin/divvy_spectra
CHANGED
@@ -5,23 +5,24 @@ require 'bio-logger'
|
|
5
5
|
require 'pp'
|
6
6
|
require 'set'
|
7
7
|
|
8
|
-
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME =
|
8
|
+
SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = 'divvy_proteomics'
|
9
9
|
|
10
10
|
rootpath = File.dirname(File.dirname(__FILE__))
|
11
11
|
$: << File.join(rootpath,'lib')
|
12
|
-
require '
|
12
|
+
require 'divvy_proteomics'
|
13
13
|
|
14
14
|
# Parse command line options into the options hash
|
15
15
|
options = {
|
16
16
|
:logger => 'stderr',
|
17
17
|
:log_level => 'info',
|
18
18
|
:contaminant_regexes => [/^CNTM:/],
|
19
|
+
:input_is_pep_xml => false,
|
19
20
|
}
|
20
21
|
o = OptionParser.new do |opts|
|
21
22
|
opts.banner = "
|
22
|
-
Usage: #{SCRIPT_NAME} [options] <
|
23
|
+
Usage: #{SCRIPT_NAME} [options] <input_file>
|
23
24
|
|
24
|
-
Takes a tab separated file containing a (possibly modified) output from a DTAselect run, and use some algorithm to divy up the spectra that match multiple peptides.\n\n"
|
25
|
+
Takes a tab separated file containing a (possibly modified) output from a DTAselect run (or a pepXML file and add the flag --pep-xml), and use some algorithm to divy up the spectra that match multiple peptides.\n\n"
|
25
26
|
|
26
27
|
opts.on("--merge-proteins FILE_OF_IDENTIFIERS", "Provide a space/tab separated file where the identifiers on each row should be treated as one protein") do |file|
|
27
28
|
options[:merge_proteins_file] = file
|
@@ -32,6 +33,10 @@ o = OptionParser.new do |opts|
|
|
32
33
|
opts.on("--contaminant-regexes REGEXES", "Comma-separated list of regular expressions to apply to protein names. If the protein name matches then all spectra assigned to that protein are considered contaminants. [default: #{options[:contaminant_prefixes]}]") do |str|
|
33
34
|
options[:contaminant_regexes] = str.split(/,/).collect{|s| /#{s}/}
|
34
35
|
end
|
36
|
+
opts.separator "\nOptional arguments:\n\n"
|
37
|
+
opts.on("--pep-xml", "Input file is pep XML, rather than a DTA select output file [default: #{options[:input_is_pep_xml]}]") do |arg|
|
38
|
+
options[:input_is_pep_xml] = true
|
39
|
+
end
|
35
40
|
|
36
41
|
|
37
42
|
# logger options
|
@@ -74,7 +79,12 @@ if options[:whitelist_file]
|
|
74
79
|
end
|
75
80
|
|
76
81
|
# Parse the csv file
|
77
|
-
parsed =
|
82
|
+
parsed = nil
|
83
|
+
if options[:input_is_pep_xml]
|
84
|
+
parsed = Bio::PepXML.parse(ARGF)
|
85
|
+
else
|
86
|
+
parsed = Bio::DTASelect::OutputFile.parse(ARGF)
|
87
|
+
end
|
78
88
|
|
79
89
|
# Hashes of identifiers to objects
|
80
90
|
proteins = parsed.protein_name_to_object
|
@@ -90,12 +100,13 @@ mergers.each do |secondary_id, primary_id|
|
|
90
100
|
|
91
101
|
# Invalidate some things about the primary ID because they are no longer valid
|
92
102
|
current_protein = proteins[primary_id]
|
93
|
-
|
94
|
-
current_protein.
|
95
|
-
current_protein.
|
96
|
-
current_protein.
|
97
|
-
current_protein.
|
98
|
-
current_protein.
|
103
|
+
# These variables are not used and are not present in pepXML files, so don't mess with them.
|
104
|
+
# current_protein.sequence_count = nil
|
105
|
+
# current_protein.sequence_coverage = nil
|
106
|
+
# current_protein.length = nil
|
107
|
+
# current_protein.molwt = nil
|
108
|
+
# current_protein.pi = nil
|
109
|
+
# current_protein.validation_status = nil
|
99
110
|
# Keep the primary proteins' description, I reckon
|
100
111
|
|
101
112
|
# When there is spectra that are in the secondary but not the primary, add them to the primary's repertoire.
|
@@ -172,11 +183,13 @@ number_non_shared_peptides = all_peptides.select{|pep| pep.parent_proteins.lengt
|
|
172
183
|
total_peptides = number_shared_peptides+number_non_shared_peptides
|
173
184
|
log.info "Found #{number_shared_peptides} (#{number_shared_peptides.to_f/total_peptides*100}%) shared peptides and #{number_non_shared_peptides} (#{number_non_shared_peptides.to_f/total_peptides*100}%) non-shared peptides"
|
174
185
|
|
175
|
-
|
176
|
-
|
177
|
-
|
186
|
+
unless options[:input_is_pep_xml]
|
187
|
+
# Find non-starred peptides that occur only once in the file - maybe not possible given a correctly formatted file?
|
188
|
+
non_starred_but_uniquely_identified_peptides = hits.values.select do |peptide|
|
189
|
+
peptide.dtaselect_attributes['Unique'] == nil and peptide.parent_proteins.length == 1
|
190
|
+
end
|
191
|
+
log.debug "Found #{non_starred_but_uniquely_identified_peptides.length} different peptides that weren't starred or 2'd but the identifier is only found one time."
|
178
192
|
end
|
179
|
-
log.debug "Found #{non_starred_but_uniquely_identified_peptides.length} different peptides that weren't starred or 2'd but the identifier is only found one time."
|
180
193
|
|
181
194
|
# OK, finished parsing the file. Now output the score for each protein
|
182
195
|
puts [
|
data/divvy_proteomics.gemspec
CHANGED
@@ -2,14 +2,16 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
+
# stub: divvy_proteomics 0.3.0 ruby lib
|
5
6
|
|
6
7
|
Gem::Specification.new do |s|
|
7
8
|
s.name = "divvy_proteomics"
|
8
|
-
s.version = "0.
|
9
|
+
s.version = "0.3.0"
|
9
10
|
|
10
11
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
12
|
+
s.require_paths = ["lib"]
|
11
13
|
s.authors = ["Ben J Woodcroft"]
|
12
|
-
s.date = "
|
14
|
+
s.date = "2014-01-07"
|
13
15
|
s.description = "divvy up spectra from DTASelect files in a somewhat parsimonious way"
|
14
16
|
s.email = "donttrustben@gmail.com"
|
15
17
|
s.executables = ["divvy_spectra"]
|
@@ -28,8 +30,14 @@ Gem::Specification.new do |s|
|
|
28
30
|
"bin/divvy_spectra",
|
29
31
|
"divvy_proteomics.gemspec",
|
30
32
|
"lib/divvy_proteomics.rb",
|
33
|
+
"lib/divvyable_protein.rb",
|
31
34
|
"lib/dta_select_output.rb",
|
35
|
+
"lib/pep_xml.rb",
|
36
|
+
"spec/data/contaminant.pep.xml",
|
32
37
|
"spec/data/merge_definition.csv",
|
38
|
+
"spec/data/minimal.pep.xml",
|
39
|
+
"spec/data/minimal2.pep.xml",
|
40
|
+
"spec/data/minimal3.pep.xml",
|
33
41
|
"spec/data/multiply_mapped_spectra.csv",
|
34
42
|
"spec/data/new_format.csv",
|
35
43
|
"spec/data/new_format_some_all_shared_spectra.csv",
|
@@ -39,39 +47,39 @@ Gem::Specification.new do |s|
|
|
39
47
|
"spec/data/three_proteins_meant_for_merge.csv",
|
40
48
|
"spec/data/three_proteins_with_contaminant.csv",
|
41
49
|
"spec/divvy_proteomics_spec.rb",
|
50
|
+
"spec/pep_xml_spec.rb",
|
42
51
|
"spec/spec_helper.rb"
|
43
52
|
]
|
44
53
|
s.homepage = "http://github.com/wwood/divvy_proteomics"
|
45
54
|
s.licenses = ["MIT"]
|
46
|
-
s.
|
47
|
-
s.rubygems_version = "2.0.3"
|
55
|
+
s.rubygems_version = "2.2.0"
|
48
56
|
s.summary = "divvy up spectra from DTASelect files in a parsimonious way"
|
49
57
|
|
50
58
|
if s.respond_to? :specification_version then
|
51
59
|
s.specification_version = 4
|
52
60
|
|
53
61
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
54
|
-
s.add_runtime_dependency(%q<bio-logger>, ["
|
55
|
-
s.add_development_dependency(%q<systemu>, ["
|
56
|
-
s.add_development_dependency(%q<rspec>, ["
|
57
|
-
s.add_development_dependency(%q<rdoc>, ["
|
58
|
-
s.add_development_dependency(%q<bundler>, ["
|
59
|
-
s.add_development_dependency(%q<jeweler>, ["
|
62
|
+
s.add_runtime_dependency(%q<bio-logger>, ["~> 1.0"])
|
63
|
+
s.add_development_dependency(%q<systemu>, ["~> 2.6"])
|
64
|
+
s.add_development_dependency(%q<rspec>, ["~> 2.14"])
|
65
|
+
s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
|
66
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.5"])
|
67
|
+
s.add_development_dependency(%q<jeweler>, ["~> 2.0"])
|
60
68
|
else
|
61
|
-
s.add_dependency(%q<bio-logger>, ["
|
62
|
-
s.add_dependency(%q<systemu>, ["
|
63
|
-
s.add_dependency(%q<rspec>, ["
|
64
|
-
s.add_dependency(%q<rdoc>, ["
|
65
|
-
s.add_dependency(%q<bundler>, ["
|
66
|
-
s.add_dependency(%q<jeweler>, ["
|
69
|
+
s.add_dependency(%q<bio-logger>, ["~> 1.0"])
|
70
|
+
s.add_dependency(%q<systemu>, ["~> 2.6"])
|
71
|
+
s.add_dependency(%q<rspec>, ["~> 2.14"])
|
72
|
+
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
73
|
+
s.add_dependency(%q<bundler>, ["~> 1.5"])
|
74
|
+
s.add_dependency(%q<jeweler>, ["~> 2.0"])
|
67
75
|
end
|
68
76
|
else
|
69
|
-
s.add_dependency(%q<bio-logger>, ["
|
70
|
-
s.add_dependency(%q<systemu>, ["
|
71
|
-
s.add_dependency(%q<rspec>, ["
|
72
|
-
s.add_dependency(%q<rdoc>, ["
|
73
|
-
s.add_dependency(%q<bundler>, ["
|
74
|
-
s.add_dependency(%q<jeweler>, ["
|
77
|
+
s.add_dependency(%q<bio-logger>, ["~> 1.0"])
|
78
|
+
s.add_dependency(%q<systemu>, ["~> 2.6"])
|
79
|
+
s.add_dependency(%q<rspec>, ["~> 2.14"])
|
80
|
+
s.add_dependency(%q<rdoc>, ["~> 3.12"])
|
81
|
+
s.add_dependency(%q<bundler>, ["~> 1.5"])
|
82
|
+
s.add_dependency(%q<jeweler>, ["~> 2.0"])
|
75
83
|
end
|
76
84
|
end
|
77
85
|
|
data/lib/divvy_proteomics.rb
CHANGED
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'bio-logger'
|
2
|
+
Bio::Log::LoggerPlus.new('divvy_proteomics')
|
3
|
+
module Bio
|
4
|
+
module DivvyProteomics
|
5
|
+
module Logging
|
6
|
+
def log
|
7
|
+
Bio::Log::LoggerPlus['divvy_proteomics']
|
8
|
+
end
|
9
|
+
end
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
require 'divvyable_protein'
|
14
|
+
require 'dta_select_output'
|
15
|
+
require 'pep_xml'
|
16
|
+
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Bio::DivvyProteomics::DivvyableProtein
|
2
|
+
def unique_spectra
|
3
|
+
return 0 if @peptides.nil? or @peptides.empty?
|
4
|
+
num = @peptides.select{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
|
5
|
+
num ||= 0
|
6
|
+
return num
|
7
|
+
end
|
8
|
+
|
9
|
+
def non_unique_spectra
|
10
|
+
return 0 if @peptides.nil? or @peptides.empty?
|
11
|
+
num = @peptides.reject{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
|
12
|
+
num ||= 0
|
13
|
+
|
14
|
+
return num
|
15
|
+
end
|
16
|
+
|
17
|
+
# Are there any peptides that are assigned exclusively to this protein?
|
18
|
+
def uniquely_identified_by_any_peptides?
|
19
|
+
unique_spectra > 0
|
20
|
+
end
|
21
|
+
|
22
|
+
def estimated_spectral_count
|
23
|
+
# How many unique spectra are there for each protein that shares a peptide with the current peptide
|
24
|
+
return 0 if @peptides.nil? or @peptides.empty?
|
25
|
+
peptide_shares = []
|
26
|
+
# If all peptides are non-unique and shared with some number of other proteins, then output a negative number num shared spectra divided by the number of proteins
|
27
|
+
if !uniquely_identified_by_any_peptides?
|
28
|
+
# Don't attempt to divvy these up, because there are too many assumptions involved
|
29
|
+
return 0
|
30
|
+
else
|
31
|
+
peptides.each do |peptide|
|
32
|
+
log.debug "Tallying peptide #{peptide.identifier}, which is has #{peptide.redundancy} spectra shared among #{peptide.parent_proteins.length} proteins"
|
33
|
+
log.debug "These proteins have #{peptide.parent_proteins.collect{|pro| pro.unique_spectra}.inspect} unique spectra each"
|
34
|
+
total_linked_unique_spectra = peptide.parent_proteins.collect{|pro| pro.unique_spectra}.reduce(:+)
|
35
|
+
peptide_shares.push unique_spectra.to_f/total_linked_unique_spectra*peptide.redundancy
|
36
|
+
end
|
37
|
+
return peptide_shares.reduce(:+)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
data/lib/dta_select_output.rb
CHANGED
@@ -1,12 +1,9 @@
|
|
1
1
|
|
2
2
|
|
3
3
|
|
4
|
+
|
5
|
+
|
4
6
|
module Bio::DTASelect
|
5
|
-
module Logging
|
6
|
-
def log
|
7
|
-
Bio::Log::LoggerPlus['divvy_spectra']
|
8
|
-
end
|
9
|
-
end
|
10
7
|
|
11
8
|
class OutputFile
|
12
9
|
def self.log
|
@@ -14,7 +11,8 @@ module Bio::DTASelect
|
|
14
11
|
end
|
15
12
|
|
16
13
|
class SelectedProtein
|
17
|
-
include Bio::
|
14
|
+
include Bio::DivvyProteomics::Logging
|
15
|
+
include Bio::DivvyProteomics::DivvyableProtein
|
18
16
|
|
19
17
|
attr_accessor :identifier
|
20
18
|
|
@@ -26,43 +24,7 @@ module Bio::DTASelect
|
|
26
24
|
@peptides = []
|
27
25
|
end
|
28
26
|
|
29
|
-
def unique_spectra
|
30
|
-
return 0 if @peptides.nil? or @peptides.empty?
|
31
|
-
num = @peptides.select{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
|
32
|
-
num ||= 0
|
33
|
-
return num
|
34
|
-
end
|
35
|
-
|
36
|
-
def non_unique_spectra
|
37
|
-
return 0 if @peptides.nil? or @peptides.empty?
|
38
|
-
num = @peptides.reject{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
|
39
|
-
num ||= 0
|
40
|
-
return num
|
41
|
-
end
|
42
27
|
|
43
|
-
# Are there any peptides that are assigned exclusively to this protein?
|
44
|
-
def uniquely_identified_by_any_peptides?
|
45
|
-
unique_spectra > 0
|
46
|
-
end
|
47
|
-
|
48
|
-
def estimated_spectral_count
|
49
|
-
# How many unique spectra are there for each protein that shares a peptide with the current peptide
|
50
|
-
return 0 if @peptides.nil? or @peptides.empty?
|
51
|
-
peptide_shares = []
|
52
|
-
# If all peptides are non-unique and shared with some number of other proteins, then output a negative number num shared spectra divided by the number of proteins
|
53
|
-
if !uniquely_identified_by_any_peptides?
|
54
|
-
# Don't attempt to divvy these up, because there are too many assumptions involved
|
55
|
-
return 0
|
56
|
-
else
|
57
|
-
peptides.each do |peptide|
|
58
|
-
log.debug "Tallying peptide #{peptide.identifier}, which is has #{peptide.redundancy} spectra shared among #{peptide.parent_proteins.length} proteins"
|
59
|
-
log.debug "These proteins have #{peptide.parent_proteins.collect{|pro| pro.unique_spectra}.inspect} unique spectra each"
|
60
|
-
total_linked_unique_spectra = peptide.parent_proteins.collect{|pro| pro.unique_spectra}.reduce(:+)
|
61
|
-
peptide_shares.push unique_spectra.to_f/total_linked_unique_spectra*peptide.redundancy
|
62
|
-
end
|
63
|
-
return peptide_shares.reduce(:+)
|
64
|
-
end
|
65
|
-
end
|
66
28
|
|
67
29
|
def log
|
68
30
|
Bio::Log::LoggerPlus[LOG_NAME]
|
@@ -70,7 +32,7 @@ module Bio::DTASelect
|
|
70
32
|
end
|
71
33
|
|
72
34
|
class Peptide
|
73
|
-
include Bio::
|
35
|
+
include Bio::DivvyProteomics::Logging
|
74
36
|
|
75
37
|
attr_accessor :identifier
|
76
38
|
|
@@ -98,7 +60,7 @@ module Bio::DTASelect
|
|
98
60
|
end
|
99
61
|
|
100
62
|
class Result
|
101
|
-
include Bio::
|
63
|
+
include Bio::DivvyProteomics::Logging
|
102
64
|
|
103
65
|
# hash of protein identifier to Protein object
|
104
66
|
attr_accessor :protein_name_to_object
|
@@ -123,7 +85,7 @@ module Bio::DTASelect
|
|
123
85
|
# Parse each line of the DTAselect file
|
124
86
|
io.each_line do |line|
|
125
87
|
splits = line.chomp.split("\t")
|
126
|
-
log.debug "Parsing line `#{line.chomp}'"
|
88
|
+
log.debug "Parsing line `#{line.chomp}'" if log.debug?
|
127
89
|
|
128
90
|
if reading_header
|
129
91
|
log.debug "reading header"
|
@@ -146,7 +108,7 @@ module Bio::DTASelect
|
|
146
108
|
if !last_line_was_protein_name
|
147
109
|
# Sometimes several proteins are given all in the one header line
|
148
110
|
# start a new protein
|
149
|
-
log.debug "New protein now being parsed"
|
111
|
+
log.debug "New protein now being parsed" if log.debug?
|
150
112
|
current_proteins = []
|
151
113
|
end
|
152
114
|
|
@@ -174,13 +136,13 @@ module Bio::DTASelect
|
|
174
136
|
|
175
137
|
|
176
138
|
elsif splits[1] == 'Proteins'
|
177
|
-
# Done processing, except for the bits down the bottom which aren't parsed (yet)
|
139
|
+
# Done processing, except for the bits down the bottom which aren't parsed (yet, at least)
|
178
140
|
break
|
179
141
|
|
180
142
|
|
181
143
|
|
182
144
|
else
|
183
|
-
log.debug "New spectra now being parsed"
|
145
|
+
log.debug "New spectra now being parsed" if log.debug?
|
184
146
|
last_line_was_protein_name = false
|
185
147
|
|
186
148
|
# Record a spectra
|
@@ -204,11 +166,11 @@ module Bio::DTASelect
|
|
204
166
|
pep.parent_proteins.push current_protein
|
205
167
|
current_protein.peptides.push pep
|
206
168
|
end
|
207
|
-
log.debug "Parsed this peptide #{pep.inspect}"
|
169
|
+
log.debug "Parsed this peptide #{pep.inspect}" if log.debug?
|
208
170
|
end
|
209
171
|
end
|
210
172
|
|
211
|
-
log.debug "Proteins parsed: #{result.protein_name_to_object.inspect}"
|
173
|
+
log.debug "Proteins parsed: #{result.protein_name_to_object.inspect}" if log.debug?
|
212
174
|
return result
|
213
175
|
end
|
214
176
|
end
|
data/lib/pep_xml.rb
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
require 'rexml/document'
|
2
|
+
|
3
|
+
class Bio::PepXML
|
4
|
+
include Bio::DivvyProteomics::Logging
|
5
|
+
|
6
|
+
attr_accessor :protein_name_to_object, :peptide_name_to_object
|
7
|
+
|
8
|
+
class Protein
|
9
|
+
include Bio::DivvyProteomics::Logging
|
10
|
+
include Bio::DivvyProteomics::DivvyableProtein
|
11
|
+
|
12
|
+
# Array of peptide objects that have been assigned to this protein
|
13
|
+
attr_accessor :peptides
|
14
|
+
|
15
|
+
attr_accessor :identifier, :descriptive_name
|
16
|
+
end
|
17
|
+
|
18
|
+
# Named 'Peptide' but really mean Spectra. Just too hard to change
|
19
|
+
class Peptide
|
20
|
+
attr_accessor :parent_proteins
|
21
|
+
|
22
|
+
# Name of the spectra
|
23
|
+
attr_accessor :identifier
|
24
|
+
|
25
|
+
def initialize
|
26
|
+
@parent_proteins = []
|
27
|
+
end
|
28
|
+
|
29
|
+
#TODO: right now this just always returns 1. It should really be working out redundancy
|
30
|
+
#properly by comparison of peptide sequences, but this info isn't parsed yet
|
31
|
+
def redundancy
|
32
|
+
1
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
def self.log
|
37
|
+
Bio::PepXML.new.log
|
38
|
+
end
|
39
|
+
|
40
|
+
def self.parse(io)
|
41
|
+
protein_name_to_object = {}
|
42
|
+
peptide_name_to_object = {}
|
43
|
+
|
44
|
+
#pep.elements.each('msms_pipeline_analysis/msms_run_summary/spectrum_query/search_result/search_hit'){|e|
|
45
|
+
# c+=1; p e.attributes['protein_descr'].strip;
|
46
|
+
# e.elements.each{|e|
|
47
|
+
# p e.name, e.attributes['protein_descr'].strip};break}
|
48
|
+
xml = REXML::Document.new(io)
|
49
|
+
|
50
|
+
parse_name_and_description = lambda do |e|
|
51
|
+
name = e.attributes['protein'].strip
|
52
|
+
description = e.attributes['protein_descr'].strip
|
53
|
+
if name.nil? or name == ''
|
54
|
+
name = e.attributes['protein_descr'].strip
|
55
|
+
else
|
56
|
+
description = name+' '+description
|
57
|
+
end
|
58
|
+
name.gsub!(/\t.*/,'')
|
59
|
+
description.gsub!(/[\t\n]/,' ')
|
60
|
+
|
61
|
+
[name, description]
|
62
|
+
end
|
63
|
+
|
64
|
+
#TODO: some better sanity checking here would be ideal.
|
65
|
+
num_hits_parsed = 0
|
66
|
+
xml.elements.each('msms_pipeline_analysis/msms_run_summary/spectrum_query/search_result/search_hit') do |hit|
|
67
|
+
hit_number = hit.attributes['hit_rank']
|
68
|
+
raise "Parsing error on #{hit}" if hit_number.nil?
|
69
|
+
next if hit_number != "1"
|
70
|
+
|
71
|
+
# Parse the primary hit
|
72
|
+
name1, description1 = parse_name_and_description.call(hit)
|
73
|
+
raise "No protein name found in this xml fragment: #{hit.to_s}" if name1.nil?
|
74
|
+
spectrum_name = hit.parent.parent.attributes['spectrum'].strip
|
75
|
+
raise "Parsing error (couldn't find spectrum name) with spectra #{hit.inspect}" if spectrum_name.nil?
|
76
|
+
|
77
|
+
# It is possible to have multiple peptides both hit the spectra with hit_rank="1"
|
78
|
+
# This happens when e.g. leucine and isoleucine are possible.
|
79
|
+
spectrum = peptide_name_to_object[spectrum_name]
|
80
|
+
if spectrum.nil?
|
81
|
+
spectrum = Peptide.new
|
82
|
+
spectrum.identifier = spectrum_name
|
83
|
+
peptide_name_to_object[spectrum_name] = spectrum
|
84
|
+
end
|
85
|
+
|
86
|
+
|
87
|
+
protein1 = protein_name_to_object[name1]
|
88
|
+
if protein1.nil?
|
89
|
+
protein1 = Protein.new
|
90
|
+
protein1.identifier = name1
|
91
|
+
protein1.descriptive_name = description1
|
92
|
+
protein1.peptides = []
|
93
|
+
protein_name_to_object[name1] = protein1
|
94
|
+
end
|
95
|
+
protein1.peptides.push spectrum
|
96
|
+
spectrum.parent_proteins ||= []
|
97
|
+
spectrum.parent_proteins.push protein1
|
98
|
+
|
99
|
+
|
100
|
+
# Parse the alternate hits. Only look at children with protein_descr attributes - these are
|
101
|
+
# the alternate proteins
|
102
|
+
hit.each_element_with_attribute('protein_descr') do |e|
|
103
|
+
name, description = parse_name_and_description.call(e)
|
104
|
+
|
105
|
+
alternate = protein_name_to_object[name]
|
106
|
+
if alternate.nil?
|
107
|
+
alternate = Protein.new
|
108
|
+
alternate.identifier = name
|
109
|
+
alternate.descriptive_name = description
|
110
|
+
alternate.peptides = []
|
111
|
+
protein_name_to_object[name] = alternate
|
112
|
+
end
|
113
|
+
alternate.peptides.push spectrum
|
114
|
+
spectrum.parent_proteins.push alternate
|
115
|
+
end
|
116
|
+
|
117
|
+
# Don't count the same protein multiple times - might happen when a spectrum's hits name the same protein more than once
|
118
|
+
spectrum.parent_proteins.uniq!
|
119
|
+
|
120
|
+
num_hits_parsed += 1
|
121
|
+
end
|
122
|
+
log.info "Parsed #{num_hits_parsed} search hits"
|
123
|
+
|
124
|
+
pepxml = Bio::PepXML.new
|
125
|
+
pepxml.protein_name_to_object = protein_name_to_object
|
126
|
+
pepxml.peptide_name_to_object = peptide_name_to_object
|
127
|
+
|
128
|
+
return pepxml
|
129
|
+
end
|
130
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
<?xml version="1.0" encoding="utf-8"?>
|
2
|
+
<msms_pipeline_analysis date="2013-12-06T09:32:51.2000705-07:00" name="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT" summary_xml="" xmlns="http://regis-web.systemsbiology.net/pepXML">
|
3
|
+
<msms_run_summary base_name="D:\Proteome_Discoverer\RawFiles\FASP1\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT.msf" raw_data_type=".msf" raw_data=".msf" msManufacturer="" msModel="" msIonization="" msMassAnalyzer="" msDetector="">
|
4
|
+
<spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_03.1121.1121.2" start_scan="1121" end_scan="1121" retention_time_sec="5.4199816666666667" activation_method="CID" precursor_intensity="388495.5625" precursor_neutral_mass="1329.7252673153125" assumed_charge="2" index="221">
|
5
|
+
<search_result search_id="1">
|
6
|
+
<search_hit hit_rank="1" peptide="NLDLDSIIAEVK" protein="CNTM:cont_sp" num_tot_proteins="2" num_matched_ions="0" calc_neutral_pep_mass="1329.7252673153125" massdiff="0" protein_descr="P13647 K2C5_HUMAN Keratin, type II cytoskeletal 5 (Cytokeratin 5) (K5) (CK 5) (58 kDa cytokeratin) - Homo sapiens (Human). # pI:8.14 MW:62462" protein_mw="62.423064734660052" calc_pI="8.06005859375">
|
7
|
+
<alternative_protein protein="CNTM:cont_sp" protein_descr="P48668 K2CE_HUMAN Keratin, type II cytoskeletal 6E (Cytokeratin 6E) (CK 6E) (K6e keratin) - Homo sapiens (Human). # pI:8.14 MW:60092" protein_mw="60.05537958466001" />
|
8
|
+
<search_score name="XCorr" value="4.5027022361755371" />
|
9
|
+
</search_hit>
|
10
|
+
</search_result>
|
11
|
+
</spectrum_query>
|
12
|
+
</msms_run_summary>
|
13
|
+
</msms_pipeline_analysis>
|
@@ -0,0 +1,14 @@
|
|
1
|
+
<?xml version="1.0" encoding="utf-8"?>
|
2
|
+
<msms_pipeline_analysis date="2013-12-06T09:32:51.2000705-07:00" name="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT" summary_xml="" xmlns="http://regis-web.systemsbiology.net/pepXML">
|
3
|
+
<msms_run_summary base_name="D:\Proteome_Discoverer\RawFiles\FASP1\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT.msf" raw_data_type=".msf" raw_data=".msf" msManufacturer="" msModel="" msIonization="" msMassAnalyzer="" msDetector="">
|
4
|
+
<spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
|
5
|
+
<search_result search_id="1">
|
6
|
+
<search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr=">38SUR_2379_1524213_2	" protein_mw="43.185399974660044" calc_pI="5.63037109375">
|
7
|
+
<alternative_protein protein="" protein_descr=">38SUR_6350_1528184_1	" protein_mw="24.663561404659987" />
|
8
|
+
<alternative_protein protein="" protein_descr=">38SUR_80622_1602456_1	" protein_mw="30.364007294659981" />
|
9
|
+
<search_score name="XCorr" value="4.7916374206542969" />
|
10
|
+
</search_hit>
|
11
|
+
</search_result>
|
12
|
+
</spectrum_query>
|
13
|
+
</msms_run_summary>
|
14
|
+
</msms_pipeline_analysis>
|
@@ -0,0 +1,12 @@
|
|
1
|
+
<?xml version="1.0" encoding="utf-8"?>
|
2
|
+
<msms_pipeline_analysis date="2013-12-06T09:32:51.2000705-07:00" name="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT" summary_xml="" xmlns="http://regis-web.systemsbiology.net/pepXML">
|
3
|
+
<msms_run_summary base_name="D:\Proteome_Discoverer\RawFiles\FASP1\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT.msf" raw_data_type=".msf" raw_data=".msf" msManufacturer="" msModel="" msIonization="" msMassAnalyzer="" msDetector="">
|
4
|
+
<spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
|
5
|
+
<search_result search_id="1">
|
6
|
+
<search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr=">38SUR_2379_1524213_2	" protein_mw="43.185399974660044" calc_pI="5.63037109375">
|
7
|
+
<search_score name="XCorr" value="4.7916374206542969" />
|
8
|
+
</search_hit>
|
9
|
+
</search_result>
|
10
|
+
</spectrum_query>
|
11
|
+
</msms_run_summary>
|
12
|
+
</msms_pipeline_analysis>
|
@@ -0,0 +1,21 @@
|
|
1
|
+
<?xml version="1.0" encoding="utf-8"?>
|
2
|
+
<msms_pipeline_analysis date="2013-12-06T09:32:51.2000705-07:00" name="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT" summary_xml="" xmlns="http://regis-web.systemsbiology.net/pepXML">
|
3
|
+
<msms_run_summary base_name="D:\Proteome_Discoverer\RawFiles\FASP1\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513\Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_All_SEQUESTHT.msf" raw_data_type=".msf" raw_data=".msf" msManufacturer="" msModel="" msIonization="" msMassAnalyzer="" msDetector="">
|
4
|
+
<spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
|
5
|
+
<search_result search_id="1">
|
6
|
+
<search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr=">38SUR_2379_1524213_2	" protein_mw="43.185399974660044" calc_pI="5.63037109375">
|
7
|
+
<alternative_protein protein="" protein_descr=">38SUR_6350_1528184_1	" protein_mw="24.663561404659987" />
|
8
|
+
<alternative_protein protein="" protein_descr=">38SUR_80622_1602456_1	" protein_mw="30.364007294659981" />
|
9
|
+
<search_score name="XCorr" value="4.7916374206542969" />
|
10
|
+
</search_hit>
|
11
|
+
</search_result>
|
12
|
+
</spectrum_query>
|
13
|
+
<spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2_3" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
|
14
|
+
<search_result search_id="1">
|
15
|
+
<search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr=">38SUR_2379_1524213_2	" protein_mw="43.185399974660044" calc_pI="5.63037109375">
|
16
|
+
<search_score name="XCorr" value="4.7916374206542969" />
|
17
|
+
</search_hit>
|
18
|
+
</search_result>
|
19
|
+
</spectrum_query>
|
20
|
+
</msms_run_summary>
|
21
|
+
</msms_pipeline_analysis>
|
@@ -11,8 +11,6 @@ $:.unshift File.join(File.dirname(__FILE__),'..')
|
|
11
11
|
script_under_test = File.basename(__FILE__).gsub(/^test_/,'')
|
12
12
|
path_to_script = File.join(File.dirname(__FILE__),'..','bin','divvy_spectra')
|
13
13
|
|
14
|
-
TEST_DATA_DIR = File.join(File.dirname(__FILE__),'data')
|
15
|
-
|
16
14
|
describe script_under_test do
|
17
15
|
let(:header){"ID\tUnique spectra\tNon-unique spectra\tEstimated total spectra\tNormalised spectral count\tDescription\tProteins sharing spectra\n"}
|
18
16
|
it 'should do 1 protein hit' do
|
@@ -0,0 +1,99 @@
|
|
1
|
+
require 'systemu'
|
2
|
+
require 'pp'
|
3
|
+
require 'open3'
|
4
|
+
require 'tempfile'
|
5
|
+
|
6
|
+
require 'spec_helper'
|
7
|
+
|
8
|
+
|
9
|
+
|
10
|
+
describe 'pepxml parsing' do
|
11
|
+
let(:header){"ID\tUnique spectra\tNon-unique spectra\tEstimated total spectra\tNormalised spectral count\tDescription\tProteins sharing spectra\n"}
|
12
|
+
it 'should parse decently' do
|
13
|
+
pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'minimal.pep.xml')))
|
14
|
+
|
15
|
+
# <spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
|
16
|
+
# <search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr=">38SUR_2379_1524213_2	" protein_mw="43.185399974660044" calc_pI="5.63037109375">
|
17
|
+
# <alternative_protein protein="" protein_descr=">38SUR_6350_1528184_1	" protein_mw="24.663561404659987" />
|
18
|
+
# <alternative_protein protein="" protein_descr=">38SUR_80622_1602456_1	" protein_mw="30.364007294659981" />
|
19
|
+
# <search_score name="XCorr" value="4.7916374206542969" />
|
20
|
+
|
21
|
+
pepxml.kind_of?(Bio::PepXML).should == true
|
22
|
+
|
23
|
+
pepxml.protein_name_to_object.keys.sort.should == [
|
24
|
+
'>38SUR_2379_1524213_2',
|
25
|
+
'>38SUR_6350_1528184_1',
|
26
|
+
'>38SUR_80622_1602456_1',
|
27
|
+
].sort
|
28
|
+
pepxml.peptide_name_to_object.keys.sort.should == [
|
29
|
+
'Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2'
|
30
|
+
]
|
31
|
+
pepxml.protein_name_to_object.values.each do |prot|
|
32
|
+
prot.kind_of?(Bio::PepXML::Protein).should == true
|
33
|
+
end
|
34
|
+
pepxml.peptide_name_to_object.values.each do |prot|
|
35
|
+
prot.kind_of?(Bio::PepXML::Peptide).should == true
|
36
|
+
end
|
37
|
+
|
38
|
+
prot1 = pepxml.protein_name_to_object['>38SUR_2379_1524213_2']
|
39
|
+
prot1.identifier.should == '>38SUR_2379_1524213_2'
|
40
|
+
prot1.descriptive_name.should == '>38SUR_2379_1524213_2'
|
41
|
+
end
|
42
|
+
|
43
|
+
it 'should respond to divvy proteomics module things' do
|
44
|
+
pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'minimal.pep.xml')))
|
45
|
+
|
46
|
+
# <spectrum_query spectrum="Tara_38sfc_FASP_8hr_OrbiVelosPro_Run1_030513_02.9921.9921.2" start_scan="9921" end_scan="9921" retention_time_sec="41.732728333333334" activation_method="CID" precursor_intensity="61015.84375" precursor_neutral_mass="1246.6412829403125" assumed_charge="2" index="1">
|
47
|
+
# <search_hit hit_rank="1" peptide="IADQTIGTANSR" protein="" num_tot_proteins="3" num_matched_ions="0" calc_neutral_pep_mass="1246.6412829403125" massdiff="0" protein_descr=">38SUR_2379_1524213_2	" protein_mw="43.185399974660044" calc_pI="5.63037109375">
|
48
|
+
# <alternative_protein protein="" protein_descr=">38SUR_6350_1528184_1	" protein_mw="24.663561404659987" />
|
49
|
+
# <alternative_protein protein="" protein_descr=">38SUR_80622_1602456_1	" protein_mw="30.364007294659981" />
|
50
|
+
# <search_score name="XCorr" value="4.7916374206542969" />
|
51
|
+
|
52
|
+
pepxml.kind_of?(Bio::PepXML).should == true
|
53
|
+
|
54
|
+
prot1 = pepxml.protein_name_to_object['>38SUR_2379_1524213_2']
|
55
|
+
prot1.peptides.length.should == 1
|
56
|
+
prot1.unique_spectra.should == 0
|
57
|
+
prot1.non_unique_spectra.should == 1
|
58
|
+
prot1.estimated_spectral_count.should == 0.0
|
59
|
+
|
60
|
+
|
61
|
+
prot1 = pepxml.protein_name_to_object['>38SUR_6350_1528184_1']
|
62
|
+
prot1.peptides.length.should == 1
|
63
|
+
prot1.unique_spectra.should == 0
|
64
|
+
prot1.non_unique_spectra.should == 1
|
65
|
+
prot1.estimated_spectral_count.should == 0.0
|
66
|
+
end
|
67
|
+
|
68
|
+
it 'should respond to divvy proteomics module things with 1 unique hit' do
|
69
|
+
pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'minimal2.pep.xml')))
|
70
|
+
pepxml.kind_of?(Bio::PepXML).should == true
|
71
|
+
|
72
|
+
prot1 = pepxml.protein_name_to_object['>38SUR_2379_1524213_2']
|
73
|
+
prot1.peptides.length.should == 1
|
74
|
+
prot1.unique_spectra.should == 1
|
75
|
+
prot1.non_unique_spectra.should == 0
|
76
|
+
prot1.estimated_spectral_count.should == 1.0
|
77
|
+
end
|
78
|
+
|
79
|
+
it 'should respond to divvy proteomics module things with 2 hits, where 1 is unique' do
|
80
|
+
pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'minimal3.pep.xml')))
|
81
|
+
pepxml.kind_of?(Bio::PepXML).should == true
|
82
|
+
|
83
|
+
prot1 = pepxml.protein_name_to_object['>38SUR_2379_1524213_2']
|
84
|
+
prot1.peptides.length.should == 2
|
85
|
+
prot1.unique_spectra.should == 1
|
86
|
+
prot1.non_unique_spectra.should == 1
|
87
|
+
prot1.estimated_spectral_count.should == 2.0
|
88
|
+
end
|
89
|
+
|
90
|
+
it 'should parse when the protein and protein_desc attributes are both defined' do
|
91
|
+
pepxml = Bio::PepXML.parse(File.open(File.join(TEST_DATA_DIR, 'contaminant.pep.xml')))
|
92
|
+
pepxml.kind_of?(Bio::PepXML).should == true
|
93
|
+
|
94
|
+
prot1 = pepxml.protein_name_to_object['CNTM:cont_sp']
|
95
|
+
prot1.nil?.should == false
|
96
|
+
prot1.identifier.should == 'CNTM:cont_sp'
|
97
|
+
prot1.descriptive_name.should == 'CNTM:cont_sp P13647 K2C5_HUMAN Keratin, type II cytoskeletal 5 (Cytokeratin 5) (K5) (CK 5) (58 kDa cytokeratin) - Homo sapiens (Human). # pI:8.14 MW:62462'
|
98
|
+
end
|
99
|
+
end
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,99 +1,99 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: divvy_proteomics
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben J Woodcroft
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2014-01-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio-logger
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
19
|
+
version: '1.0'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- -
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
26
|
+
version: '1.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: systemu
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '2.6'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '2.6'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rspec
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- -
|
45
|
+
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: 2.
|
47
|
+
version: '2.14'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- -
|
52
|
+
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: 2.
|
54
|
+
version: '2.14'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: rdoc
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- -
|
59
|
+
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
61
|
version: '3.12'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- -
|
66
|
+
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '3.12'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: bundler
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
|
-
- -
|
73
|
+
- - "~>"
|
74
74
|
- !ruby/object:Gem::Version
|
75
|
-
version: 1.
|
75
|
+
version: '1.5'
|
76
76
|
type: :development
|
77
77
|
prerelease: false
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
79
79
|
requirements:
|
80
|
-
- -
|
80
|
+
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: 1.
|
82
|
+
version: '1.5'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
name: jeweler
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
|
-
- -
|
87
|
+
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version:
|
89
|
+
version: '2.0'
|
90
90
|
type: :development
|
91
91
|
prerelease: false
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
93
93
|
requirements:
|
94
|
-
- -
|
94
|
+
- - "~>"
|
95
95
|
- !ruby/object:Gem::Version
|
96
|
-
version:
|
96
|
+
version: '2.0'
|
97
97
|
description: divvy up spectra from DTASelect files in a somewhat parsimonious way
|
98
98
|
email: donttrustben@gmail.com
|
99
99
|
executables:
|
@@ -103,8 +103,8 @@ extra_rdoc_files:
|
|
103
103
|
- LICENSE.txt
|
104
104
|
- README.md
|
105
105
|
files:
|
106
|
-
- .document
|
107
|
-
- .rspec
|
106
|
+
- ".document"
|
107
|
+
- ".rspec"
|
108
108
|
- Gemfile
|
109
109
|
- LICENSE.txt
|
110
110
|
- README.md
|
@@ -113,8 +113,14 @@ files:
|
|
113
113
|
- bin/divvy_spectra
|
114
114
|
- divvy_proteomics.gemspec
|
115
115
|
- lib/divvy_proteomics.rb
|
116
|
+
- lib/divvyable_protein.rb
|
116
117
|
- lib/dta_select_output.rb
|
118
|
+
- lib/pep_xml.rb
|
119
|
+
- spec/data/contaminant.pep.xml
|
117
120
|
- spec/data/merge_definition.csv
|
121
|
+
- spec/data/minimal.pep.xml
|
122
|
+
- spec/data/minimal2.pep.xml
|
123
|
+
- spec/data/minimal3.pep.xml
|
118
124
|
- spec/data/multiply_mapped_spectra.csv
|
119
125
|
- spec/data/new_format.csv
|
120
126
|
- spec/data/new_format_some_all_shared_spectra.csv
|
@@ -124,6 +130,7 @@ files:
|
|
124
130
|
- spec/data/three_proteins_meant_for_merge.csv
|
125
131
|
- spec/data/three_proteins_with_contaminant.csv
|
126
132
|
- spec/divvy_proteomics_spec.rb
|
133
|
+
- spec/pep_xml_spec.rb
|
127
134
|
- spec/spec_helper.rb
|
128
135
|
homepage: http://github.com/wwood/divvy_proteomics
|
129
136
|
licenses:
|
@@ -135,17 +142,17 @@ require_paths:
|
|
135
142
|
- lib
|
136
143
|
required_ruby_version: !ruby/object:Gem::Requirement
|
137
144
|
requirements:
|
138
|
-
- -
|
145
|
+
- - ">="
|
139
146
|
- !ruby/object:Gem::Version
|
140
147
|
version: '0'
|
141
148
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
142
149
|
requirements:
|
143
|
-
- -
|
150
|
+
- - ">="
|
144
151
|
- !ruby/object:Gem::Version
|
145
152
|
version: '0'
|
146
153
|
requirements: []
|
147
154
|
rubyforge_project:
|
148
|
-
rubygems_version: 2.0
|
155
|
+
rubygems_version: 2.2.0
|
149
156
|
signing_key:
|
150
157
|
specification_version: 4
|
151
158
|
summary: divvy up spectra from DTASelect files in a parsimonious way
|