divvy_proteomics 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/bin/divvy_spectra +19 -6
- data/divvy_proteomics.gemspec +3 -2
- data/spec/data/new_format_some_all_shared_spectra.csv +45 -0
- data/spec/divvy_proteomics_spec.rb +23 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 05ddf71aecd113201c370185104006af9705a525
|
4
|
+
data.tar.gz: b450d8ecd60717f7286f234aed29beaf455aa4eb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a0b9e987dd54239a0da817becc9755c42d0f375e89fc16ff7c9f442830b34379de98c89f3bad0d74286567c5cbaafdc6a97656f766f2e63da02ccd113004019e
|
7
|
+
data.tar.gz: 2347260e7dba1a6bb08c91cc2183b445d534b36e1296e6d3338076ed3729073276df516dbfa83bce87e18af4e51721df744fc694eee3a60a096f27d9ecf0f666
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.2.0
|
data/bin/divvy_spectra
CHANGED
@@ -15,7 +15,7 @@ require 'dta_select_output'
|
|
15
15
|
options = {
|
16
16
|
:logger => 'stderr',
|
17
17
|
:log_level => 'info',
|
18
|
-
:
|
18
|
+
:contaminant_regexes => [/^CNTM:/],
|
19
19
|
}
|
20
20
|
o = OptionParser.new do |opts|
|
21
21
|
opts.banner = "
|
@@ -29,6 +29,10 @@ o = OptionParser.new do |opts|
|
|
29
29
|
opts.on("--whitelist FILE_OF_PROTEINS_TO_REPORT", "Only report proteins that are in this whitelist, after divvying with everything") do |file|
|
30
30
|
options[:whitelist_file] = file
|
31
31
|
end
|
32
|
+
opts.on("--contaminant-regexes REGEXES", "Comma-separated list of regular expressions to apply to protein names. If the protein name matches then all spectra assigned to that protein are considered contaminants. [default: #{options[:contaminant_prefixes]}]") do |str|
|
33
|
+
options[:contaminant_regexes] = str.split(/,/).collect{|s| /#{s}/}
|
34
|
+
end
|
35
|
+
|
32
36
|
|
33
37
|
# logger options
|
34
38
|
opts.separator "\nVerbosity:\n\n"
|
@@ -42,8 +46,7 @@ if ARGV.length > 1
|
|
42
46
|
end
|
43
47
|
# Setup logging
|
44
48
|
Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
|
45
|
-
|
46
|
-
|
49
|
+
#$stderr.puts options[:contaminant_prefixes].inspect
|
47
50
|
|
48
51
|
# Read in merges, if required
|
49
52
|
mergers = {}
|
@@ -130,12 +133,22 @@ mergers.each do |secondary_id, primary_id|
|
|
130
133
|
end #The other two cases do not require any intervention,
|
131
134
|
end
|
132
135
|
|
136
|
+
id_is_contaminating = lambda do |protein_id|
|
137
|
+
selected = false
|
138
|
+
options[:contaminant_regexes].each do |regex|
|
139
|
+
if protein_id.match(regex)
|
140
|
+
selected = true
|
141
|
+
end
|
142
|
+
end
|
143
|
+
selected
|
144
|
+
end
|
145
|
+
|
133
146
|
|
134
147
|
# Total spectra shouldn't count contaminants, but shared spectra should still be divvied up with
|
135
148
|
# Annoying thing here is when contaminating proteins share spectra
|
136
149
|
total_contaminating_peptides = hits.collect do |ident, peptide|
|
137
150
|
num_contaminating_parents = peptide.parent_proteins.select do |prot|
|
138
|
-
prot.identifier
|
151
|
+
id_is_contaminating.call prot.identifier
|
139
152
|
end.length
|
140
153
|
|
141
154
|
if num_contaminating_parents > 0
|
@@ -145,7 +158,7 @@ total_contaminating_peptides = hits.collect do |ident, peptide|
|
|
145
158
|
end
|
146
159
|
end
|
147
160
|
total_contaminating_spectra = total_contaminating_peptides.reduce :+
|
148
|
-
total_contaminating_spectra ||=
|
161
|
+
total_contaminating_spectra ||= 0
|
149
162
|
log.info "Found #{total_contaminating_spectra} contaminating spectral counts"
|
150
163
|
|
151
164
|
total_spectra = hits.collect{|ident, pep| pep.redundancy}.reduce(:+) - total_contaminating_spectra
|
@@ -177,7 +190,7 @@ puts [
|
|
177
190
|
].join "\t"
|
178
191
|
log.warn "No unique spectra found!" if total_spectra == 0
|
179
192
|
proteins.each do |protein_id, protein|
|
180
|
-
next if
|
193
|
+
next if id_is_contaminating.call(protein_id) #Don't print contaminants
|
181
194
|
|
182
195
|
if options[:whitelist_file].nil? or whitelist.include?(protein_id) # If there's a whitelist, apply it now
|
183
196
|
log.debug "Now printing protein #{protein_id}, which has #{protein.peptides.length} associated peptides"
|
data/divvy_proteomics.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "divvy_proteomics"
|
8
|
-
s.version = "0.1.0"
|
8
|
+
s.version = "0.2.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Ben J Woodcroft"]
|
12
|
-
s.date = "2013-11-
|
12
|
+
s.date = "2013-11-06"
|
13
13
|
s.description = "divvy up spectra from DTASelect files in a somewhat parsimonious way"
|
14
14
|
s.email = "donttrustben@gmail.com"
|
15
15
|
s.executables = ["divvy_spectra"]
|
@@ -32,6 +32,7 @@ Gem::Specification.new do |s|
|
|
32
32
|
"spec/data/merge_definition.csv",
|
33
33
|
"spec/data/multiply_mapped_spectra.csv",
|
34
34
|
"spec/data/new_format.csv",
|
35
|
+
"spec/data/new_format_some_all_shared_spectra.csv",
|
35
36
|
"spec/data/single_protein.csv",
|
36
37
|
"spec/data/single_protein_with_aliases.csv",
|
37
38
|
"spec/data/three_proteins.csv",
|
data/spec/data/new_format_some_all_shared_spectra.csv
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
DTASelect v1.9
|
2
|
+
/auto/gtl/ms/Abisko_Soils/Field_Sampling_Aug_2010/SurfDeep/Erio_Deep_Aug2010_27Jun_TR1/Frac1/analysis/tryp_edeep_082010_500bp_Wheat/sequest
|
3
|
+
/auto/gtl/db/Abisko_Soils/edeep_082010_500bp_Wheat_cntm_psm
|
4
|
+
SEQUEST v.27 in SQT format.
|
5
|
+
--DB -p 2 -r 1000
|
6
|
+
true Use criteria
|
7
|
+
1.8 Minimum +1 XCorr
|
8
|
+
2.5 Minimum +2 XCorr
|
9
|
+
3.5 Minimum +3 XCorr
|
10
|
+
0.08 Minimum DeltCN
|
11
|
+
1 Minimum charge state
|
12
|
+
3 Maximum charge state
|
13
|
+
0.0 Minimum ion proportion
|
14
|
+
1000 Maximum Sp rank
|
15
|
+
-1.0 Minimum Sp score
|
16
|
+
Include Modified peptide inclusion
|
17
|
+
Any Tryptic status requirement
|
18
|
+
true Multiple, ambiguous IDs allowed
|
19
|
+
Ignore Peptide validation handling
|
20
|
+
XCorr Purge duplicate peptides by protein
|
21
|
+
false Include only loci with unique peptide
|
22
|
+
false Remove subset proteins
|
23
|
+
Ignore Locus validation handling
|
24
|
+
0 Minimum modified peptides per locus
|
25
|
+
1000 Minimum redundancy for low coverage loci
|
26
|
+
2 Minimum peptides per locus
|
27
|
+
|
28
|
+
Locus Sequence Count Spectrum Count Sequence Coverage Length MolWt pI Validation Status Descriptive Name
|
29
|
+
Unique FileName XCorr DeltCN Obs_mono_m/z Calc_mono_m/z PPM Delta_amu TotalIntensity SpRank SpScore IonProportion Redundancy Sequence
|
30
|
+
E1D_raw_1__154435_1 3 41 79.3% 58 6132 7.3 U # 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp
|
31
|
+
aliaese 3 41 79.3% 58 6132 7.3 U # 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp
|
32
|
+
20120806_Erio_Deep_Aug2010_27Jun_TR1_04.22361.22361.3 5.8526 0.4034 2772.4479 2772.4445 1.2211 0.0034 6048.2 1 1349.0 40.0 30 -.MLSIQTNIAALSAQNALTTTNNNLQK.S
|
33
|
+
20120806_Erio_Deep_Aug2010_27Jun_TR1_10.22784.22784.3 4.5965 0.4101 3275.6674 3275.6608 2.0124 0.0066 5944.2 1 884.7 28.4 7 -.MLSIQTNIAALSAQNALTTTNNNLQKSMER.L
|
34
|
+
20120806_Erio_Deep_Aug2010_27Jun_TR1_03.06010.06010.2 4.5641 0.462 1594.7954 1594.7956 -0.1435 -0.0002 4981.9 1 1347.9 76.7 4 R.INHAADDAAGLAISEK.M
|
35
|
+
E1D_raw_1__40591_2 3 8 74.5% 51 5250 8.6 U # 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None
|
36
|
+
* 20120806_Erio_Deep_Aug2010_27Jun_TR1_01.18475.18475.2 4.3739 0.5173 2140.0658 2140.0653 0.2192 0.0005 7636.1 1 1642.9 65.0 1 K.TSDVAGDGTTTATILAQSIYR.E
|
37
|
+
* 20120806_Erio_Deep_Aug2010_27Jun_TR1_02.21883.21883.2 3.4843 0.1996 2553.2977 2553.2928 1.9293 0.0049 6903.6 1 903.4 47.9 3 K.TSDVAGDGTTTATILAQSIYREGVK.A
|
38
|
+
* 20120806_Erio_Deep_Aug2010_27Jun_TR1_08.06194.06194.2 2.7604 0.1784 1326.7055 1326.7083 -2.1145 -0.0028 6041.5 1 860.0 66.7 4 K.AVAAGANPMELKR.G
|
39
|
+
Proteins Peptide IDs Copies
|
40
|
+
Unfiltered 318515 400116 506301
|
41
|
+
Redundant 1575 3555 18759
|
42
|
+
Nonredundant 1211 2557 12384
|
43
|
+
|
44
|
+
Classification Nonredundant Proteins Redundant Proteins
|
45
|
+
Unclassified 0 0
|
data/spec/divvy_proteomics_spec.rb
CHANGED
@@ -88,6 +88,17 @@ describe script_under_test do
|
|
88
88
|
stdout.should eq(answer), test_file
|
89
89
|
end
|
90
90
|
|
91
|
+
it 'should handle arbitrary contaminant prefixes' do
|
92
|
+
test_file = "#{path_to_script} #{TEST_DATA_DIR}/single_protein_with_aliases.csv --trace error --contaminant-regexes alias"
|
93
|
+
status, stdout, stderr = systemu test_file
|
94
|
+
|
95
|
+
stderr.should eq("")
|
96
|
+
answer = header+
|
97
|
+
['Mstor_v4.3.2:1344','0','188','0','NaN','Methanoflorens_stordalmirensis_v4.3.2_01361 Methyl-coenzyme M reductase I subunit gamma ','alias1'+"\n"].join("\t")
|
98
|
+
#['alias1','0','188','0','0.0','alias1 Methyl-coenzyme M reductase I subunit gamma ','Mstor_v4.3.2:1344'+"\n"].join("\t")
|
99
|
+
stdout.should eq(answer), test_file
|
100
|
+
end
|
101
|
+
|
91
102
|
it 'should do a whitelist correctly' do
|
92
103
|
Tempfile.open('test_divvy_spectra') do |tempfile|
|
93
104
|
%w(eDeep20120820:eD1_8237_2 eDeep20120820:eD1_1639_1).each {|i| tempfile.puts i}
|
@@ -115,4 +126,16 @@ describe script_under_test do
|
|
115
126
|
['E1D_raw_1__40591_2','8','0','8.0','0.12121212121212122','# 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None ',"\n"].join("\t")
|
116
127
|
stdout.should eq(answer)
|
117
128
|
end
|
129
|
+
|
130
|
+
it 'should include spectra shared between unresolvable proteins in calculations' do
|
131
|
+
test_file = "#{path_to_script} #{TEST_DATA_DIR}/new_format_some_all_shared_spectra.csv --trace error"
|
132
|
+
status, stdout, stderr = systemu test_file
|
133
|
+
|
134
|
+
stderr.should eq("")
|
135
|
+
answer = header+
|
136
|
+
['E1D_raw_1__154435_1','0','41','0','0.0','# 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp ','aliaese'+"\n"].join("\t")+
|
137
|
+
['aliaese','0','41','0','0.0','# 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp ','E1D_raw_1__154435_1'+"\n"].join("\t")+
|
138
|
+
['E1D_raw_1__40591_2','8','0','8.0','0.16326530612244897','# 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None ',''+"\n"].join("\t")
|
139
|
+
stdout.should eq(answer)
|
140
|
+
end
|
118
141
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: divvy_proteomics
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben J Woodcroft
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-11-
|
11
|
+
date: 2013-11-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio-logger
|
@@ -117,6 +117,7 @@ files:
|
|
117
117
|
- spec/data/merge_definition.csv
|
118
118
|
- spec/data/multiply_mapped_spectra.csv
|
119
119
|
- spec/data/new_format.csv
|
120
|
+
- spec/data/new_format_some_all_shared_spectra.csv
|
120
121
|
- spec/data/single_protein.csv
|
121
122
|
- spec/data/single_protein_with_aliases.csv
|
122
123
|
- spec/data/three_proteins.csv
|