divvy_proteomics 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/bin/divvy_spectra +19 -6
- data/divvy_proteomics.gemspec +3 -2
- data/spec/data/new_format_some_all_shared_spectra.csv +45 -0
- data/spec/divvy_proteomics_spec.rb +23 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 05ddf71aecd113201c370185104006af9705a525
|
4
|
+
data.tar.gz: b450d8ecd60717f7286f234aed29beaf455aa4eb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a0b9e987dd54239a0da817becc9755c42d0f375e89fc16ff7c9f442830b34379de98c89f3bad0d74286567c5cbaafdc6a97656f766f2e63da02ccd113004019e
|
7
|
+
data.tar.gz: 2347260e7dba1a6bb08c91cc2183b445d534b36e1296e6d3338076ed3729073276df516dbfa83bce87e18af4e51721df744fc694eee3a60a096f27d9ecf0f666
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.2.0
|
data/bin/divvy_spectra
CHANGED
@@ -15,7 +15,7 @@ require 'dta_select_output'
|
|
15
15
|
options = {
|
16
16
|
:logger => 'stderr',
|
17
17
|
:log_level => 'info',
|
18
|
-
:
|
18
|
+
:contaminant_regexes => [/^CNTM:/],
|
19
19
|
}
|
20
20
|
o = OptionParser.new do |opts|
|
21
21
|
opts.banner = "
|
@@ -29,6 +29,10 @@ o = OptionParser.new do |opts|
|
|
29
29
|
opts.on("--whitelist FILE_OF_PROTEINS_TO_REPORT", "Only report proteins that are in this whitelist, after divvying with everything") do |file|
|
30
30
|
options[:whitelist_file] = file
|
31
31
|
end
|
32
|
+
opts.on("--contaminant-regexes REGEXES", "Comma-separated list of regular expressions to apply to protein names. If the protein name matches then all spectra assigned to that protein are considered contaminants. [default: #{options[:contaminant_prefixes]}]") do |str|
|
33
|
+
options[:contaminant_regexes] = str.split(/,/).collect{|s| /#{s}/}
|
34
|
+
end
|
35
|
+
|
32
36
|
|
33
37
|
# logger options
|
34
38
|
opts.separator "\nVerbosity:\n\n"
|
@@ -42,8 +46,7 @@ if ARGV.length > 1
|
|
42
46
|
end
|
43
47
|
# Setup logging
|
44
48
|
Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
|
45
|
-
|
46
|
-
|
49
|
+
#$stderr.puts options[:contaminant_prefixes].inspect
|
47
50
|
|
48
51
|
# Read in merges, if required
|
49
52
|
mergers = {}
|
@@ -130,12 +133,22 @@ mergers.each do |secondary_id, primary_id|
|
|
130
133
|
end #The other two cases do not require any intervention,
|
131
134
|
end
|
132
135
|
|
136
|
+
id_is_contaminating = lambda do |protein_id|
|
137
|
+
selected = false
|
138
|
+
options[:contaminant_regexes].each do |regex|
|
139
|
+
if protein_id.match(regex)
|
140
|
+
selected = true
|
141
|
+
end
|
142
|
+
end
|
143
|
+
selected
|
144
|
+
end
|
145
|
+
|
133
146
|
|
134
147
|
# Total spectra shouldn't count contaminants, but shared spectra should still be divvied up with
|
135
148
|
# Annoying thing here is when contaminating proteins share spectra
|
136
149
|
total_contaminating_peptides = hits.collect do |ident, peptide|
|
137
150
|
num_contaminating_parents = peptide.parent_proteins.select do |prot|
|
138
|
-
prot.identifier
|
151
|
+
id_is_contaminating.call prot.identifier
|
139
152
|
end.length
|
140
153
|
|
141
154
|
if num_contaminating_parents > 0
|
@@ -145,7 +158,7 @@ total_contaminating_peptides = hits.collect do |ident, peptide|
|
|
145
158
|
end
|
146
159
|
end
|
147
160
|
total_contaminating_spectra = total_contaminating_peptides.reduce :+
|
148
|
-
total_contaminating_spectra ||=
|
161
|
+
total_contaminating_spectra ||= 0
|
149
162
|
log.info "Found #{total_contaminating_spectra} contaminating spectral counts"
|
150
163
|
|
151
164
|
total_spectra = hits.collect{|ident, pep| pep.redundancy}.reduce(:+) - total_contaminating_spectra
|
@@ -177,7 +190,7 @@ puts [
|
|
177
190
|
].join "\t"
|
178
191
|
log.warn "No unique spectra found!" if total_spectra == 0
|
179
192
|
proteins.each do |protein_id, protein|
|
180
|
-
next if
|
193
|
+
next if id_is_contaminating.call(protein_id) #Don't print contaminants
|
181
194
|
|
182
195
|
if options[:whitelist_file].nil? or whitelist.include?(protein_id) # If there's a whitelist, apply it now
|
183
196
|
log.debug "Now printing protein #{protein_id}, which has #{protein.peptides.length} associated peptides"
|
data/divvy_proteomics.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "divvy_proteomics"
|
8
|
-
s.version = "0.1.0"
|
8
|
+
s.version = "0.2.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Ben J Woodcroft"]
|
12
|
-
s.date = "2013-11-
|
12
|
+
s.date = "2013-11-06"
|
13
13
|
s.description = "divvy up spectra from DTASelect files in a somewhat parsimonious way"
|
14
14
|
s.email = "donttrustben@gmail.com"
|
15
15
|
s.executables = ["divvy_spectra"]
|
@@ -32,6 +32,7 @@ Gem::Specification.new do |s|
|
|
32
32
|
"spec/data/merge_definition.csv",
|
33
33
|
"spec/data/multiply_mapped_spectra.csv",
|
34
34
|
"spec/data/new_format.csv",
|
35
|
+
"spec/data/new_format_some_all_shared_spectra.csv",
|
35
36
|
"spec/data/single_protein.csv",
|
36
37
|
"spec/data/single_protein_with_aliases.csv",
|
37
38
|
"spec/data/three_proteins.csv",
|
@@ -0,0 +1,45 @@
|
|
1
|
+
DTASelect v1.9
|
2
|
+
/auto/gtl/ms/Abisko_Soils/Field_Sampling_Aug_2010/SurfDeep/Erio_Deep_Aug2010_27Jun_TR1/Frac1/analysis/tryp_edeep_082010_500bp_Wheat/sequest
|
3
|
+
/auto/gtl/db/Abisko_Soils/edeep_082010_500bp_Wheat_cntm_psm
|
4
|
+
SEQUEST v.27 in SQT format.
|
5
|
+
--DB -p 2 -r 1000
|
6
|
+
true Use criteria
|
7
|
+
1.8 Minimum +1 XCorr
|
8
|
+
2.5 Minimum +2 XCorr
|
9
|
+
3.5 Minimum +3 XCorr
|
10
|
+
0.08 Minimum DeltCN
|
11
|
+
1 Minimum charge state
|
12
|
+
3 Maximum charge state
|
13
|
+
0.0 Minimum ion proportion
|
14
|
+
1000 Maximum Sp rank
|
15
|
+
-1.0 Minimum Sp score
|
16
|
+
Include Modified peptide inclusion
|
17
|
+
Any Tryptic status requirement
|
18
|
+
true Multiple, ambiguous IDs allowed
|
19
|
+
Ignore Peptide validation handling
|
20
|
+
XCorr Purge duplicate peptides by protein
|
21
|
+
false Include only loci with unique peptide
|
22
|
+
false Remove subset proteins
|
23
|
+
Ignore Locus validation handling
|
24
|
+
0 Minimum modified peptides per locus
|
25
|
+
1000 Minimum redundancy for low coverage loci
|
26
|
+
2 Minimum peptides per locus
|
27
|
+
|
28
|
+
Locus Sequence Count Spectrum Count Sequence Coverage Length MolWt pI Validation Status Descriptive Name
|
29
|
+
Unique FileName XCorr DeltCN Obs_mono_m/z Calc_mono_m/z PPM Delta_amu TotalIntensity SpRank SpScore IonProportion Redundancy Sequence
|
30
|
+
E1D_raw_1__154435_1 3 41 79.3% 58 6132 7.3 U # 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp
|
31
|
+
aliaese 3 41 79.3% 58 6132 7.3 U # 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp
|
32
|
+
20120806_Erio_Deep_Aug2010_27Jun_TR1_04.22361.22361.3 5.8526 0.4034 2772.4479 2772.4445 1.2211 0.0034 6048.2 1 1349.0 40.0 30 -.MLSIQTNIAALSAQNALTTTNNNLQK.S
|
33
|
+
20120806_Erio_Deep_Aug2010_27Jun_TR1_10.22784.22784.3 4.5965 0.4101 3275.6674 3275.6608 2.0124 0.0066 5944.2 1 884.7 28.4 7 -.MLSIQTNIAALSAQNALTTTNNNLQKSMER.L
|
34
|
+
20120806_Erio_Deep_Aug2010_27Jun_TR1_03.06010.06010.2 4.5641 0.462 1594.7954 1594.7956 -0.1435 -0.0002 4981.9 1 1347.9 76.7 4 R.INHAADDAAGLAISEK.M
|
35
|
+
E1D_raw_1__40591_2 3 8 74.5% 51 5250 8.6 U # 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None
|
36
|
+
* 20120806_Erio_Deep_Aug2010_27Jun_TR1_01.18475.18475.2 4.3739 0.5173 2140.0658 2140.0653 0.2192 0.0005 7636.1 1 1642.9 65.0 1 K.TSDVAGDGTTTATILAQSIYR.E
|
37
|
+
* 20120806_Erio_Deep_Aug2010_27Jun_TR1_02.21883.21883.2 3.4843 0.1996 2553.2977 2553.2928 1.9293 0.0049 6903.6 1 903.4 47.9 3 K.TSDVAGDGTTTATILAQSIYREGVK.A
|
38
|
+
* 20120806_Erio_Deep_Aug2010_27Jun_TR1_08.06194.06194.2 2.7604 0.1784 1326.7055 1326.7083 -2.1145 -0.0028 6041.5 1 860.0 66.7 4 K.AVAAGANPMELKR.G
|
39
|
+
Proteins Peptide IDs Copies
|
40
|
+
Unfiltered 318515 400116 506301
|
41
|
+
Redundant 1575 3555 18759
|
42
|
+
Nonredundant 1211 2557 12384
|
43
|
+
|
44
|
+
Classification Nonredundant Proteins Redundant Proteins
|
45
|
+
Unclassified 0 0
|
@@ -88,6 +88,17 @@ describe script_under_test do
|
|
88
88
|
stdout.should eq(answer), test_file
|
89
89
|
end
|
90
90
|
|
91
|
+
it 'should handle arbitrary contaminant prefixes' do
|
92
|
+
test_file = "#{path_to_script} #{TEST_DATA_DIR}/single_protein_with_aliases.csv --trace error --contaminant-regexes alias"
|
93
|
+
status, stdout, stderr = systemu test_file
|
94
|
+
|
95
|
+
stderr.should eq("")
|
96
|
+
answer = header+
|
97
|
+
['Mstor_v4.3.2:1344','0','188','0','NaN','Methanoflorens_stordalmirensis_v4.3.2_01361 Methyl-coenzyme M reductase I subunit gamma ','alias1'+"\n"].join("\t")
|
98
|
+
#['alias1','0','188','0','0.0','alias1 Methyl-coenzyme M reductase I subunit gamma ','Mstor_v4.3.2:1344'+"\n"].join("\t")
|
99
|
+
stdout.should eq(answer), test_file
|
100
|
+
end
|
101
|
+
|
91
102
|
it 'should do a whitelist correctly' do
|
92
103
|
Tempfile.open('test_divvy_spectra') do |tempfile|
|
93
104
|
%w(eDeep20120820:eD1_8237_2 eDeep20120820:eD1_1639_1).each {|i| tempfile.puts i}
|
@@ -115,4 +126,16 @@ describe script_under_test do
|
|
115
126
|
['E1D_raw_1__40591_2','8','0','8.0','0.12121212121212122','# 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None ',"\n"].join("\t")
|
116
127
|
stdout.should eq(answer)
|
117
128
|
end
|
129
|
+
|
130
|
+
it 'should include spectra shared between unresolvable proteins in calculations' do
|
131
|
+
test_file = "#{path_to_script} #{TEST_DATA_DIR}/new_format_some_all_shared_spectra.csv --trace error"
|
132
|
+
status, stdout, stderr = systemu test_file
|
133
|
+
|
134
|
+
stderr.should eq("")
|
135
|
+
answer = header+
|
136
|
+
['E1D_raw_1__154435_1','0','41','0','0.0','# 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp ','aliaese'+"\n"].join("\t")+
|
137
|
+
['aliaese','0','41','0','0.0','# 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp ','E1D_raw_1__154435_1'+"\n"].join("\t")+
|
138
|
+
['E1D_raw_1__40591_2','8','0','8.0','0.16326530612244897','# 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None ',''+"\n"].join("\t")
|
139
|
+
stdout.should eq(answer)
|
140
|
+
end
|
118
141
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: divvy_proteomics
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben J Woodcroft
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-11-
|
11
|
+
date: 2013-11-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio-logger
|
@@ -117,6 +117,7 @@ files:
|
|
117
117
|
- spec/data/merge_definition.csv
|
118
118
|
- spec/data/multiply_mapped_spectra.csv
|
119
119
|
- spec/data/new_format.csv
|
120
|
+
- spec/data/new_format_some_all_shared_spectra.csv
|
120
121
|
- spec/data/single_protein.csv
|
121
122
|
- spec/data/single_protein_with_aliases.csv
|
122
123
|
- spec/data/three_proteins.csv
|