divvy_proteomics 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f930bb8ef783e793512f0d50bd579055ba475476
4
- data.tar.gz: a445f2ef2ea22f7453b4e5b08fb66775797ea9bc
3
+ metadata.gz: 05ddf71aecd113201c370185104006af9705a525
4
+ data.tar.gz: b450d8ecd60717f7286f234aed29beaf455aa4eb
5
5
  SHA512:
6
- metadata.gz: 9020c67860e40394138a02cfcf53a85665cc731292650f1c71b46990d3800de2fe9c2643597a277fe09ca17eede493ec7268496e529bb1a625efdd5f0347c7a9
7
- data.tar.gz: 462c1c66db781653937b156023dc404804d8002f661872dab7b74c8a79d8576cf2110262049e0d6964cfa8162c0bb59d503a6a89841bf4d783eac887dfb14190
6
+ metadata.gz: a0b9e987dd54239a0da817becc9755c42d0f375e89fc16ff7c9f442830b34379de98c89f3bad0d74286567c5cbaafdc6a97656f766f2e63da02ccd113004019e
7
+ data.tar.gz: 2347260e7dba1a6bb08c91cc2183b445d534b36e1296e6d3338076ed3729073276df516dbfa83bce87e18af4e51721df744fc694eee3a60a096f27d9ecf0f666
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.2.0
@@ -15,7 +15,7 @@ require 'dta_select_output'
15
15
  options = {
16
16
  :logger => 'stderr',
17
17
  :log_level => 'info',
18
- :contaminant_prefix => /^CNTM:/,
18
+ :contaminant_regexes => [/^CNTM:/],
19
19
  }
20
20
  o = OptionParser.new do |opts|
21
21
  opts.banner = "
@@ -29,6 +29,10 @@ o = OptionParser.new do |opts|
29
29
  opts.on("--whitelist FILE_OF_PROTEINS_TO_REPORT", "Only report proteins that are in this whitelist, after divvying with everything") do |file|
30
30
  options[:whitelist_file] = file
31
31
  end
32
+ opts.on("--contaminant-regexes REGEXES", "Comma-separated list of regular expressions to apply to protein names. If the protein name matches then all spectra assigned to that protein are considered contaminants. [default: #{options[:contaminant_regexes]}]") do |str|
33
+ options[:contaminant_regexes] = str.split(/,/).collect{|s| /#{s}/}
34
+ end
35
+
32
36
 
33
37
  # logger options
34
38
  opts.separator "\nVerbosity:\n\n"
@@ -42,8 +46,7 @@ if ARGV.length > 1
42
46
  end
43
47
  # Setup logging
44
48
  Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
45
-
46
-
49
+ #$stderr.puts options[:contaminant_regexes].inspect
47
50
 
48
51
  # Read in merges, if required
49
52
  mergers = {}
@@ -130,12 +133,22 @@ mergers.each do |secondary_id, primary_id|
130
133
  end #The other two cases do not require any intervention,
131
134
  end
132
135
 
136
+ id_is_contaminating = lambda do |protein_id|
137
+ selected = false
138
+ options[:contaminant_regexes].each do |regex|
139
+ if protein_id.match(regex)
140
+ selected = true
141
+ end
142
+ end
143
+ selected
144
+ end
145
+
133
146
 
134
147
  # Total spectra shouldn't count contaminants, but shared spectra should still be divvied up with
135
148
  # Annoying thing here is when contaminating proteins share spectra
136
149
  total_contaminating_peptides = hits.collect do |ident, peptide|
137
150
  num_contaminating_parents = peptide.parent_proteins.select do |prot|
138
- prot.identifier.match(options[:contaminant_prefix])
151
+ id_is_contaminating.call prot.identifier
139
152
  end.length
140
153
 
141
154
  if num_contaminating_parents > 0
@@ -145,7 +158,7 @@ total_contaminating_peptides = hits.collect do |ident, peptide|
145
158
  end
146
159
  end
147
160
  total_contaminating_spectra = total_contaminating_peptides.reduce :+
148
- total_contaminating_spectra ||= []
161
+ total_contaminating_spectra ||= 0
149
162
  log.info "Found #{total_contaminating_spectra} contaminating spectral counts"
150
163
 
151
164
  total_spectra = hits.collect{|ident, pep| pep.redundancy}.reduce(:+) - total_contaminating_spectra
@@ -177,7 +190,7 @@ puts [
177
190
  ].join "\t"
178
191
  log.warn "No unique spectra found!" if total_spectra == 0
179
192
  proteins.each do |protein_id, protein|
180
- next if protein_id.match(options[:contaminant_prefix]) #Don't print contaminants
193
+ next if id_is_contaminating.call(protein_id) #Don't print contaminants
181
194
 
182
195
  if options[:whitelist_file].nil? or whitelist.include?(protein_id) # If there's a whitelist, apply it now
183
196
  log.debug "Now printing protein #{protein_id}, which has #{protein.peptides.length} associated peptides"
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "divvy_proteomics"
8
- s.version = "0.1.0"
8
+ s.version = "0.2.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Ben J Woodcroft"]
12
- s.date = "2013-11-05"
12
+ s.date = "2013-11-06"
13
13
  s.description = "divvy up spectra from DTASelect files in a somewhat parsimonious way"
14
14
  s.email = "donttrustben@gmail.com"
15
15
  s.executables = ["divvy_spectra"]
@@ -32,6 +32,7 @@ Gem::Specification.new do |s|
32
32
  "spec/data/merge_definition.csv",
33
33
  "spec/data/multiply_mapped_spectra.csv",
34
34
  "spec/data/new_format.csv",
35
+ "spec/data/new_format_some_all_shared_spectra.csv",
35
36
  "spec/data/single_protein.csv",
36
37
  "spec/data/single_protein_with_aliases.csv",
37
38
  "spec/data/three_proteins.csv",
@@ -0,0 +1,45 @@
1
+ DTASelect v1.9
2
+ /auto/gtl/ms/Abisko_Soils/Field_Sampling_Aug_2010/SurfDeep/Erio_Deep_Aug2010_27Jun_TR1/Frac1/analysis/tryp_edeep_082010_500bp_Wheat/sequest
3
+ /auto/gtl/db/Abisko_Soils/edeep_082010_500bp_Wheat_cntm_psm
4
+ SEQUEST v.27 in SQT format.
5
+ --DB -p 2 -r 1000
6
+ true Use criteria
7
+ 1.8 Minimum +1 XCorr
8
+ 2.5 Minimum +2 XCorr
9
+ 3.5 Minimum +3 XCorr
10
+ 0.08 Minimum DeltCN
11
+ 1 Minimum charge state
12
+ 3 Maximum charge state
13
+ 0.0 Minimum ion proportion
14
+ 1000 Maximum Sp rank
15
+ -1.0 Minimum Sp score
16
+ Include Modified peptide inclusion
17
+ Any Tryptic status requirement
18
+ true Multiple, ambiguous IDs allowed
19
+ Ignore Peptide validation handling
20
+ XCorr Purge duplicate peptides by protein
21
+ false Include only loci with unique peptide
22
+ false Remove subset proteins
23
+ Ignore Locus validation handling
24
+ 0 Minimum modified peptides per locus
25
+ 1000 Minimum redundancy for low coverage loci
26
+ 2 Minimum peptides per locus
27
+
28
+ Locus Sequence Count Spectrum Count Sequence Coverage Length MolWt pI Validation Status Descriptive Name
29
+ Unique FileName XCorr DeltCN Obs_mono_m/z Calc_mono_m/z PPM Delta_amu TotalIntensity SpRank SpScore IonProportion Redundancy Sequence
30
+ E1D_raw_1__154435_1 3 41 79.3% 58 6132 7.3 U # 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp
31
+ aliaese 3 41 79.3% 58 6132 7.3 U # 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp
32
+ 20120806_Erio_Deep_Aug2010_27Jun_TR1_04.22361.22361.3 5.8526 0.4034 2772.4479 2772.4445 1.2211 0.0034 6048.2 1 1349.0 40.0 30 -.MLSIQTNIAALSAQNALTTTNNNLQK.S
33
+ 20120806_Erio_Deep_Aug2010_27Jun_TR1_10.22784.22784.3 4.5965 0.4101 3275.6674 3275.6608 2.0124 0.0066 5944.2 1 884.7 28.4 7 -.MLSIQTNIAALSAQNALTTTNNNLQKSMER.L
34
+ 20120806_Erio_Deep_Aug2010_27Jun_TR1_03.06010.06010.2 4.5641 0.462 1594.7954 1594.7956 -0.1435 -0.0002 4981.9 1 1347.9 76.7 4 R.INHAADDAAGLAISEK.M
35
+ E1D_raw_1__40591_2 3 8 74.5% 51 5250 8.6 U # 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None
36
+ * 20120806_Erio_Deep_Aug2010_27Jun_TR1_01.18475.18475.2 4.3739 0.5173 2140.0658 2140.0653 0.2192 0.0005 7636.1 1 1642.9 65.0 1 K.TSDVAGDGTTTATILAQSIYR.E
37
+ * 20120806_Erio_Deep_Aug2010_27Jun_TR1_02.21883.21883.2 3.4843 0.1996 2553.2977 2553.2928 1.9293 0.0049 6903.6 1 903.4 47.9 3 K.TSDVAGDGTTTATILAQSIYREGVK.A
38
+ * 20120806_Erio_Deep_Aug2010_27Jun_TR1_08.06194.06194.2 2.7604 0.1784 1326.7055 1326.7083 -2.1145 -0.0028 6041.5 1 860.0 66.7 4 K.AVAAGANPMELKR.G
39
+ Proteins Peptide IDs Copies
40
+ Unfiltered 318515 400116 506301
41
+ Redundant 1575 3555 18759
42
+ Nonredundant 1211 2557 12384
43
+
44
+ Classification Nonredundant Proteins Redundant Proteins
45
+ Unclassified 0 0
@@ -88,6 +88,17 @@ describe script_under_test do
88
88
  stdout.should eq(answer), test_file
89
89
  end
90
90
 
91
+ it 'should handle arbitrary contaminant prefixes' do
92
+ test_file = "#{path_to_script} #{TEST_DATA_DIR}/single_protein_with_aliases.csv --trace error --contaminant-regexes alias"
93
+ status, stdout, stderr = systemu test_file
94
+
95
+ stderr.should eq("")
96
+ answer = header+
97
+ ['Mstor_v4.3.2:1344','0','188','0','NaN','Methanoflorens_stordalmirensis_v4.3.2_01361 Methyl-coenzyme M reductase I subunit gamma ','alias1'+"\n"].join("\t")
98
+ #['alias1','0','188','0','0.0','alias1 Methyl-coenzyme M reductase I subunit gamma ','Mstor_v4.3.2:1344'+"\n"].join("\t")
99
+ stdout.should eq(answer), test_file
100
+ end
101
+
91
102
  it 'should do a whitelist correctly' do
92
103
  Tempfile.open('test_divvy_spectra') do |tempfile|
93
104
  %w(eDeep20120820:eD1_8237_2 eDeep20120820:eD1_1639_1).each {|i| tempfile.puts i}
@@ -115,4 +126,16 @@ describe script_under_test do
115
126
  ['E1D_raw_1__40591_2','8','0','8.0','0.12121212121212122','# 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None ',"\n"].join("\t")
116
127
  stdout.should eq(answer)
117
128
  end
129
+
130
+ it 'should include spectra shared between unresolvable proteins in calculations' do
131
+ test_file = "#{path_to_script} #{TEST_DATA_DIR}/new_format_some_all_shared_spectra.csv --trace error"
132
+ status, stdout, stderr = systemu test_file
133
+
134
+ stderr.should eq("")
135
+ answer = header+
136
+ ['E1D_raw_1__154435_1','0','41','0','0.0','# 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp ','aliaese'+"\n"].join("\t")+
137
+ ['aliaese','0','41','0','0.0','# 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp ','E1D_raw_1__154435_1'+"\n"].join("\t")+
138
+ ['E1D_raw_1__40591_2','8','0','8.0','0.16326530612244897','# 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None ',''+"\n"].join("\t")
139
+ stdout.should eq(answer)
140
+ end
118
141
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: divvy_proteomics
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben J Woodcroft
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-11-05 00:00:00.000000000 Z
11
+ date: 2013-11-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio-logger
@@ -117,6 +117,7 @@ files:
117
117
  - spec/data/merge_definition.csv
118
118
  - spec/data/multiply_mapped_spectra.csv
119
119
  - spec/data/new_format.csv
120
+ - spec/data/new_format_some_all_shared_spectra.csv
120
121
  - spec/data/single_protein.csv
121
122
  - spec/data/single_protein_with_aliases.csv
122
123
  - spec/data/three_proteins.csv