divvy_proteomics 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f930bb8ef783e793512f0d50bd579055ba475476
4
- data.tar.gz: a445f2ef2ea22f7453b4e5b08fb66775797ea9bc
3
+ metadata.gz: 05ddf71aecd113201c370185104006af9705a525
4
+ data.tar.gz: b450d8ecd60717f7286f234aed29beaf455aa4eb
5
5
  SHA512:
6
- metadata.gz: 9020c67860e40394138a02cfcf53a85665cc731292650f1c71b46990d3800de2fe9c2643597a277fe09ca17eede493ec7268496e529bb1a625efdd5f0347c7a9
7
- data.tar.gz: 462c1c66db781653937b156023dc404804d8002f661872dab7b74c8a79d8576cf2110262049e0d6964cfa8162c0bb59d503a6a89841bf4d783eac887dfb14190
6
+ metadata.gz: a0b9e987dd54239a0da817becc9755c42d0f375e89fc16ff7c9f442830b34379de98c89f3bad0d74286567c5cbaafdc6a97656f766f2e63da02ccd113004019e
7
+ data.tar.gz: 2347260e7dba1a6bb08c91cc2183b445d534b36e1296e6d3338076ed3729073276df516dbfa83bce87e18af4e51721df744fc694eee3a60a096f27d9ecf0f666
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.2.0
@@ -15,7 +15,7 @@ require 'dta_select_output'
15
15
  options = {
16
16
  :logger => 'stderr',
17
17
  :log_level => 'info',
18
- :contaminant_prefix => /^CNTM:/,
18
+ :contaminant_regexes => [/^CNTM:/],
19
19
  }
20
20
  o = OptionParser.new do |opts|
21
21
  opts.banner = "
@@ -29,6 +29,10 @@ o = OptionParser.new do |opts|
29
29
  opts.on("--whitelist FILE_OF_PROTEINS_TO_REPORT", "Only report proteins that are in this whitelist, after divvying with everything") do |file|
30
30
  options[:whitelist_file] = file
31
31
  end
32
+ opts.on("--contaminant-regexes REGEXES", "Comma-separated list of regular expressions to apply to protein names. If the protein name matches then all spectra assigned to that protein are considered contaminants. [default: #{options[:contaminant_prefixes]}]") do |str|
33
+ options[:contaminant_regexes] = str.split(/,/).collect{|s| /#{s}/}
34
+ end
35
+
32
36
 
33
37
  # logger options
34
38
  opts.separator "\nVerbosity:\n\n"
@@ -42,8 +46,7 @@ if ARGV.length > 1
42
46
  end
43
47
  # Setup logging
44
48
  Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
45
-
46
-
49
+ #$stderr.puts options[:contaminant_regexes].inspect
47
50
 
48
51
  # Read in merges, if required
49
52
  mergers = {}
@@ -130,12 +133,22 @@ mergers.each do |secondary_id, primary_id|
130
133
  end #The other two cases do not require any intervention,
131
134
  end
132
135
 
136
+ id_is_contaminating = lambda do |protein_id|
137
+ selected = false
138
+ options[:contaminant_regexes].each do |regex|
139
+ if protein_id.match(regex)
140
+ selected = true
141
+ end
142
+ end
143
+ selected
144
+ end
145
+
133
146
 
134
147
  # Total spectra shouldn't count contaminants, but shared spectra should still be divvied up with
135
148
  # Annoying thing here is when contaminating proteins share spectra
136
149
  total_contaminating_peptides = hits.collect do |ident, peptide|
137
150
  num_contaminating_parents = peptide.parent_proteins.select do |prot|
138
- prot.identifier.match(options[:contaminant_prefix])
151
+ id_is_contaminating.call prot.identifier
139
152
  end.length
140
153
 
141
154
  if num_contaminating_parents > 0
@@ -145,7 +158,7 @@ total_contaminating_peptides = hits.collect do |ident, peptide|
145
158
  end
146
159
  end
147
160
  total_contaminating_spectra = total_contaminating_peptides.reduce :+
148
- total_contaminating_spectra ||= []
161
+ total_contaminating_spectra ||= 0
149
162
  log.info "Found #{total_contaminating_spectra} contaminating spectral counts"
150
163
 
151
164
  total_spectra = hits.collect{|ident, pep| pep.redundancy}.reduce(:+) - total_contaminating_spectra
@@ -177,7 +190,7 @@ puts [
177
190
  ].join "\t"
178
191
  log.warn "No unique spectra found!" if total_spectra == 0
179
192
  proteins.each do |protein_id, protein|
180
- next if protein_id.match(options[:contaminant_prefix]) #Don't print contaminants
193
+ next if id_is_contaminating.call(protein_id) #Don't print contaminants
181
194
 
182
195
  if options[:whitelist_file].nil? or whitelist.include?(protein_id) # If there's a whitelist, apply it now
183
196
  log.debug "Now printing protein #{protein_id}, which has #{protein.peptides.length} associated peptides"
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "divvy_proteomics"
8
- s.version = "0.1.0"
8
+ s.version = "0.2.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Ben J Woodcroft"]
12
- s.date = "2013-11-05"
12
+ s.date = "2013-11-06"
13
13
  s.description = "divvy up spectra from DTASelect files in a somewhat parsimonious way"
14
14
  s.email = "donttrustben@gmail.com"
15
15
  s.executables = ["divvy_spectra"]
@@ -32,6 +32,7 @@ Gem::Specification.new do |s|
32
32
  "spec/data/merge_definition.csv",
33
33
  "spec/data/multiply_mapped_spectra.csv",
34
34
  "spec/data/new_format.csv",
35
+ "spec/data/new_format_some_all_shared_spectra.csv",
35
36
  "spec/data/single_protein.csv",
36
37
  "spec/data/single_protein_with_aliases.csv",
37
38
  "spec/data/three_proteins.csv",
@@ -0,0 +1,45 @@
1
+ DTASelect v1.9
2
+ /auto/gtl/ms/Abisko_Soils/Field_Sampling_Aug_2010/SurfDeep/Erio_Deep_Aug2010_27Jun_TR1/Frac1/analysis/tryp_edeep_082010_500bp_Wheat/sequest
3
+ /auto/gtl/db/Abisko_Soils/edeep_082010_500bp_Wheat_cntm_psm
4
+ SEQUEST v.27 in SQT format.
5
+ --DB -p 2 -r 1000
6
+ true Use criteria
7
+ 1.8 Minimum +1 XCorr
8
+ 2.5 Minimum +2 XCorr
9
+ 3.5 Minimum +3 XCorr
10
+ 0.08 Minimum DeltCN
11
+ 1 Minimum charge state
12
+ 3 Maximum charge state
13
+ 0.0 Minimum ion proportion
14
+ 1000 Maximum Sp rank
15
+ -1.0 Minimum Sp score
16
+ Include Modified peptide inclusion
17
+ Any Tryptic status requirement
18
+ true Multiple, ambiguous IDs allowed
19
+ Ignore Peptide validation handling
20
+ XCorr Purge duplicate peptides by protein
21
+ false Include only loci with unique peptide
22
+ false Remove subset proteins
23
+ Ignore Locus validation handling
24
+ 0 Minimum modified peptides per locus
25
+ 1000 Minimum redundancy for low coverage loci
26
+ 2 Minimum peptides per locus
27
+
28
+ Locus Sequence Count Spectrum Count Sequence Coverage Length MolWt pI Validation Status Descriptive Name
29
+ Unique FileName XCorr DeltCN Obs_mono_m/z Calc_mono_m/z PPM Delta_amu TotalIntensity SpRank SpScore IonProportion Redundancy Sequence
30
+ E1D_raw_1__154435_1 3 41 79.3% 58 6132 7.3 U # 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp
31
+ aliaese 3 41 79.3% 58 6132 7.3 U # 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp
32
+ 20120806_Erio_Deep_Aug2010_27Jun_TR1_04.22361.22361.3 5.8526 0.4034 2772.4479 2772.4445 1.2211 0.0034 6048.2 1 1349.0 40.0 30 -.MLSIQTNIAALSAQNALTTTNNNLQK.S
33
+ 20120806_Erio_Deep_Aug2010_27Jun_TR1_10.22784.22784.3 4.5965 0.4101 3275.6674 3275.6608 2.0124 0.0066 5944.2 1 884.7 28.4 7 -.MLSIQTNIAALSAQNALTTTNNNLQKSMER.L
34
+ 20120806_Erio_Deep_Aug2010_27Jun_TR1_03.06010.06010.2 4.5641 0.462 1594.7954 1594.7956 -0.1435 -0.0002 4981.9 1 1347.9 76.7 4 R.INHAADDAAGLAISEK.M
35
+ E1D_raw_1__40591_2 3 8 74.5% 51 5250 8.6 U # 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None
36
+ * 20120806_Erio_Deep_Aug2010_27Jun_TR1_01.18475.18475.2 4.3739 0.5173 2140.0658 2140.0653 0.2192 0.0005 7636.1 1 1642.9 65.0 1 K.TSDVAGDGTTTATILAQSIYR.E
37
+ * 20120806_Erio_Deep_Aug2010_27Jun_TR1_02.21883.21883.2 3.4843 0.1996 2553.2977 2553.2928 1.9293 0.0049 6903.6 1 903.4 47.9 3 K.TSDVAGDGTTTATILAQSIYREGVK.A
38
+ * 20120806_Erio_Deep_Aug2010_27Jun_TR1_08.06194.06194.2 2.7604 0.1784 1326.7055 1326.7083 -2.1145 -0.0028 6041.5 1 860.0 66.7 4 K.AVAAGANPMELKR.G
39
+ Proteins Peptide IDs Copies
40
+ Unfiltered 318515 400116 506301
41
+ Redundant 1575 3555 18759
42
+ Nonredundant 1211 2557 12384
43
+
44
+ Classification Nonredundant Proteins Redundant Proteins
45
+ Unclassified 0 0
@@ -88,6 +88,17 @@ describe script_under_test do
88
88
  stdout.should eq(answer), test_file
89
89
  end
90
90
 
91
+ it 'should handle arbitrary contaminant prefixes' do
92
+ test_file = "#{path_to_script} #{TEST_DATA_DIR}/single_protein_with_aliases.csv --trace error --contaminant-regexes alias"
93
+ status, stdout, stderr = systemu test_file
94
+
95
+ stderr.should eq("")
96
+ answer = header+
97
+ ['Mstor_v4.3.2:1344','0','188','0','NaN','Methanoflorens_stordalmirensis_v4.3.2_01361 Methyl-coenzyme M reductase I subunit gamma ','alias1'+"\n"].join("\t")
98
+ #['alias1','0','188','0','0.0','alias1 Methyl-coenzyme M reductase I subunit gamma ','Mstor_v4.3.2:1344'+"\n"].join("\t")
99
+ stdout.should eq(answer), test_file
100
+ end
101
+
91
102
  it 'should do a whitelist correctly' do
92
103
  Tempfile.open('test_divvy_spectra') do |tempfile|
93
104
  %w(eDeep20120820:eD1_8237_2 eDeep20120820:eD1_1639_1).each {|i| tempfile.puts i}
@@ -115,4 +126,16 @@ describe script_under_test do
115
126
  ['E1D_raw_1__40591_2','8','0','8.0','0.12121212121212122','# 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None ',"\n"].join("\t")
116
127
  stdout.should eq(answer)
117
128
  end
129
+
130
+ it 'should include spectra shared between unresolvable proteins in calculations' do
131
+ test_file = "#{path_to_script} #{TEST_DATA_DIR}/new_format_some_all_shared_spectra.csv --trace error"
132
+ status, stdout, stderr = systemu test_file
133
+
134
+ stderr.should eq("")
135
+ answer = header+
136
+ ['E1D_raw_1__154435_1','0','41','0','0.0','# 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp ','aliaese'+"\n"].join("\t")+
137
+ ['aliaese','0','41','0','0.0','# 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp ','E1D_raw_1__154435_1'+"\n"].join("\t")+
138
+ ['E1D_raw_1__40591_2','8','0','8.0','0.16326530612244897','# 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None ',''+"\n"].join("\t")
139
+ stdout.should eq(answer)
140
+ end
118
141
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: divvy_proteomics
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben J Woodcroft
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-11-05 00:00:00.000000000 Z
11
+ date: 2013-11-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio-logger
@@ -117,6 +117,7 @@ files:
117
117
  - spec/data/merge_definition.csv
118
118
  - spec/data/multiply_mapped_spectra.csv
119
119
  - spec/data/new_format.csv
120
+ - spec/data/new_format_some_all_shared_spectra.csv
120
121
  - spec/data/single_protein.csv
121
122
  - spec/data/single_protein_with_aliases.csv
122
123
  - spec/data/three_proteins.csv