RubyGems - divvy_proteomics - Versions diffs - 0.1.0 → 0.2.0 - Mend

divvy_proteomics 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

checksums.yaml +4 -4
data/VERSION +1 -1
data/bin/divvy_spectra +19 -6
data/divvy_proteomics.gemspec +3 -2
data/spec/data/new_format_some_all_shared_spectra.csv +45 -0
data/spec/divvy_proteomics_spec.rb +23 -0
metadata +3 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: f930bb8ef783e793512f0d50bd579055ba475476
-  data.tar.gz: a445f2ef2ea22f7453b4e5b08fb66775797ea9bc
+  metadata.gz: 05ddf71aecd113201c370185104006af9705a525
+  data.tar.gz: b450d8ecd60717f7286f234aed29beaf455aa4eb
 SHA512:
-  metadata.gz: 9020c67860e40394138a02cfcf53a85665cc731292650f1c71b46990d3800de2fe9c2643597a277fe09ca17eede493ec7268496e529bb1a625efdd5f0347c7a9
-  data.tar.gz: 462c1c66db781653937b156023dc404804d8002f661872dab7b74c8a79d8576cf2110262049e0d6964cfa8162c0bb59d503a6a89841bf4d783eac887dfb14190
+  metadata.gz: a0b9e987dd54239a0da817becc9755c42d0f375e89fc16ff7c9f442830b34379de98c89f3bad0d74286567c5cbaafdc6a97656f766f2e63da02ccd113004019e
+  data.tar.gz: 2347260e7dba1a6bb08c91cc2183b445d534b36e1296e6d3338076ed3729073276df516dbfa83bce87e18af4e51721df744fc694eee3a60a096f27d9ecf0f666

data/VERSION CHANGED

	@@ -1 +1 @@
1	- 0.1.0
1	+ 0.2.0

data/bin/divvy_spectra CHANGED

@@ -15,7 +15,7 @@ require 'dta_select_output'
 options = {
   :logger => 'stderr',
   :log_level => 'info',
-  :contaminant_prefix => /^CNTM:/,
+  :contaminant_regexes => [/^CNTM:/],
 }
 o = OptionParser.new do |opts|
   opts.banner = "
@@ -29,6 +29,10 @@ o = OptionParser.new do |opts|
   opts.on("--whitelist FILE_OF_PROTEINS_TO_REPORT", "Only report proteins that are in this whitelist, after divvying with everything") do |file|
     options[:whitelist_file] = file
   end
+  opts.on("--contaminant-regexes REGEXES", "Comma-separated list of regular expressions to apply to protein names. If the protein name matches then all spectra assigned to that protein are considered contaminants. [default: #{options[:contaminant_prefixes]}]") do |str|
+    options[:contaminant_regexes] = str.split(/,/).collect{|s| /#{s}/}
+  end
   # logger options
   opts.separator "\nVerbosity:\n\n"
@@ -42,8 +46,7 @@ if ARGV.length > 1
 end
 # Setup logging
 Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
+#$stderr.puts options[:contaminant_prefixes].inspect
 # Read in merges, if required
 mergers = {}
@@ -130,12 +133,22 @@ mergers.each do |secondary_id, primary_id|
   end #The other two cases do not require any intervention,
 end
+id_is_contaminating = lambda do |protein_id|
+  selected = false
+  options[:contaminant_regexes].each do |regex|
+    if protein_id.match(regex)
+      selected = true
+    end
+  end
+  selected
+end
 # Total spectra shouldn't count contaminants, but shared spectra should still be divvied up with
 # Annoying thing here is when contaminating proteins share spectra
 total_contaminating_peptides = hits.collect do |ident, peptide|
   num_contaminating_parents = peptide.parent_proteins.select do |prot|
-    prot.identifier.match(options[:contaminant_prefix])
+    id_is_contaminating.call prot.identifier
   end.length
   if num_contaminating_parents > 0
@@ -145,7 +158,7 @@ total_contaminating_peptides = hits.collect do |ident, peptide|
   end
 end
 total_contaminating_spectra = total_contaminating_peptides.reduce :+
-total_contaminating_spectra ||= []
+total_contaminating_spectra ||= 0
 log.info "Found #{total_contaminating_spectra} contaminating spectral counts"
 total_spectra = hits.collect{|ident, pep| pep.redundancy}.reduce(:+) - total_contaminating_spectra
@@ -177,7 +190,7 @@ puts [
 ].join "\t"
 log.warn "No unique spectra found!" if total_spectra == 0
 proteins.each do |protein_id, protein|
-  next if protein_id.match(options[:contaminant_prefix]) #Don't print contaminants
+  next if id_is_contaminating.call(protein_id) #Don't print contaminants
   if options[:whitelist_file].nil? or whitelist.include?(protein_id) # If there's a whitelist, apply it now
     log.debug "Now printing protein #{protein_id}, which has #{protein.peptides.length} associated peptides"

data/divvy_proteomics.gemspec CHANGED

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = "divvy_proteomics"
-  s.version = "0.1.0"
+  s.version = "0.2.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Ben J Woodcroft"]
-  s.date = "2013-11-05"
+  s.date = "2013-11-06"
   s.description = "divvy up spectra from DTASelect files in a somewhat parsimonious way"
   s.email = "donttrustben@gmail.com"
   s.executables = ["divvy_spectra"]
@@ -32,6 +32,7 @@ Gem::Specification.new do |s|
     "spec/data/merge_definition.csv",
     "spec/data/multiply_mapped_spectra.csv",
     "spec/data/new_format.csv",
+    "spec/data/new_format_some_all_shared_spectra.csv",
     "spec/data/single_protein.csv",
     "spec/data/single_protein_with_aliases.csv",
     "spec/data/three_proteins.csv",

data/spec/data/new_format_some_all_shared_spectra.csv ADDED

@@ -0,0 +1,45 @@
+DTASelect v1.9
+/auto/gtl/ms/Abisko_Soils/Field_Sampling_Aug_2010/SurfDeep/Erio_Deep_Aug2010_27Jun_TR1/Frac1/analysis/tryp_edeep_082010_500bp_Wheat/sequest
+/auto/gtl/db/Abisko_Soils/edeep_082010_500bp_Wheat_cntm_psm
+SEQUEST v.27 in SQT format.
+ --DB -p 2 -r 1000
+true	Use criteria
+1.8	Minimum +1 XCorr
+2.5	Minimum +2 XCorr
+3.5	Minimum +3 XCorr
+0.08	Minimum DeltCN
+1	Minimum charge state
+3	Maximum charge state
+0.0	Minimum ion proportion
+1000	Maximum Sp rank
+-1.0	Minimum Sp score
+Include	Modified peptide inclusion
+Any	Tryptic status requirement
+true	Multiple, ambiguous IDs allowed
+Ignore	Peptide validation handling
+XCorr	Purge duplicate peptides by protein
+false	Include only loci with unique peptide
+false	Remove subset proteins
+Ignore	Locus validation handling
+0	Minimum modified peptides per locus
+1000	Minimum redundancy for low coverage loci
+2	Minimum peptides per locus
+Locus	Sequence Count	Spectrum Count	Sequence Coverage	Length	MolWt	pI	Validation Status	Descriptive Name
+Unique	FileName	XCorr	DeltCN	Obs_mono_m/z	Calc_mono_m/z	PPM	Delta_amu	TotalIntensity	SpRank	SpScore	IonProportion	Redundancy	Sequence
+E1D_raw_1__154435_1	3	41	79.3%	58	6132	7.3	U	# 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp
+aliaese	3	41	79.3%	58	6132	7.3	U	# 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp
+	20120806_Erio_Deep_Aug2010_27Jun_TR1_04.22361.22361.3	5.8526	0.4034	2772.4479	2772.4445	1.2211	0.0034	6048.2	1	1349.0	40.0	30	-.MLSIQTNIAALSAQNALTTTNNNLQK.S
+	20120806_Erio_Deep_Aug2010_27Jun_TR1_10.22784.22784.3	4.5965	0.4101	3275.6674	3275.6608	2.0124	0.0066	5944.2	1	884.7	28.4	7	-.MLSIQTNIAALSAQNALTTTNNNLQKSMER.L
+	20120806_Erio_Deep_Aug2010_27Jun_TR1_03.06010.06010.2	4.5641	0.462	1594.7954	1594.7956	-0.1435	-0.0002	4981.9	1	1347.9	76.7	4	R.INHAADDAAGLAISEK.M
+E1D_raw_1__40591_2	3	8	74.5%	51	5250	8.6	U	# 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None
+*	20120806_Erio_Deep_Aug2010_27Jun_TR1_01.18475.18475.2	4.3739	0.5173	2140.0658	2140.0653	0.2192	0.0005	7636.1	1	1642.9	65.0	1	K.TSDVAGDGTTTATILAQSIYR.E
+*	20120806_Erio_Deep_Aug2010_27Jun_TR1_02.21883.21883.2	3.4843	0.1996	2553.2977	2553.2928	1.9293	0.0049	6903.6	1	903.4	47.9	3	K.TSDVAGDGTTTATILAQSIYREGVK.A
+*	20120806_Erio_Deep_Aug2010_27Jun_TR1_08.06194.06194.2	2.7604	0.1784	1326.7055	1326.7083	-2.1145	-0.0028	6041.5	1	860.0	66.7	4	K.AVAAGANPMELKR.G
+	Proteins	Peptide IDs	Copies
+Unfiltered	318515	400116	506301
+Redundant	1575	3555	18759
+Nonredundant	1211	2557	12384
+Classification	Nonredundant Proteins	Redundant Proteins
+Unclassified	0	0

data/spec/divvy_proteomics_spec.rb CHANGED

@@ -88,6 +88,17 @@ describe script_under_test do
     stdout.should eq(answer), test_file
   end
+  it 'should handle arbitrary contaminant prefixes' do
+    test_file = "#{path_to_script} #{TEST_DATA_DIR}/single_protein_with_aliases.csv --trace error --contaminant-regexes alias"
+    status, stdout, stderr = systemu test_file
+    stderr.should eq("")
+    answer = header+
+    ['Mstor_v4.3.2:1344','0','188','0','NaN','Methanoflorens_stordalmirensis_v4.3.2_01361 Methyl-coenzyme M reductase I subunit gamma ','alias1'+"\n"].join("\t")
+    #['alias1','0','188','0','0.0','alias1 Methyl-coenzyme M reductase I subunit gamma ','Mstor_v4.3.2:1344'+"\n"].join("\t")
+    stdout.should eq(answer), test_file
+  end
   it 'should do a whitelist correctly' do
     Tempfile.open('test_divvy_spectra') do |tempfile|
       %w(eDeep20120820:eD1_8237_2 eDeep20120820:eD1_1639_1).each {|i| tempfile.puts i}
@@ -115,4 +126,16 @@ describe script_under_test do
     ['E1D_raw_1__40591_2','8','0','8.0','0.12121212121212122','# 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None ',"\n"].join("\t")
     stdout.should eq(answer)
   end
+  it 'should include spectra shared between unresolvable proteins in calculations' do
+    test_file = "#{path_to_script} #{TEST_DATA_DIR}/new_format_some_all_shared_spectra.csv --trace error"
+    status, stdout, stderr = systemu test_file
+    stderr.should eq("")
+    answer = header+
+    ['E1D_raw_1__154435_1','0','41','0','0.0','# 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp ','aliaese'+"\n"].join("\t")+
+    ['aliaese','0','41','0','0.0','# 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp ','E1D_raw_1__154435_1'+"\n"].join("\t")+
+    ['E1D_raw_1__40591_2','8','0','8.0','0.16326530612244897','# 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None ',''+"\n"].join("\t")
+    stdout.should eq(answer)
+  end
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: divvy_proteomics
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Ben J Woodcroft
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-11-05 00:00:00.000000000 Z
+date: 2013-11-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bio-logger
@@ -117,6 +117,7 @@ files:
 - spec/data/merge_definition.csv
 - spec/data/multiply_mapped_spectra.csv
 - spec/data/new_format.csv
+- spec/data/new_format_some_all_shared_spectra.csv
 - spec/data/single_protein.csv
 - spec/data/single_protein_with_aliases.csv
 - spec/data/three_proteins.csv