RubyGems - divvy_proteomics - Versions diffs - 0.0.1 → 0.1.0 - Mend

divvy_proteomics 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

checksums.yaml +7 -0
data/README.md +12 -5
data/VERSION +1 -1
data/bin/divvy_spectra +27 -189
data/divvy_proteomics.gemspec +6 -4
data/lib/dta_select_output.rb +215 -0
data/spec/data/new_format.csv +49 -0
data/spec/divvy_proteomics_spec.rb +14 -2
metadata +21 -36

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: f930bb8ef783e793512f0d50bd579055ba475476
+  data.tar.gz: a445f2ef2ea22f7453b4e5b08fb66775797ea9bc
+SHA512:
+  metadata.gz: 9020c67860e40394138a02cfcf53a85665cc731292650f1c71b46990d3800de2fe9c2643597a277fe09ca17eede493ec7268496e529bb1a625efdd5f0347c7a9
+  data.tar.gz: 462c1c66db781653937b156023dc404804d8002f661872dab7b74c8a79d8576cf2110262049e0d6964cfa8162c0bb59d503a6a89841bf4d783eac887dfb14190

data/README.md CHANGED Viewed

@@ -1,15 +1,22 @@
 # divvy_proteomics
-Takes a DTASelect CSV file, and parses the reulst
+Takes a DTASelect CSV file, and parses the result so non-unique peptides get accounted for.
 ## Install
-Get ruby somehow, if you don't already have it.
+Get ruby somehow, if you don't already have it. Then, install this gem:
 ```
-gem install divvy_spectra
+$ gem install divvy_proteomics
 ```
 ## Usage
 ```
+$ divvy_spectra <DTASelectFile>
+```
+Output is a table, with a row for each protein with a few columns, including number of unique spectra and the
+estimated number of spectral counts after sorting out the non-uniqueness.
+Full usage information:
+```
 $ divvy_spectra -h
     Usage: divvy_spectra [options] <DTASelect_file>
@@ -28,7 +35,7 @@ Verbosity:
         --trace options              Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG
 ```
-== Contributing to divvy\_proteomics
+## Contributing to divvy\_proteomics
 * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
 * Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it.
@@ -38,7 +45,7 @@ Verbosity:
 * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
 * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
-== Copyright
+## Copyright
 Copyright (c) 2013 Ben J Woodcroft. See LICENSE.txt for
 further details.

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.0.1
1	+ 0.1.0

data/bin/divvy_spectra CHANGED Viewed

@@ -7,6 +7,10 @@ require 'set'
 SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
+rootpath = File.dirname(File.dirname(__FILE__))
+$: << File.join(rootpath,'lib')
+require 'dta_select_output'
 # Parse command line options into the options hash
 options = {
   :logger => 'stderr',
@@ -39,88 +43,7 @@ end
 # Setup logging
 Bio::Log::CLI.logger(options[:logger]); Bio::Log::CLI.trace(options[:log_level]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
-class SelectedProtein
-  attr_accessor :identifier
-  attr_accessor :sequence_count, :spectrum_count, :sequence_coverage, :length, :molwt, :pi, :validation_status, :descriptive_name
-  attr_accessor :peptides
-  def initialize
-    @peptides = []
-  end
-  def unique_spectra
-    return 0 if @peptides.nil? or @peptides.empty?
-    num = @peptides.select{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
-    num ||= 0
-    return num
-  end
-  def non_unique_spectra
-    return 0 if @peptides.nil? or @peptides.empty?
-    num = @peptides.reject{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
-    num ||= 0
-    return num
-  end
-  # Are there any peptides that are assigned exclusively to this protein?
-  def uniquely_identified_by_any_peptides?
-    unique_spectra > 0
-  end
-  def estimated_spectral_count
-    # How many unique spectra are there for each protein that shares a peptide with the current peptide
-    return 0 if @peptides.nil? or @peptides.empty?
-    peptide_shares = []
-    # If all peptides are non-unique and shared with some number of other proteins, then output a negative number num shared spectra divided by the number of proteins
-    if !uniquely_identified_by_any_peptides?
-      shared_parents = peptides[0].parent_proteins
-      if peptides.find{|pep| pep.parent_proteins != shared_parents}
-        log.warn "Found a protein (#{identifier}) that shares all its peptides with a non-constant set of proteins, hoping this is a rare event, estimated spectral count likely wrong"
-      end
-      num_shared_proteins = shared_parents.length
-      num_peptide_spectra = peptides.collect{|pep| pep.redundancy}.reduce(:+)
-      log.debug "Found #{num_shared_proteins} shared peptides and #{num_peptide_spectra} peptide spectra"
-      return num_peptide_spectra.to_f/num_shared_proteins
-    else
-      peptides.each do |peptide|
-        log.debug "Tallying peptide #{peptide.identifier}, which is has #{peptide.redundancy} spectra shared among #{peptide.parent_proteins.length} proteins"
-        log.debug "These proteins have #{peptide.parent_proteins.collect{|pro| pro.unique_spectra}.inspect} unique spectra each"
-        total_linked_unique_spectra = peptide.parent_proteins.collect{|pro| pro.unique_spectra}.reduce(:+)
-        peptide_shares.push unique_spectra.to_f/total_linked_unique_spectra*peptide.redundancy
-      end
-      return peptide_shares.reduce(:+)
-    end
-  end
-  def log
-    Bio::Log::LoggerPlus[LOG_NAME]
-  end
-end
-class Peptide
-  attr_accessor :identifier
-  attr_accessor :reported_unique
-  attr_accessor :xcorr, :deltcn, :obs_mono_mz, :cal_mono_mz, :total_intensity, :sp_rank, :sp_score, :ion_proportion, :redundancy, :sequence
-  attr_accessor :unique
-  attr_accessor :parent_proteins
-  def initialize
-    @parent_proteins = []
-  end
-  def inspect
-    str = "Peptide: #{@parent_proteins.length} @parent_proteins: [#{@parent_proteins.collect{|pro| pro.identifier}.join(', ')}]"
-    [:identifier, :xcorr, :deltcn, :obs_mono_mz, :cal_mono_mz, :total_intensity, :sp_rank, :sp_score, :ion_proportion, :redundancy, :sequence].each do |var|
-      str += ", #{var}: #{send(var)}"
-    end
-    return str
-  end
-end
 # Read in merges, if required
 mergers = {}
@@ -147,107 +70,12 @@ if options[:whitelist_file]
   log.info "Read in #{whitelist.length} IDs into the whitelist, only those will be reported. e.g. #{whitelist[0]}"
 end
+# Parse the csv file
+parsed = Bio::DTASelect::OutputFile.parse(ARGF)
 # Hashes of identifiers to objects
-proteins = {}
-hits = {}
-# Read in the tab separated file
-reading_header = true
-current_proteins = []
-last_line_was_protein_name = false
-# Parse each line of the DTAselect file
-ARGF.each_line do |line|
-  splits = line.chomp.split("\t")
-  log.debug "Parsing line `#{line.chomp}'"
-  if reading_header
-    log.debug "reading header"
-    if splits[0] == 'Unique'
-      reading_header = false
-    end
-    next
-  end
-  # OK, now we are reading the actual table, not the header
-  if splits[0] != '' and splits[11].nil?
-    ident = splits[0]
-    if !last_line_was_protein_name
-      # Sometimes several proteins are given all in the one header line
-      # start a new protein
-      log.debug "New protein now being parsed"
-      current_proteins = []
-    end
-    current_protein = SelectedProtein.new
-    last_line_was_protein_name = true
-    current_proteins.push current_protein
-    current_protein.identifier = ident
-    i = 1
-    current_protein.sequence_count = splits[i].to_i; i+=1
-    current_protein.spectrum_count = splits[i].to_i; i+=1
-    current_protein.sequence_coverage = splits[i].to_f; i+=1
-    current_protein.length = splits[i].to_i; i+=1
-    current_protein.molwt = splits[i].to_f; i+=1
-    current_protein.pi = splits[i].to_f; i+=1
-    current_protein.validation_status = splits[i].to_f; i+=1
-    current_protein.descriptive_name = splits[i]
-    if proteins[ident]
-      raise "Unexpectedly found the same protein identifier twice: #{ident}, from line #{line.chomp}"
-    end
-    proteins[ident] = current_protein
-  elsif splits[1] == 'Proteins'
-    # Done processing, except for the bits down the bottom which aren't parsed (yet)
-    break
-  else
-    log.debug "New spectra now being parsed"
-    last_line_was_protein_name = false
-    # Record a spectra
-    ident = splits[1]
-    raise "Unexpected hits name `#{ident}', from line `#{line.chomp}'" unless ident.length > 10
-    pep = hits[ident]
-    if pep.nil?
-      pep = Peptide.new
-      pep.identifier = ident
-      pep.reported_unique = splits[0]
-      i = 2
-      pep.xcorr = splits[i].to_f; i+= 1
-      pep.deltcn = splits[i].to_f; i+= 1
-      pep.obs_mono_mz = splits[i].to_f; i+= 1
-      pep.cal_mono_mz = splits[i].to_f; i+= 1
-      pep.total_intensity = splits[i].to_f; i+= 1
-      pep.sp_rank = splits[i].to_f; i+= 1
-      pep.sp_score = splits[i].to_f; i+= 1
-      pep.ion_proportion = splits[i].to_f; i+= 1
-      pep.redundancy = splits[i].to_i; i+= 1
-      pep.sequence = splits[i]
-      hits[ident] = pep
-    end
-    current_proteins.each do |current_protein|
-      pep.parent_proteins.push current_protein
-      current_protein.peptides.push pep
-    end
-    log.debug "Parsed this peptide #{pep.inspect}"
-  end
-end
-log.debug "Proteins parsed: #{proteins.inspect}"
+proteins = parsed.protein_name_to_object
+hits = parsed.peptide_name_to_object
 # Merge proteins that are known duplicates if need be
@@ -304,10 +132,23 @@ end
 # Total spectra shouldn't count contaminants, but shared spectra should still be divvied up with
-total_contaminating_spectra = proteins.select{|ident, protein| ident.match(options[:contaminant_prefix])}.collect{|i, pro| pro.estimated_spectral_count}.reduce(:+)
-total_contaminating_spectra ||= 0
+# Annoying thing here is when contaminating proteins share spectra
+total_contaminating_peptides = hits.collect do |ident, peptide|
+  num_contaminating_parents = peptide.parent_proteins.select do |prot|
+    prot.identifier.match(options[:contaminant_prefix])
+  end.length
+  if num_contaminating_parents > 0
+    peptide.redundancy
+  else
+    0
+  end
+end
+total_contaminating_spectra = total_contaminating_peptides.reduce :+
+total_contaminating_spectra ||= []
+log.info "Found #{total_contaminating_spectra} contaminating spectral counts"
-total_spectra = hits.collect{|i,pep| pep.redundancy}.reduce(:+) - total_contaminating_spectra
+total_spectra = hits.collect{|ident, pep| pep.redundancy}.reduce(:+) - total_contaminating_spectra
 log.info "Parsed in #{proteins.length} proteins and #{hits.length} peptides, and #{total_spectra.to_i} non-contaminating spectra"
 log.debug "Proteins parsed: #{proteins.inspect}"
@@ -320,9 +161,9 @@ log.info "Found #{number_shared_peptides} (#{number_shared_peptides.to_f/total_p
 # Find non-starred peptides that occur only once in the file - maybe not possible given a correctly formatted file?
 non_starred_but_uniquely_identified_peptides = hits.values.select do |peptide|
-  peptide.reported_unique == nil and peptide.parent_proteins.length == 1
+  peptide.dtaselect_attributes['Unique'] == nil and peptide.parent_proteins.length == 1
 end
-log.info "Found #{non_starred_but_uniquely_identified_peptides.length} different peptides that weren't starred or 2'd but the identifier is only found one."
+log.debug "Found #{non_starred_but_uniquely_identified_peptides.length} different peptides that weren't starred or 2'd but the identifier is only found one time."
 # OK, finished parsing the file. Now output the score for each protein
 puts [
@@ -334,15 +175,12 @@ puts [
   'Description',
   'Proteins sharing spectra',
 ].join "\t"
+log.warn "No unique spectra found!" if total_spectra == 0
 proteins.each do |protein_id, protein|
   next if protein_id.match(options[:contaminant_prefix]) #Don't print contaminants
   if options[:whitelist_file].nil? or whitelist.include?(protein_id) # If there's a whitelist, apply it now
     log.debug "Now printing protein #{protein_id}, which has #{protein.peptides.length} associated peptides"
-    if !protein.uniquely_identified_by_any_peptides?
-      shareds = protein.peptides.collect{|pep| pep.parent_proteins.collect{|pro| pro.identifier}}.flatten.uniq.reject{|pro_id| pro_id==protein_id}
-      log.warn "This protein #{protein_id} shares all of its spectra with other proteins (#{shareds.join(', ')}), sharing the peptides equally (this may not be appropriate)"
-    end
     puts [
       protein_id,
       protein.unique_spectra,

data/divvy_proteomics.gemspec CHANGED Viewed

@@ -5,11 +5,11 @@
 Gem::Specification.new do |s|
   s.name = "divvy_proteomics"
-  s.version = "0.0.1"
+  s.version = "0.1.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Ben J Woodcroft"]
-  s.date = "2013-04-13"
+  s.date = "2013-11-05"
   s.description = "divvy up spectra from DTASelect files in a somewhat parsimonious way"
   s.email = "donttrustben@gmail.com"
   s.executables = ["divvy_spectra"]
@@ -28,8 +28,10 @@ Gem::Specification.new do |s|
     "bin/divvy_spectra",
     "divvy_proteomics.gemspec",
     "lib/divvy_proteomics.rb",
+    "lib/dta_select_output.rb",
     "spec/data/merge_definition.csv",
     "spec/data/multiply_mapped_spectra.csv",
+    "spec/data/new_format.csv",
     "spec/data/single_protein.csv",
     "spec/data/single_protein_with_aliases.csv",
     "spec/data/three_proteins.csv",
@@ -41,11 +43,11 @@ Gem::Specification.new do |s|
   s.homepage = "http://github.com/wwood/divvy_proteomics"
   s.licenses = ["MIT"]
   s.require_paths = ["lib"]
-  s.rubygems_version = "1.8.24"
+  s.rubygems_version = "2.0.3"
   s.summary = "divvy up spectra from DTASelect files in a parsimonious way"
   if s.respond_to? :specification_version then
-    s.specification_version = 3
+    s.specification_version = 4
     if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
       s.add_runtime_dependency(%q<bio-logger>, [">= 0"])

data/lib/dta_select_output.rb ADDED Viewed

@@ -0,0 +1,215 @@
+module Bio::DTASelect
+  # Mixin giving including classes a +log+ method.
+  module Logging
+    # Looks up the logger registered under the name 'divvy_spectra'.
+    def log
+      Bio::Log::LoggerPlus['divvy_spectra']
+    end
+  end
+  class OutputFile
+    def self.log
+      SelectedProtein.new.log
+    end
+    class SelectedProtein
+      include Bio::DTASelect::Logging
+      attr_accessor :identifier
+      attr_accessor :sequence_count, :spectrum_count, :sequence_coverage, :length, :molwt, :pi, :validation_status, :descriptive_name
+      attr_accessor :peptides
+      def initialize
+        @peptides = []
+      end
+      def unique_spectra
+        return 0 if @peptides.nil? or @peptides.empty?
+        num = @peptides.select{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
+        num ||= 0
+        return num
+      end
+      def non_unique_spectra
+        return 0 if @peptides.nil? or @peptides.empty?
+        num = @peptides.reject{|pep| pep.parent_proteins.length == 1}.collect{|pep| pep.redundancy}.reduce(:+)
+        num ||= 0
+        return num
+      end
+      # Are there any peptides that are assigned exclusively to this protein?
+      def uniquely_identified_by_any_peptides?
+        unique_spectra > 0
+      end
+      def estimated_spectral_count
+        # How many unique spectra are there for each protein that shares a peptide with the current peptide
+        return 0 if @peptides.nil? or @peptides.empty?
+        peptide_shares = []
+        # If all peptides are non-unique and shared with some number of other proteins, then output a negative number num shared spectra divided by the number of proteins
+        if !uniquely_identified_by_any_peptides?
+          # Don't attempt to divvy these up, because there are too many assumptions involved
+          return 0
+        else
+          peptides.each do |peptide|
+            log.debug "Tallying peptide #{peptide.identifier}, which is has #{peptide.redundancy} spectra shared among #{peptide.parent_proteins.length} proteins"
+            log.debug "These proteins have #{peptide.parent_proteins.collect{|pro| pro.unique_spectra}.inspect} unique spectra each"
+            total_linked_unique_spectra = peptide.parent_proteins.collect{|pro| pro.unique_spectra}.reduce(:+)
+            peptide_shares.push unique_spectra.to_f/total_linked_unique_spectra*peptide.redundancy
+          end
+          return peptide_shares.reduce(:+)
+        end
+      end
+      def log
+        Bio::Log::LoggerPlus[LOG_NAME]
+      end
+    end
+    class Peptide
+      include Bio::DTASelect::Logging
+      attr_accessor :identifier
+      # Hash of column names to values. These are different for different DTAselect output files, it seems.
+      attr_accessor :dtaselect_attributes
+      # Array of proteins that have this peptide associated
+      attr_accessor :parent_proteins
+      def initialize
+        @parent_proteins = []
+      end
+      def inspect
+        "Peptide: #{@parent_proteins.length} @parent_proteins: [#{@parent_proteins.collect{|pro| pro.identifier}.join(', ')} @identifier: #{identifier}, @attributes: #{dtaselect_attributes.inspect}]"
+      end
+      def redundancy
+        @dtaselect_attributes['Redundancy'].to_i
+      end
+      def reported_unique?
+        dtaselect_attributes.length == 1
+      end
+    end
+    class Result
+      include Bio::DTASelect::Logging
+      # hash of protein identifier to Protein object
+      attr_accessor :protein_name_to_object
+      # hash of peptide identifier to Peptide object
+      attr_accessor :peptide_name_to_object
+    end
+    def self.parse(io)
+      result = Result.new
+      # Hashes of identifiers to objects
+      result.protein_name_to_object = {}
+      result.peptide_name_to_object = {}
+      # Read in the tab separated file
+      reading_header = true
+      current_proteins = []
+      last_line_was_protein_name = false
+      peptide_attribute_names = nil
+      # Parse each line of the DTAselect file
+      io.each_line do |line|
+        splits = line.chomp.split("\t")
+        log.debug "Parsing line `#{line.chomp}'"
+        if reading_header
+          log.debug "reading header"
+          if splits[0] == 'Unique'
+            reading_header = false
+            # Current line describes the peptide attributes
+            peptide_attribute_names = splits
+            # This field has special importance, so be picky
+            raise "Badly parsed file at this line: #{line.inspect}, expected 2nd field to be 'FileName', found #{splits[1]}" unless splits[1] == 'FileName'
+          end
+          next
+        end
+        # OK, now we are reading the actual table, not the header
+        if splits[0] != '' and splits[11].nil?
+          ident = splits[0]
+          if !last_line_was_protein_name
+            # Sometimes several proteins are given all in the one header line
+            # start a new protein
+            log.debug "New protein now being parsed"
+            current_proteins = []
+          end
+          current_protein = SelectedProtein.new
+          last_line_was_protein_name = true
+          current_proteins.push current_protein
+          current_protein.identifier = ident
+          i = 1
+          current_protein.sequence_count = splits[i].to_i; i+=1
+          current_protein.spectrum_count = splits[i].to_i; i+=1
+          current_protein.sequence_coverage = splits[i].to_f; i+=1
+          current_protein.length = splits[i].to_i; i+=1
+          current_protein.molwt = splits[i].to_f; i+=1
+          current_protein.pi = splits[i].to_f; i+=1
+          current_protein.validation_status = splits[i].to_f; i+=1
+          current_protein.descriptive_name = splits[i]
+          if result.protein_name_to_object[ident]
+            raise "Unexpectedly found the same protein identifier twice: #{ident}, from line #{line.chomp}"
+          end
+          result.protein_name_to_object[ident] = current_protein
+        elsif splits[1] == 'Proteins'
+          # Done processing, except for the bits down the bottom which aren't parsed (yet)
+          break
+        else
+          log.debug "New spectra now being parsed"
+          last_line_was_protein_name = false
+          # Record a spectra
+          ident = splits[1]
+          raise "Unexpected hits name `#{ident}', from line `#{line.chomp}'" unless ident.length > 10
+          pep = result.peptide_name_to_object[ident]
+          if pep.nil?
+            pep = Peptide.new
+            pep.identifier = ident
+            peptide_attribute_names.each_with_index do |attribute_name,i|
+              pep.dtaselect_attributes ||= {}
+              pep.dtaselect_attributes[attribute_name] = splits[i]
+            end
+            result.peptide_name_to_object[ident] = pep
+          end
+          current_proteins.each do |current_protein|
+            pep.parent_proteins.push current_protein
+            current_protein.peptides.push pep
+          end
+          log.debug "Parsed this peptide #{pep.inspect}"
+        end
+      end
+      log.debug "Proteins parsed: #{result.protein_name_to_object.inspect}"
+      return result
+    end
+  end
+end

data/spec/data/new_format.csv ADDED Viewed

@@ -0,0 +1,49 @@
+DTASelect v1.9
+/auto/gtl/ms/Abisko_Soils/Field_Sampling_Aug_2010/SurfDeep/Erio_Deep_Aug2010_27Jun_TR1/Frac1/analysis/tryp_edeep_082010_500bp_Wheat/sequest
+/auto/gtl/db/Abisko_Soils/edeep_082010_500bp_Wheat_cntm_psm
+SEQUEST v.27 in SQT format.
+ --DB -p 2 -r 1000
+true	Use criteria
+1.8	Minimum +1 XCorr
+2.5	Minimum +2 XCorr
+3.5	Minimum +3 XCorr
+0.08	Minimum DeltCN
+1	Minimum charge state
+3	Maximum charge state
+0.0	Minimum ion proportion
+1000	Maximum Sp rank
+-1.0	Minimum Sp score
+Include	Modified peptide inclusion
+Any	Tryptic status requirement
+true	Multiple, ambiguous IDs allowed
+Ignore	Peptide validation handling
+XCorr	Purge duplicate peptides by protein
+false	Include only loci with unique peptide
+false	Remove subset proteins
+Ignore	Locus validation handling
+0	Minimum modified peptides per locus
+1000	Minimum redundancy for low coverage loci
+2	Minimum peptides per locus
+Locus	Sequence Count	Spectrum Count	Sequence Coverage	Length	MolWt	pI	Validation Status	Descriptive Name
+Unique	FileName	XCorr	DeltCN	Obs_mono_m/z	Calc_mono_m/z	PPM	Delta_amu	TotalIntensity	SpRank	SpScore	IonProportion	Redundancy	Sequence
+E1D_raw_1__154436_3	4	58	81.9%	72	7500	7.3	U	# 1956 # 2171 # 1 # ID=154436_3;partial=01;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp
+	20120806_Erio_Deep_Aug2010_27Jun_TR1_04.22361.22361.3	5.8526	0.4034	2772.4479	2772.4445	1.2211	0.0034	6048.2	1	1349.0	40.0	30	-.MLSIQTNIAALSAQNALTTTNNNLQK.S
+	20120806_Erio_Deep_Aug2010_27Jun_TR1_10.22784.22784.3	4.5965	0.4101	3275.6674	3275.6608	2.0124	0.0066	5944.2	1	884.7	28.4	7	-.MLSIQTNIAALSAQNALTTTNNNLQKSMER.L
+	20120806_Erio_Deep_Aug2010_27Jun_TR1_03.06010.06010.2	4.5641	0.462	1594.7954	1594.7956	-0.1435	-0.0002	4981.9	1	1347.9	76.7	4	R.INHAADDAAGLAISEK.M
+	20120806_Erio_Deep_Aug2010_27Jun_TR1_04.07913.07913.2	4.8897	0.2913	1384.7232	1384.7250	-1.2769	-0.0018	5676.4	1	1767.3	91.7	17	K.MQAQIGGLNQAVR.N
+E1D_raw_1__154435_1	3	41	79.3%	58	6132	7.3	U	# 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp
+	20120806_Erio_Deep_Aug2010_27Jun_TR1_04.22361.22361.3	5.8526	0.4034	2772.4479	2772.4445	1.2211	0.0034	6048.2	1	1349.0	40.0	30	-.MLSIQTNIAALSAQNALTTTNNNLQK.S
+	20120806_Erio_Deep_Aug2010_27Jun_TR1_10.22784.22784.3	4.5965	0.4101	3275.6674	3275.6608	2.0124	0.0066	5944.2	1	884.7	28.4	7	-.MLSIQTNIAALSAQNALTTTNNNLQKSMER.L
+	20120806_Erio_Deep_Aug2010_27Jun_TR1_03.06010.06010.2	4.5641	0.462	1594.7954	1594.7956	-0.1435	-0.0002	4981.9	1	1347.9	76.7	4	R.INHAADDAAGLAISEK.M
+E1D_raw_1__40591_2	3	8	74.5%	51	5250	8.6	U	# 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None
+*	20120806_Erio_Deep_Aug2010_27Jun_TR1_01.18475.18475.2	4.3739	0.5173	2140.0658	2140.0653	0.2192	0.0005	7636.1	1	1642.9	65.0	1	K.TSDVAGDGTTTATILAQSIYR.E
+*	20120806_Erio_Deep_Aug2010_27Jun_TR1_02.21883.21883.2	3.4843	0.1996	2553.2977	2553.2928	1.9293	0.0049	6903.6	1	903.4	47.9	3	K.TSDVAGDGTTTATILAQSIYREGVK.A
+*	20120806_Erio_Deep_Aug2010_27Jun_TR1_08.06194.06194.2	2.7604	0.1784	1326.7055	1326.7083	-2.1145	-0.0028	6041.5	1	860.0	66.7	4	K.AVAAGANPMELKR.G
+	Proteins	Peptide IDs	Copies
+Unfiltered	318515	400116	506301
+Redundant	1575	3555	18759
+Nonredundant	1211	2557	12384
+Classification	Nonredundant Proteins	Redundant Proteins
+Unclassified	0	0

data/spec/divvy_proteomics_spec.rb CHANGED Viewed

@@ -83,8 +83,8 @@ describe script_under_test do
     stderr.should eq("")
     answer = header+
-    ['Mstor_v4.3.2:1344','0','188','94.0','0.5','Methanoflorens_stordalmirensis_v4.3.2_01361 Methyl-coenzyme M reductase I subunit gamma ','alias1'+"\n"].join("\t")+
-    ['alias1','0','188','94.0','0.5','alias1 Methyl-coenzyme M reductase I subunit gamma ','Mstor_v4.3.2:1344'+"\n"].join("\t")
+    ['Mstor_v4.3.2:1344','0','188','0','0.0','Methanoflorens_stordalmirensis_v4.3.2_01361 Methyl-coenzyme M reductase I subunit gamma ','alias1'+"\n"].join("\t")+
+    ['alias1','0','188','0','0.0','alias1 Methyl-coenzyme M reductase I subunit gamma ','Mstor_v4.3.2:1344'+"\n"].join("\t")
     stdout.should eq(answer), test_file
   end
@@ -103,4 +103,16 @@ describe script_under_test do
       stdout.should eq(answer)
     end
   end
+  # new_format.csv has PPM and Delta_amu columns in its peptide header.
+  # In that file E1D_raw_1__154435_1 shares every one of its peptides with
+  # E1D_raw_1__154436_3 and has no unique spectra, so its estimated count is
+  # expected to be 0, while E1D_raw_1__154436_3 is credited with all 58 spectra.
+  it 'should work with the newer file format, wherever that came from' do
+    test_file = "#{path_to_script} #{TEST_DATA_DIR}/new_format.csv --trace error"
+    status, stdout, stderr = systemu test_file
+    stderr.should eq("")
+    answer = header+
+    ['E1D_raw_1__154436_3','17','41','58.0','0.8787878787878788','# 1956 # 2171 # 1 # ID=154436_3;partial=01;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp ','E1D_raw_1__154435_1'+"\n"].join("\t")+
+    ['E1D_raw_1__154435_1','0','41','0','0.0','# 3 # 176 # -1 # ID=154435_1;partial=10;start_type=ATG;rbs_motif=AGGAG;rbs_spacer=5-10bp ','E1D_raw_1__154436_3'+"\n"].join("\t")+
+    ['E1D_raw_1__40591_2','8','0','8.0','0.12121212121212122','# 705 # 857 # 1 # ID=40591_2;partial=01;start_type=ATG;rbs_motif=None;rbs_spacer=None ',"\n"].join("\t")
+    stdout.should eq(answer)
+  end
 end

metadata CHANGED Viewed

@@ -1,110 +1,97 @@
 --- !ruby/object:Gem::Specification
 name: divvy_proteomics
 version: !ruby/object:Gem::Version
-  version: 0.0.1
-  prerelease:
+  version: 0.1.0
 platform: ruby
 authors:
 - Ben J Woodcroft
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-04-13 00:00:00.000000000 Z
+date: 2013-11-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bio-logger
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: systemu
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: 2.8.0
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: 2.8.0
 - !ruby/object:Gem::Dependency
   name: rdoc
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: '3.12'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: '3.12'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: 1.0.0
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: 1.0.0
 - !ruby/object:Gem::Dependency
   name: jeweler
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: 1.8.4
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: 1.8.4
 description: divvy up spectra from DTASelect files in a somewhat parsimonious way
@@ -126,8 +113,10 @@ files:
 - bin/divvy_spectra
 - divvy_proteomics.gemspec
 - lib/divvy_proteomics.rb
+- lib/dta_select_output.rb
 - spec/data/merge_definition.csv
 - spec/data/multiply_mapped_spectra.csv
+- spec/data/new_format.csv
 - spec/data/single_protein.csv
 - spec/data/single_protein_with_aliases.csv
 - spec/data/three_proteins.csv
@@ -138,29 +127,25 @@ files:
 homepage: http://github.com/wwood/divvy_proteomics
 licenses:
 - MIT
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - - ! '>='
+  - - '>='
     - !ruby/object:Gem::Version
       version: '0'
-      segments:
-      - 0
-      hash: -659530255
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - - ! '>='
+  - - '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.24
+rubygems_version: 2.0.3
 signing_key:
-specification_version: 3
+specification_version: 4
 summary: divvy up spectra from DTASelect files in a parsimonious way
 test_files: []