RubyGems - mspire-simulator - Versions diffs - 0.2.1 → 0.3.0 - Mend

mspire-simulator 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

data/lib/ms/sim_modifications.rb ADDED

@@ -0,0 +1,40 @@
+require 'obo/ontology'
+class Modifications
+  def initialize(mods)
+    @modifications = mods
+    @modifications = @modifications.split(/_/)
+    if @modifications[0] != "false"
+      get_mods
+    end
+    return @modifications
+  end
+  def get_mods()
+    mods = {}
+    obo = Obo::Ontology.new(Obo::Ontology::DIR + '/mod.obo')
+    @modifications.each do |mod|
+      diff = nil
+      residue = mod[9..mod.size-1]
+      mod = (obo.id_to_element[mod[0..8]]).tagvalues
+      xref = mod['xref']
+      xref.each do |x|
+        if x =~ /DiffFormula/
+          diff = (x.split(/"/))[1]
+        end
+      end
+      if mods[residue] == nil
+        mods[residue] = [[mod['id'][0],diff]]
+      else
+        mds = mods[residue]
+        mds<<[mod['id'][0],diff]
+        mods[residue] = mds
+      end
+    end
+    @modifications = mods
+  end
+  attr_reader :modifications
+  attr_writer :modifications
+end

data/lib/ms/sim_peptide.rb CHANGED

@@ -1,117 +1,45 @@
 require 'mspire/isotope/distribution'
 module MS
   class Peptide
-    def initialize(sequence, charge, abu = 1.0)
+    def initialize(sequence, charge, abu = 1.0,db,id,prot_id,modifications)
       @abu = abu
       @p_rt = 0
       @p_int = 0
       @rts = []
-      @charge = charge #this is saved in the file name as well
+      @charge = charge
+      @mods = modifications
+      spec = calcSpectrum(sequence)
-      spec = calcSpectrum(sequence, @charge)
+      # TODO Ryan: alter this to handle variable and static mass modifications...Add it from the Katamari code
-      # TODO Ryan: alter this to handle variable and static mass modifications... Add it from the Katamari code
+      #core mzs, ints
+      db.execute "INSERT INTO core_spec VALUES(#{id},'#{spec.mzs}','#{spec.intensities}')"
-      @core_ints = spec.intensities.clone
-      @core_mzs = spec.mzs.clone
-      @mzs_file = ".m/#{sequence[0]}/#{sequence[0...15]}_#{charge}"
-      @ints_file = ".i/#{sequence[0]}/#{sequence[0...15]}_#{charge}"
-      file = File.open(@mzs_file, "w")
-      file.puts(sequence)
-      file.close
       @mono_mz = spec.mzs[spec.intensities.index(spec.intensities.max)]
       @mass = @mono_mz * @charge
       #U,O,X ???
+      @aa_counts = []
+      stm = "INSERT INTO aac VALUES(#{id},"
       amino_acids = ['A','R','N','D','B','C','E','Q','Z','G','H','I',
         'L','K','M','F','P','S','T','W','Y','V','J']
-      @aa_counts = amino_acids.map do |aa|
-        sequence.count(aa)
+      amino_acids.map do |aa|
+        count = sequence.count(aa)
+        stm<<"#{count},"
+        count
       end
-      @aa_counts<<0.0
-    end
-    attr_reader :mass, :charge, :mono_mz, :core_mzs, :p_rt, :p_int, :core_ints, :hydro, :pi, :aa_counts, :p_rt_i, :abu, :sx
-    attr_writer :mass, :charge, :mono_mz, :core_mzs, :p_rt, :p_int, :core_ints, :hydro, :pi, :aa_counts, :p_rt_i, :abu, :sx
-    def to_s
-      file = File.open(@mzs_file,"r")
-      seq = file.gets.chomp
-      file.close
-      "Peptide: #{seq}"
-    end
-    def sequence
-      file = File.open(@mzs_file,"r")
-      seq = file.gets.chomp
-      file.close
-      seq
+      stm<<"0.0)" #place holder for predicted values
+      stm = db.prepare(stm)
+      stm.execute
+      stm.close if stm
+      db.execute "INSERT INTO peptides VALUES(#{id},'#{sequence}', #{@mass}, #{charge}, #{@mono_mz}, #{@p_rt},NULL, #{@p_int}, #{@abu}, NULL,NULL,NULL,#{prot_id})"
     end
-    #---------------------------------------------------------------------------
-    def ints
-      file = File.open(@ints_file, "r")
-      line = file.gets.chomp.split(/;/)
-      file.close
-      ints = []
-      line.each do |iso|
-        ints<<iso.chomp.split(/,/).map!{|fl| fl.to_f}
-      end
-      return ints
-    end
-    def insert_ints(arr)
-      file = File.open(@ints_file, "a")
-      arr.each do |val|
-        file.print("#{val},")
-      end
-      file.print(";")
-      file.close
-    end
-    def mzs
-      file = File.open(@mzs_file, "r")
-      line = file.gets
-      line = file.gets.chomp.split(/;/)
-      file.close
-      mzs = []
-      line.each do |iso|
-        mzs<<iso.chomp.split(/,/).map!{|fl| fl.to_f}
-      end
-      return mzs
-    end
-    def insert_mzs(arr)
-      file = File.open(@mzs_file, "a")
-      arr.each do |val|
-        file.print("#{val},")
-      end
-      file.print(";")
-      file.close
-    end
-    def rts
-      return Sim_Spectra::r_times[@rts[0]..@rts[1]]
-    end
-    def set_rts(a,b)
-      @rts = [a,b]
-    end
-    def delete
-      if File.exists?(@mzs_file)
-        File.delete(@mzs_file)
-      end
-      if File.exists?(@ints_file)
-        File.delete(@ints_file)
-      end
-    end
-    #---------------------------------------------------------------------------
     # Calculates theoretical specturm
     #
-    def calcSpectrum(seq, charge)
+    def calcSpectrum(seq)
       #isotope.rb from Dr. Prince
       atoms = countAtoms(seq)
@@ -131,7 +59,7 @@ module MS
       var<<"Se"
       var<<atoms[6].to_s
-      mf = Mspire::MolecularFormula.from_string(var, charge)
+      mf = Mspire::MolecularFormula.from_string(var, @charge)
       spec = Mspire::Isotope::Distribution.spectrum(mf, :max, 0.001)
       spec.intensities.map!{|i| i = i*100.0}
@@ -143,6 +71,7 @@ module MS
     # Counts the number of each atom in the peptide sequence.
     #
     def countAtoms(seq)
+      atom_indexes = {'O' => 0,'N' => 1,'C' => 2,'H' => 3,'S' => 4,'P' => 5,'Se' => 6}
       o = 0
       n = 0
       c = 0
@@ -150,9 +79,12 @@ module MS
       s = 0
       p = 0
       se = 0
+      @charge.times {h += 1}
+      atom_counts = [(o + 1),n,c,(h + 2),s,p,se]
       seq.each_char do |aa|
         #poly amino acids
+        #maybe in the future ignore fringe case amino acids
         #"X" is for any (I exclude uncommon "U" and "O")
         if aa == "X"
           aas = Mspire::Isotope::AA::ATOM_COUNTS.keys[0..19]
@@ -166,20 +98,46 @@ module MS
           aas = ["Q","E"]
           aa = aas[rand(2)]
         end
+        #perform modification for residue
+        if @mods != nil
+          if @mods[aa] != nil
+            mods = @mods[aa]
+            mods.each do |mod|
+              mod[1].split(/\s/).each_slice(2) do |sl|
+                atom_counts[atom_indexes[sl[0]]] = atom_counts[atom_indexes[sl[0]]] + sl[1].to_i
+              end
+            end
+          elsif seq[0] == aa and @mods["CT"] != nil#N-terminus
+            mods = @mods["CT"]
+            mods.each do |mod|
+              mod[1].split(/\s/).each_slice(2) do |sl|
+                atom_counts[atom_indexes[sl[0]]] = atom_counts[atom_indexes[sl[0]]] + sl[1].to_i
+              end
+            end
+          elsif seq[-1] == aa and @mods["NT"] != nil#C-terminus
+            mods = @mods["NT"]
+            mods.each do |mod|
+              mod[1].split(/\s/).each_slice(2) do |sl|
+                atom_counts[atom_indexes[sl[0]]] = atom_counts[atom_indexes[sl[0]]] + sl[1].to_i
+              end
+            end
+          end
+        end
         if aa !~ /A|R|N|D|C|E|Q|G|H|I|L|K|M|F|P|S|T|W|Y|V|U|O/
           puts "No amino acid match for #{aa}"
         else
-          o = o + Mspire::Isotope::AA::ATOM_COUNTS[aa][:o]
-          n = n + Mspire::Isotope::AA::ATOM_COUNTS[aa][:n]
-          c = c + Mspire::Isotope::AA::ATOM_COUNTS[aa][:c]
-          h = h + Mspire::Isotope::AA::ATOM_COUNTS[aa][:h]
-          s = s + Mspire::Isotope::AA::ATOM_COUNTS[aa][:s]
-          p = p + Mspire::Isotope::AA::ATOM_COUNTS[aa][:p]
-          se = se + Mspire::Isotope::AA::ATOM_COUNTS[aa][:se]
+          atom_counts[0] = atom_counts[0] + Mspire::Isotope::AA::ATOM_COUNTS[aa][:o]
+          atom_counts[1] = atom_counts[1] + Mspire::Isotope::AA::ATOM_COUNTS[aa][:n]
+          atom_counts[2] = atom_counts[2] + Mspire::Isotope::AA::ATOM_COUNTS[aa][:c]
+          atom_counts[3] = atom_counts[3] + Mspire::Isotope::AA::ATOM_COUNTS[aa][:h]
+          atom_counts[4] = atom_counts[4] + Mspire::Isotope::AA::ATOM_COUNTS[aa][:s]
+          atom_counts[5] = atom_counts[5] + Mspire::Isotope::AA::ATOM_COUNTS[aa][:p]
+          atom_counts[6] = atom_counts[6] + Mspire::Isotope::AA::ATOM_COUNTS[aa][:se]
         end
       end
-      return (o + 1),n,c,(h + 2) ,s,p,se
+      return atom_counts
     end
   end
 end

data/lib/ms/sim_spectra.rb CHANGED

@@ -6,8 +6,8 @@ require 'ms/sim_feature'
 module MS
   class Sim_Spectra
-    def initialize(peptides,opts,one_d = false)
-      @data
+    def initialize(opts,one_d = false,db)
+      @opts = opts
       @max_mz
       sampling_rate = opts[:sampling_rate]
       run_time = opts[:run_time]
@@ -17,53 +17,39 @@ module MS
       @@r_times = []
       num_of_spec = sampling_rate*run_time
       spec_time = 1/sampling_rate
-      num_of_spec.to_i.times do
+      num_of_spec.to_i.times do |k|
         @@r_times<<spec_time+RThelper.RandomFloat(-var,var)
         spec_time = spec_time + (1/sampling_rate)
       end
       @@r_times = MS::Noise.spec_drops(drop_percentage)
-      pre_features = MS::Rtgenerator.generateRT(peptides,one_d)
+      MS::Rtgenerator.generateRT(one_d,db)
       #Features
-      features_o = MS::Sim_Feature.new(pre_features,opts,one_d)
-      @features = features_o.features
-      @data = features_o.data
-      @max_mz = features_o.max_mz
-      @spectra = @data.clone
-      @noise = nil
+      @features_o = MS::Sim_Feature.new(opts,one_d,db)
+      @max_mz = @features_o.max_mz
     end
-    def noiseify
-      @noise = MS::Noise.noiseify(opts,@max_mz)
-      @@r_times.each do |k|
-        s_v = @data[k]
-        n_v = @noise[k]
-        if s_v != nil
-          @spectra[k] = [s_v[0]+n_v[0],s_v[1]+n_v[1]]
-        else
-          @spectra[k] = [n_v[0],n_v[1]]
+    def noiseify(db)
+      @noise = MS::Noise.noiseify(@opts,@max_mz)
+      cent_id = @features_o.cent_id + 1
+      @noise.each do |key,val|
+        mzs = val[0]
+        ints = val[1]
+        mzs.each_with_index do |mz,index|
+          db.execute "INSERT INTO spectra VALUES(#{cent_id},NULL,#{key},#{mz},#{ints[index]},NULL)"
+          cent_id += 1
         end
       end
-      return @noise
     end
     def self.r_times
       @@r_times
     end
-    attr_reader :data, :max_mz, :spectra, :noise, :features
-    attr_writer :data, :max_mz, :spectra, :noise, :features
+    attr_reader :max_mz
+    attr_writer :max_mz
   end
 end
-#charge ratio: take both charge states, determine pH effective
-#more small peaks from lesser charge states
-#one_d
-#fit to other labs data - different machine

data/lib/ms/sim_trollop.rb CHANGED

@@ -27,19 +27,20 @@ module MS
                 trypsin,\n \t\tv8_e_trypsin,
                 v8_de_trypsin",
                 :default => "trypsin"
+                opt :missed_cleavages, "Number of missed cleavages during digestion", :default => 2
                 opt :sampling_rate, "How many scans per second", :default => 0.5
                 opt :run_time, "Run time in seconds", :default => 1000.0
                 opt :noise, "Noise on or off", :default => "true"
                 opt :noise_density, "Determines the density of white noise", :default => 10
-		opt :noiseMaxInt, "The max noise intensity level", :default => 1000
-		opt :noiseMinInt, "The minimum noise intensity level", :default => 50
+                opt :noiseMaxInt, "The max noise intensity level", :default => 1000
+                opt :noiseMinInt, "The minimum noise intensity level", :default => 50
                 opt :pH, "The pH that the sample is in - for determining charge", :default => 2.6
                 opt :out_file, "Name of the output file", :default => "test.mzml"
                 opt :contaminants, "Fasta file containing contaminant sequences", :default => "testFiles/contam/hum_keratin.fasta"
                 opt :dropout_percentage, "Defines the percentage of random dropouts in the run. 0.0 <= percentage < 1.0", :default => 0.01
                 opt :shuffle, "Option shuffles the scans to simulate 1d data", :default => "false"
                 opt :one_d, "Turns on one dimension simulation; run_time is automatically set to 300.0", :default => "false"
-                opt :truth, "Determines truth file type; false gives no truth file; one of: xml or csv", :default => "false"
+                opt :truth, "Determines truth file type; false gives no truth file; one of: 'xml' or 'csv' or 'xml_csv' (for both)", :default => "csv"
                 opt :front, "Fronting chromatography parameter", :default => 6.65
                 opt :tail, "Tailing chromatography parameter", :default => 0.30
                 opt :mu, "Expected value of the chromatography curve", :default => 25.0
@@ -53,7 +54,11 @@ module MS
                 opt :mzml, "Mzml file to extract simulation parameters from", :default => "nil"
                 opt :generations, "If an mzml file is provided this specifies the number of generations for the curve fitting algorithm", :default => 30000
                 opt :mass_label, "Specify a mass tag pattern", :default => 0
-                opt :modifications, "Use a specific modifications file, or read them from a header of the fasta file, perhaps... TBD..."
+                opt :ms2s, "Number of peptide ms2s to perform on each scan", :default => 1
+                opt :ms2, "Turn on/off ms2 (true == on)", :default => "true"
+                opt :databaseName, "Name of database file", :default => "peptides_[Time.now.sec]"
+                opt :memory, "Determines whether to store the database in memory or write to file (false == write to file) Note: if true no database file will be accessible after simulation", :default => "false"
+                opt :modifications, "To define residue or termini modifications. Enter a string Id1R1_Id2R2_ ... where Idi is a modification Id from http://psidev.cvs.sourceforge.net/viewvc/psidev/psi/mod/data/PSI-MOD.obo and Ri is the residue/terminus to apply it to (c-term = CT, n-term = NT)", :default => "false"
       end

data/lib/ms/tr_file_writer.rb CHANGED

@@ -1,188 +1,71 @@
 require 'progress'
-#if m/z value is in "[m/z, percentage contributed to peak]" it's a
-#merged peak.
 module MS
   class Txml_file_writer
-    def self.write(features,spectra,file_name)
-      @spectra = spectra
+    def self.write(db,file_name,opts)
+      prog = Progress.new("Writing xml:")
       file = File.open("#{file_name}_truth.xml","w")
-      r_times = spectra.keys.sort
+      peps = db.execute "SELECT * FROM peptides"
       file.puts "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
       file.puts "<simulated_peptides>"
-      total = features.size.to_f
-      prog = Progress.new("Writing xml:")
+      file.puts "<simulator_options>\n"
+      opts.each do |k,v|
+        file.puts "\t#{k}=#{v},"
+      end
+      file.puts "</simulator_options>\n"
+      total = peps.size.to_f
       num = 0
       step = total/100.0
-      features.each_with_index do |fe,k|
-        sequence = fe.sequence
-        charge = fe.charge
-        mzs = fe.mzs
-        ints = fe.ints
-        rts = fe.rts
+      peps.each do |pep|
+        k = pep[0]
         if k > step * (num + 1)
           num = (((k/total)*100).to_i)
           prog.update(num)
         end
+        sequence = pep[1]
+        charge = pep[3]
+        cents = db.execute "SELECT * FROM spectra WHERE pep_id=#{k}"
         file.puts "\t<simulated_peptide sequence=\"#{sequence}\" charge=\"#{charge.round}\">"
-        mzs.each_with_index do |mzs,i|
-          tags = ""
-          centroids = ""
-          tags<<"\t\t<lc_centroids isotopic_index=\"#{i}\">"
-          mzs.each_with_index do |mz,ind|
-            if ints[i][ind] > 0.9
-              index = get_ind(mz,rts[ind])
-              centroids<<"#{r_times.index(rts[ind])},#{index.inspect};"
-            end
-          end
-          if centroids != ""
-            tags<<centroids
-            tags<<"</lc_centroids>\n"
-            file<<tags
-          end
+        tags = ""
+        tags<<"\t\t<centroids>\n"
+        centroids = ""
+        cents.each do |cent|
+          centroids<<"\t\t\tcent_id=#{cent[0]},pep_id=#{cent[1]},rt=#{cent[2]},mz=#{cent[3]},int=#{cent[4]},merge_id=#{cent[5]}\n"
         end
+        tags<<centroids
+        tags<<"\t\t</centroids>\n"
+        file<<tags
         file.puts "\t</simulated_peptide>"
       end
       file.puts "</simulated_peptides>"
       file.close
       prog.finish!
     end
-    def self.get_ind(mz,rt)
-      index = nil
-      if @spectra[rt] != nil
-        mzs = @spectra[rt][0]
-        ints = @spectra[rt][1]
-        mzs.each_with_index do |m, i|
-          if m == mz
-            index = i
-          elsif m.class == Hash
-            if ind = m.values[0].index(mz)
-              index = [i,m.keys[0][ind+1]]
-            end
-          end
-        end
-      end
-      return index
-    end
   end
   class Tcsv_file_writer
-    def self.write(full_spectra,spectra,noise,features,file_name)
-      @spectra = full_spectra
-      #create indices for real peaks
-      ind_hash = create_indicies(features)
-      #create data structure with indices
-      data = data_with_indicies(full_spectra,spectra,noise,ind_hash)
-      #group by retention time
-      data = data.group_by{|d| d[0]}
+    def self.write(db,file_name,opts)
+      prog = Progress.new("Writing csv:")
+      spectra = db.execute "SELECT * FROM spectra AS S JOIN peptides AS P ON S.pep_id=P.Id"
+      total = spectra.size
       #write
       file = File.open("#{file_name}_truth.csv","w")
-      file.puts "rt,mz,int,index"
-      total = data.size.to_f
+      file.puts "simulator_options=#{opts.inspect}"
+      file.puts "rt,mz,int,centroid_id,merge_id,peptide_id,protien_id,seq,charge,abu"
       count = 0
-      prog = Progress.new("Writing csv(process 2 of 2):")
       num = 0
       step = total/100
-      data.each_value do |val|
-        if count > step * (num + 1)
-          num = (((count/total)*100).to_i)
-          prog.update(num)
-        end
-        val.each do |a|
-          if a[3] >= 1
-            file.puts "#{a[0]},#{a[1]},#{a[2]},#{a[3]}"
-          else
-            file.puts "#{a[0]},#{a[1]},#{a[2]},#{0}"
-          end
-        end
-        count += 1
+      spectra.each do |cent|
+        file.puts "#{cent[2]},#{cent[3]},#{cent[4]},#{cent[0]},#{cent[5]},#{cent[1]},#{cent[18]},#{cent[7]},#{cent[9]},#{cent[14]}"
       end
       file.close
       prog.finish!
     end
-    def self.get_merged_mz(mz,rt)
-      m_mz = nil
-      int = nil
-      mzs = @spectra[rt][0]
-      ints = @spectra[rt][1]
-      mzs.each_with_index do |m, i|
-        if m == mz
-          m_mz = mz
-          int = ints[i]
-        elsif m.class == Hash
-          if ind = m.values[0].index(mz)
-            m_mz = [m.keys[0][0],m.keys[0][ind+1]]
-            int = ints[i].flatten.inject(:+)
-          end
-        end
-      end
-      return m_mz,int
-    end
-    def self.create_indicies(features)
-      ind_hash = {}
-      features.each_with_index do |pep,i|
-        pep.mzs.each_with_index do |m_ar,j|
-          m_ar.each do |mz|
-            ind_hash[mz] = "#{i + 1}.#{j + 1}".to_f
-          end
-        end
-      end
-      return ind_hash
-    end
-    def self.data_with_indicies(full_spectra,spectra,noise,ind_hash)
-      count = 1
-      time_i = 0.0
-      data = []
-      total = spectra.length
-      prog = Progress.new("Writing csv(process 1 of 2):")
-      num = 0
-      step = total/100
-      spectra.each do |k,v|
-        if time_i > step * (num + 1)
-          num = (((time_i/total)*100).to_i)
-          prog.update(num)
-        end
-        merged_d = full_spectra[k]
-        merged_mzs = merged_d[0]
-        merged_ints = merged_d[1]
-        if noise != "false"
-          n_data = noise[k]
-        end
-        if v != nil
-          v.each_slice(2) do |m,i|
-            m.each_with_index do |mz,index|
-              peak_index = ind_hash[mz]
-              mz,int = get_merged_mz(mz,k)
-              data<<[k,mz.inspect,int,peak_index]
-            end
-          end
-        end
-        if noise != "false"
-          n_data.each_slice(2) do |m,i|
-            m.each_with_index do |mz,index|
-              mz,int = get_merged_mz(mz,k)
-              data<<[k,mz.inspect,int,0]
-            end
-          end
-        end
-        time_i += 1
-      end
-      return data
-    end
   end
 end