RubyGems - mspire-simulator - Versions diffs - 0.2.1 → 0.3.0 - Mend

mspire-simulator 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

data/lib/ms/noise.rb CHANGED Viewed

@@ -7,7 +7,7 @@ module MS
     module_function
     def noiseify(opts,max_mz)
       # spectra is {rt => [[mzs],[ints]]}
-      desity = opts[:noise_density]
+      density = opts[:noise_density]
       max_int = opts[:noiseMaxInt]
       min_int = opts[:noiseMinInt]
       @noise = {}

data/lib/ms/rt/rtgenerator.rb CHANGED Viewed

@@ -10,49 +10,56 @@ module MS
   module Rtgenerator
     module_function
-    def generateRT(peptides, one_d)
+    def generateRT(one_d,db)
+      prog = Progress.new("Generating retention times:")
       @r_times = Sim_Spectra.r_times
       # Gets retention times from the weka model
-      peptides = MS::Weka.predict_rts(peptides)
-      MS::Weka.predict_ints(peptides)
+      MS::Weka.predict_rts(db)
+      MS::Weka.predict_ints(db)
       #-----------------------------------------------------------------
-      prog = Progress.new("Generating retention times:")
       num = 0
-      total = peptides.size
-      step = total/100.0
       max_rt = 4*(@r_times.max/5)
       r_end = max_rt + (@r_times.max/5)/2
       r_start = @r_times.max/5
-      peptides.each_with_index do |pep,ind|
-	if ind > step * (num + 1)
-	  num = (((ind+1)/total.to_f)*100).to_i
-	  prog.update(num)
-	end
+      peps = db.execute "SELECT Id,p_rt,abu,seq FROM peptides"
+      total = peps.size
+      step = total/100.0
+      peps.each do |pep|
+        ind = pep.delete_at(0)
+        init_p_rt = pep[0]
+        abu = pep[1]
+        seq = pep[2]
+        pep_p_rt = nil
+        pep_p_rt_i = nil
+        if ind > step * (num + 1)
+          num = (((ind+1)/total.to_f)*100).to_i
+          prog.update(num)
+        end
         #Fit retention times into scan times
-        p_rt = pep.p_rt * 10**-2
-	percent_time = p_rt
-	sx = RThelper.gaussian(percent_time,0.5,0.45,1.0) * Math.sqrt(pep.abu) #need to figure out what these values should be
-	pep.sx = sx
+        p_rt = init_p_rt * 10**-2
+        percent_time = p_rt
+        sx = RThelper.gaussian(percent_time,0.5,0.45,1.0) * Math.sqrt(abu) #need to figure out what these values should be
         if p_rt > 1
-          pep.p_rt = @r_times.find {|i| i >= r_end}
-          pep.p_rt_i = @r_times.index(pep.p_rt)
+          pep_p_rt = @r_times.find {|i| i >= r_end}
+          pep_p_rt_i = @r_times.index(pep_p_rt)
         else
-          pep.p_rt = @r_times.find {|i| i >= (p_rt * max_rt)}
-          pep.p_rt_i = @r_times.index(pep.p_rt)
+          pep_p_rt = @r_times.find {|i| i >= (p_rt * max_rt)}
+          pep_p_rt_i = @r_times.index(pep_p_rt)
         end
+        a = nil
+        b = nil
-        if pep.p_rt == nil
-          puts "\n\n\t#{pep} TIME-> #{p_rt*max_rt} :: Peptide not predicted in time range: try increasing run time\n\n."
+        if pep_p_rt == nil
+          puts "\n\n\t#{seq} TIME-> #{p_rt*max_rt} :: Peptide not predicted in time range: try increasing run time\n\n."
         else
           #Give peptide retention times
@@ -66,8 +73,8 @@ module MS
             tail_length = 300 * sx
           end
-          a = @r_times.find {|i| i >= (pep.p_rt-head_length)}
-          b = @r_times.find {|i| i >= (pep.p_rt+tail_length)}
+          a = @r_times.find {|i| i >= (pep_p_rt-head_length)}
+          b = @r_times.find {|i| i >= (pep_p_rt+tail_length)}
           a = @r_times.index(a)
           b = @r_times.index(b)
@@ -79,15 +86,11 @@ module MS
             b = @r_times[@r_times.length-1]
           end
-          pep.set_rts(a,b)
         end
+        db.execute "UPDATE peptides SET p_rt=#{pep_p_rt},p_rt_index=#{pep_p_rt_i},sx=#{sx},rt_a=#{a},rt_b=#{b} WHERE Id='#{ind}'"
       end
       #-----------------------------------------------------------------
       prog.finish!
-      return peptides
     end
   end
 end

data/lib/ms/rt/weka.rb CHANGED Viewed

@@ -5,13 +5,16 @@ module MS
   module Weka
     #James Dalg
     module_function
-    def predict_rts(peptides)
+    def predict_rts(db)
       #mz,charge,intensity,rt,A,R,N,D,B,C,E,Q,Z,G,H,I,L,K,M,F,P,S,T,W,Y,V,J,mass,hydro,pi
       #make arrf file to feed weka model
       data = []
-      peptides.each do |pep|
-        data<<pep.aa_counts
+      rs = db.execute "SELECT * FROM aac"
+      rs.each do |row|
+        row.delete_at(0)
+        data<<row
       end
       arff = make_rt_arff(Time.now.nsec.to_s,data)
       path = Gem.bin_path('mspire-simulator', 'mspire-simulator').split(/\//)
@@ -24,23 +27,24 @@ module MS
       count = 0
       while line = file.gets
         if line =~ /(\d*\.\d{0,3}){1}/
-          peptides[count].p_rt = line.match(/(\d*\.\d{0,3}){1}/)[0].to_f
+          p_rt = line.match(/(\d*\.\d{0,3}){1}/)[0].to_f
+          db.execute "UPDATE peptides SET p_rt=#{p_rt} WHERE Id='#{count}'"
           count += 1
         end
       end
       system("rm #{arff}.out")
-      return peptides
     end
-    def predict_ints(peptides)
+    def predict_ints(db)
       data = []
-      peptides.each do |pep|
-        array = []
-        array<<pep.mono_mz<<pep.charge<<pep.mass<<pep.p_rt
-        data << array.concat(pep.aa_counts)
+      aas = "A,R,N,D,B,C,E,Q,Z,G,H,I,L,K,M,F,P,S,T,W,Y,V,J,place_holder"
+      rs = db.execute "SELECT mono_mz, charge, mass, p_rt,#{aas} FROM peptides NATURAL JOIN aac" #JOIN aac
+      rs.each do |row|
+        data<<row
       end
       arff = make_int_arff(Time.now.nsec.to_s,data)
       path = Gem.bin_path('mspire-simulator', 'mspire-simulator').split(/\//)
@@ -53,12 +57,12 @@ module MS
       count = 0
       while line = file.gets
         if line =~ /(\d*\.\d{0,3}){1}/
-          peptides[count].p_int = line.match(/(\d*\.\d{0,3}){1}/)[0].to_f
+          p_int = line.match(/(\d*\.\d{0,3}){1}/)[0].to_f
+          db.execute "UPDATE peptides SET p_int=#{p_int} WHERE Id='#{count}'"
           count += 1
         end
       end
       system("rm #{arff}.out")
-      return peptides
     end

data/lib/ms/sim_digester.rb CHANGED Viewed

@@ -1,34 +1,32 @@
+require 'obo/ontology'
 class String
-  abu = 0
-  attr_reader :abu
-  attr_writer :abu
+  attr_reader :abu, :prot_id
+  attr_writer :abu, :prot_id
 end
 module MS
   class Sim_Digester
-    attr_reader :digested_file
-    attr_writer :digested_file
-    def initialize(digestor,pH)
-      @digestor = digestor
-      @pH = pH
-      @digested_file = ".#{Time.now.nsec.to_s}"
-      system("mkdir .m .i")
-      system("mkdir .m/A .m/R .m/N .m/D .m/C .m/E .m/Q .m/G .m/H .m/I .m/L .m/K .m/M .m/F .m/P .m/S .m/T .m/W .m/Y .m/V .m/U .m/O")
-      system("mkdir .i/A .i/R .i/N .i/D .i/C .i/E .i/Q .i/G .i/H .i/I .i/L .i/K .i/M .i/F .i/P .i/S .i/T .i/W .i/Y .i/V .i/U .i/O")
+    def initialize(opts,db)
+      @db = db
+      @db.execute "CREATE TABLE IF NOT EXISTS digested(prot_id INTEGER PRIMARY KEY,header TEXT, abu REAL, sequence TEXT, peptides TEXT)"
+      @digestor = opts[:digestor]
+      @pH = opts[:pH]
+      @missed_cleavages = opts[:missed_cleavages]
+      @modifications = Modifications.new(opts[:modifications]).modifications
+      @digested = nil
     end
-    def clean
-      system("rm -r -f .m .i")
-    end
-    def create_digested_file(file)
+    def create_digested(file)
       abundances = []
+      headers = []
       inFile = File.open(file,"r")
       seq = ""
       inFile.each_line do |sequence|
         if sequence =~ />/
+          headers<<sequence
           num = sequence.match(/\#.+/).to_s.chomp.gsub('#','')
           if num != ""
             abundances<<(num.to_f)*10.0**-2
@@ -38,7 +36,7 @@ module MS
         sequence
         seq = seq<<";"
         elsif sequence == "/n"; else
-          seq = seq<<sequence.chomp
+          seq = seq<<sequence.chomp.upcase
         end
       end
       inFile.close
@@ -47,72 +45,49 @@ module MS
       trypsin = Mspire::Digester[@digestor]
-      digested = []
-      d_file = File.open(@digested_file, "w")
+      @digested = []
       proteins.each_with_index do |prot,index|
-        dig = trypsin.digest(prot)
+        dig = trypsin.digest(prot,@missed_cleavages) # two missed cleavages for fig 6
+        @db.execute "INSERT INTO digested(header,abu,sequence,peptides) VALUES(\"#{headers[index]}\",#{abundances[index]},\"#{prot}\",'#{dig}')"
         dig.each do |d|
           d.abu = abundances[index]
-          digested<<d
+          d.prot_id = index
+          @digested<<d
         end
       end
       proteins.clear
-      digested.uniq!
-      trun_digested = []
-      if digested.length > 50000
-        50000.times do
-          trun_digested<<digested[rand(digested.length)]
-        end
-        digested.clear
-        digested = trun_digested
-      end
-      digested.each do |dig|
-        d_file.puts(dig<<"#"<<dig.abu.to_s)
-      end
-      d_file.close
-      num_digested = digested.size
-      digested.clear
+      dige = @digested.uniq!
+      num_digested = @digested.size
       puts "Number of peptides: #{num_digested}"
-      return num_digested
     end
     def digest(file)
-      num_digested = create_digested_file(file)
+      prog = Progress.new("Creating peptides '#{file}':")
+      create_digested(file)
-      d_file = File.open(@digested_file, "r")
       i = 0
-      peptides = []
-      prog = Progress.new("Creating peptides '#{file}':")
+      count = 0
       num = 0
-      total = num_digested
+      total = @digested.size
       step = total/100.0
-      d_file.each_line do |peptide_seq|
-        peptide_seq.chomp!
-        peptide_seq.abu = peptide_seq.match(/#.+/).to_s.chomp.gsub('#','').to_f
-          peptide_seq.gsub!(/#.+/,'')
-          if i > step * (num + 1)
-            num = ((i/total.to_f)*100.0).to_i
+      @digested.each do |peptide_seq|
+          if count > step * (num + 1)
+            num = ((count/total.to_f)*100.0).to_i
             prog.update(num)
           end
         charge_ratio = charge_at_pH(identify_potential_charges(peptide_seq), @pH)
         charge_f = charge_ratio.floor
         charge_c = charge_ratio.ceil
-        peptide_f = MS::Peptide.new(peptide_seq, charge_f, peptide_seq.abu) if charge_f != 0
-        peptide_c = MS::Peptide.new(peptide_seq, charge_c, peptide_seq.abu) if charge_c != 0
+        peptide_f = MS::Peptide.new(peptide_seq, charge_f, peptide_seq.abu,@db,i,peptide_seq.prot_id,@modifications) if charge_f != 0
+        i += 1 if charge_f != 0
+        peptide_c = MS::Peptide.new(peptide_seq, charge_c, peptide_seq.abu,@db,i,peptide_seq.prot_id,@modifications) if charge_c != 0
+        i += 1 if charge_c != 0
-        peptides<<peptide_f if charge_f != 0
-        peptides<<peptide_c if charge_c != 0
-        i += 1
+        count += 1
       end
       prog.finish!
-      d_file.close
-      File.delete(@digested_file)
-      return peptides
     end
   end
 end

data/lib/ms/sim_feature.rb CHANGED Viewed

@@ -5,20 +5,11 @@ require 'ms/sim_peptide'
 require 'ms/rt/rt_helper'
 require 'ms/tr_file_writer'
-class Array
-  attr_reader :ms2, :ms_level, :pre_mz, :pre_int, :pre_charge
-  attr_writer :ms2, :ms_level, :pre_mz, :pre_int, :pre_charge
-end
 module MS
   class Sim_Feature
-    def initialize(peptides,opts,one_d)
-      @features = []
-      @data = {}
-      @max_int = 0.0
+    def initialize(opts,one_d,db)
+      @db = db
       @one_d = one_d
-      @max_time = Sim_Spectra.r_times.max
       @opts = opts
       @max_mz = -1
@@ -26,113 +17,25 @@ module MS
       #------------------Each_Peptide_=>_Feature----------------------
       prog = Progress.new("Generating features:")
       num = 0
-      total = peptides.size
+      @db.execute "CREATE TABLE IF NOT EXISTS spectra(cent_id INTEGER PRIMARY KEY,pep_id INTEGER,rt REAL,mzs REAL,ints REAL,merge_id INTEGER)"
+      @cent_id = 0
+      peps = @db.execute "SELECT * FROM peptides"
+      total = peps.size
       step = total/100.0
-      peptides.each_with_index do |pep,ind|
+      peps.each do |pep|
+        ind = pep[0]
         if ind > step * (num + 1)
           num = (((ind+1)/total.to_f)*100).to_i
           prog.update(num)
         end
-        feature = getInts(pep)
-        @features<<feature
+        getInts(pep)
       end
       prog.finish!
-      #---------------------------------------------------------------
-      #-----------------Transform_to_spectra_data_for_mzml------------
-      # rt => [[mzs],[ints]]
-      prog = Progress.new("Generating MS2 & Populating structure for mzml:")
-      num = 0
-      total = @features.size
-      step = total/100.0
-      ms2_count = 0
-      seq = nil
-      @features.each_with_index do |fe,k|
-        if k > step * (num + 1)
-          num = ((k/total.to_f)*100).to_i
-          prog.update(num)
-        end
-        fe_ints = fe.ints
-        fe_mzs = fe.mzs
-        ms2_int = fe.ints.flatten.max
-        ms2 = false
-        pre_mz = nil
-        pre_charge = nil
-        fe.rts.each_with_index do |rt,i|
-          rt_mzs = []
-          rt_ints = []
-          fe.core_mzs.size.times do |j|
-            mz,int = [ fe_mzs[j][i], fe_ints[j][i] ]
-            if @max_mz < mz
-              @max_mz = mz
-            end
-            if int == nil
-              int = 0.0
-            end
-            if int > 0.9
-              rt_mzs<<mz
-              rt_ints<<int
-              if int == ms2_int and fe.sequence.size > 1
-                ms2 = true
-                pre_mz = mz
-                pre_charge = fe.charge
-              end
-            end
-          end
-          spec = nil
-          if rt_mzs.include?(nil) or rt_mzs.empty?; else
-            if @data.key?(rt)
-              ms1 = @data[rt]
-              spec = [ms1[0] + rt_mzs, ms1[1] + rt_ints]
-              spec.ms_level = ms1.ms_level
-              spec.ms2 = ms1.ms2
-            else
-              spec = [rt_mzs, rt_ints]
-            end
-            if false#ms2 and fe.sequence != seq
-              #add ms2 spec
-	      seq = fe.sequence
-              spec.ms_level = 2
-              ms2_mzs = MS::Fragmenter.new.fragment(seq)
-              ms2_ints = Array.new(ms2_mzs.size,500.to_f)
-              spec2 = [(rt + RThelper.RandomFloat(0.01,@opts[:sampling_rate] - 0.1)), ms2_mzs, ms2_ints]
-              spec2.ms_level = 2
-              spec2.pre_mz = pre_mz
-              spec2.pre_int = ms2_int
-              spec2.pre_charge = pre_charge
-              if spec.ms2 != nil
-                ms2_arr = spec.ms2
-                ms2_arr<<spec2
-                spec.ms2 = ms2_arr
-              else
-                spec.ms2 = [spec2]
-              end
-              ms2_count += 1
-            end
-            @data[rt] = spec
-          end
-          ms2 = false
-        end
-      end
-      prog.finish!
-      puts "MS2s = #{ms2_count}"
-      #---------------------------------------------------------------
     end
-    attr_reader :data, :features, :max_mz
-    attr_writer :data, :features, :max_mz
+    attr_reader :max_mz, :cent_id
+    attr_writer :max_mz, :cent_id
     # Intensities are shaped in the rt direction by a gaussian with
     # a dynamic standard deviation.
@@ -140,50 +43,55 @@ module MS
     # by a simple gaussian curve (see 'factor' below).
     #
     def getInts(pep)
-      p_int = pep.p_int + RThelper.RandomFloat(-5,2)
+      pep_id = pep[0]
+      p_int = pep[7] + RThelper.RandomFloat(-5,2)
       if p_int > 10
         p_int -= 10
       end
       predicted_int = (p_int * 10**-1) * 14183000.0
-      relative_ints = pep.core_ints
-      avg = pep.p_rt
+      low = 0.1*predicted_int
+      relative_ints = (@db.execute "SELECT ints FROM core_spec WHERE pep_id=#{pep_id}").flatten[0].gsub(/\[/,"").split(/,/).map{|val| val.to_f}
+      core_mzs = (@db.execute "SELECT mzs FROM core_spec WHERE pep_id=#{pep_id}").flatten[0].gsub(/\[/,"").split(/,/).map{|val| val.to_f}
+      avg = pep[5] #p_rt
       sampling_rate = @opts[:sampling_rate].to_f
-      tail = @opts[:tail].to_f
-      front = @opts[:front].to_f
+      wobA = Distribution::Normal.rng(@opts[:wobA].to_f,0.0114199604).call #0.0014199604 is the standard deviation from Hek_cells_100904050914 file
+      wobB = Distribution::Normal.rng(@opts[:wobB].to_f,0.01740082).call #1.20280082 is the standard deviation from Hek_cells_100904050914 file
+      tail = Distribution::Normal.rng(@opts[:tail].to_f,0.018667495).call #0.258667495 is the standard deviation from Hek_cells_100904050914 file
+      front = Distribution::Normal.rng(@opts[:front].to_f,0.01466692).call #4.83466692 is the standard deviation from Hek_cells_100904050914 file
+      # These number didn't work. May need to get more samples or figure something else out. For now this will give us some
+      # meta variance in any case
       mu = @opts[:mu].to_f
       index = 0
-      sx = pep.sx
-      sy = (sx**-1) * Math.sqrt(pep.abu)
+      sx = pep[9]
+      sy = (sx**-1) * Math.sqrt(pep[8]) #abu
       shuff = RThelper.RandomFloat(0.05,1.0)
-      pep.core_mzs.each do |mzmu|
-        fin_mzs = []
-        fin_ints = []
+      core_mzs.each do |mzmu|
         relative_abundances_int = relative_ints[index]
         t_index = 1
-        pep.rts.each_with_index do |rt,i|
+        (Sim_Spectra::r_times[pep[10]..pep[11]]).each_with_index do |rt,i|
           if !@one_d
             #-------------Tailing-------------------------
             shape = (tail * (t_index / sx)) + front
-            fin_ints << (RThelper.gaussian((t_index / sx) ,mu ,shape,100.0))
+            int = (RThelper.gaussian((t_index / sx) ,mu ,shape,100.0))
             t_index += 1
             #---------------------------------------------
           else
             #-----------Random 1d data--------------------
-            fin_ints<<(relative_abundances_int * ints_factor) * shuff
+            int = (relative_abundances_int * ints_factor) * shuff
             #---------------------------------------------
           end
-          if fin_ints[i] < 0.01
-            fin_ints[i] = RThelper.RandomFloat(0.001,0.4)
+          if int < 0.01
+            int = RThelper.RandomFloat(0.001,0.4)
           end
 =begin
@@ -196,38 +104,37 @@ module MS
     end
 =end
-if fin_ints[i] > 0.4
-  #-------------Jagged-ness---------------------
-  sd = (@opts[:jagA] * (1-Math.exp(-(@opts[:jagC]) * fin_ints[i])) + @opts[:jagB])/2
-  diff = (Distribution::Normal.rng(0,sd).call)
-  fin_ints[i] = fin_ints[i] + diff
-  #---------------------------------------------
-end
-#-------------mz wobble-----------------------
-y = fin_ints[i]
-wobble_mz = nil
-if y > 0
-  wobble_int = @opts[:wobA]*y**(@opts[:wobB])
-  wobble_mz = Distribution::Normal.rng(mzmu,wobble_int).call
-  if wobble_mz < 0
-    wobble_mz = 0.01
-  end
+          if int > 0.4
+          #-------------Jagged-ness---------------------
+          sd = (@opts[:jagA] * (1-Math.exp(-(@opts[:jagC]) * int)) + @opts[:jagB])/2
+          diff = (Distribution::Normal.rng(0,sd).call)
+          int += diff
+          #---------------------------------------------
+          end
-  fin_mzs<<wobble_mz
-end
-#---------------------------------------------
+          #-------------mz wobble-----------------------
+          wobble_mz = nil
+          if int > 0
+            wobble_int = wobA*int**wobB
+            wobble_mz = Distribution::Normal.rng(mzmu,wobble_int).call
+            if wobble_mz < 0
+              wobble_mz = 0.01
+            end
+          end
+          #---------------------------------------------
-fin_ints[i] = fin_ints[i]*(predicted_int*(relative_abundances_int*10**-2)) * sy
+          int = int*(predicted_int*(relative_abundances_int*10**-2)) * sy
+          if int > low.abs and wobble_mz > 0
+            @db.execute "INSERT INTO spectra VALUES(#{@cent_id},#{pep_id},#{rt},#{wobble_mz},#{int},NULL)"
+            @cent_id += 1
+            if @max_mz < wobble_mz
+              @max_mz = wobble_mz
+            end
+          end
         end
-        pep.insert_ints(fin_ints)
-        pep.insert_mzs(fin_mzs)
         index += 1
       end
-      return pep
     end
   end
 end