mspire-simulator 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.txt +22 -0
- data/README.rdoc +17 -0
- data/Rakefile +51 -0
- data/VERSION +1 -0
- data/bin/mspire-simulator +125 -0
- data/bin/sim_mail.rb +26 -0
- data/bin/weka/M5P.model +0 -0
- data/bin/weka/M5Rules.model +0 -0
- data/bin/weka/weka.jar +0 -0
- data/lib/ms/curvefit/curve_fit_helper.rb +152 -0
- data/lib/ms/curvefit/fit_graph.rb +84 -0
- data/lib/ms/curvefit/mzml_reader.rb +28 -0
- data/lib/ms/curvefit.rb +120 -0
- data/lib/ms/isoelectric_calc.rb +122 -0
- data/lib/ms/merger.rb +101 -0
- data/lib/ms/mzml_wrapper.rb +67 -0
- data/lib/ms/noise.rb +51 -0
- data/lib/ms/rt/rt_helper.rb +31 -0
- data/lib/ms/rt/rtgenerator.rb +81 -0
- data/lib/ms/rt/weka.rb +150 -0
- data/lib/ms/sim_digester.rb +92 -0
- data/lib/ms/sim_feature.rb +175 -0
- data/lib/ms/sim_peptide.rb +182 -0
- data/lib/ms/sim_spectra.rb +70 -0
- data/lib/ms/sim_trollop.rb +68 -0
- data/lib/ms/tr_file_writer.rb +175 -0
- data/lib/progress.rb +24 -0
- data/mspire-simulator.gemspec +103 -0
- data/spec/file_writer_spec.rb +74 -0
- data/spec/merger_spec.rb +23 -0
- data/spec/ms-simulate_spec.rb +9 -0
- data/spec/peptide_spec.rb +16 -0
- data/spec/progress_spec.rb +22 -0
- data/spec/spec_helper.rb +11 -0
- data/spec/spectra_spec.rb +111 -0
- data/testFiles/contam/hum_keratin.fasta +11 -0
- metadata +246 -0
@@ -0,0 +1,122 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# http://isoelectric.ovh.org/files/practise-isoelectric-point.html#mozTocId496531
|
3
|
+
# Taken from Ryan's github repo
|
4
|
+
|
5
|
+
# Tolerance, in pH units, at which the bisection in calc_PI stops.
Precision = 0.001

# Three reference values per one-letter residue code.
# Within this file, index 0 is read for the first residue of a peptide
# (stored as n_term), index 1 for the last residue (stored as c_term) and
# index 2 is the side-chain value used by charge_at_pH. nil marks residues
# that have no third value.
ResidueTable = {
  K: [2.18, 8.95, 10.53],
  E: [2.19, 9.67, 4.25],
  D: [1.88, 9.60, 3.65],
  H: [1.82, 9.17, 6.00],
  R: [2.17, 9.04, 12.48],
  Q: [2.17, 9.13, nil],
  N: [2.02, 8.80, nil],
  C: [1.96, 10.28, 8.18],
  T: [2.11, 9.62, nil],
  S: [2.21, 9.15, nil],
  W: [2.38, 9.39, nil],
  Y: [2.20, 9.11, 10.07],
  F: [1.83, 9.13, nil],
  M: [2.28, 9.21, nil],
  I: [2.36, 9.68, nil],
  L: [2.36, 9.60, nil],
  V: [2.32, 9.62, nil],
  P: [1.99, 10.96, nil],
  A: [2.34, 9.69, nil],
  G: [2.34, 9.60, nil],
  # Fringe cases (B and Z): these are harder to calculate pIs for.
  B: [1.95, 9.20, 3.65],
  Z: [2.18, 9.40, 4.25],
  X: [2.20, 9.40, nil],
  # Only the pKr was found for U, so the other two values are Cysteine's.
  U: [1.96, 10.28, 5.20]
}

# Per-peptide record: the upper-cased sequence, the two terminal values taken
# from ResidueTable, one counter per tracked residue class, and a pi slot.
PepCharges = Struct.new(
  :seq,
  :n_term, :c_term,
  :y_num, :c_num, :k_num, :h_num, :r_num, :d_num, :e_num, :u_num,
  :polar_num, :hydrophobic_num,
  :pi
)
|
34
|
+
# Residue letter => PepCharges counter incremented when that letter occurs.
# Letters that are not listed here are not counted.
ResidueCounterFields = {
  "Y" => :y_num, "C" => :c_num, "K" => :k_num, "H" => :h_num,
  "R" => :r_num, "D" => :d_num, "E" => :e_num, "U" => :u_num,
  "S" => :polar_num, "T" => :polar_num, "N" => :polar_num, "Q" => :polar_num,
  "A" => :hydrophobic_num, "V" => :hydrophobic_num, "I" => :hydrophobic_num,
  "L" => :hydrophobic_num, "M" => :hydrophobic_num, "F" => :hydrophobic_num,
  "W" => :hydrophobic_num, "G" => :hydrophobic_num, "P" => :hydrophobic_num
}

# Builds the PepCharges record for a peptide sequence.
#
# The sequence is upper-cased, the terminal values are looked up in
# ResidueTable for the first and last residue, and every residue is tallied
# into its counter. An empty sequence is echoed to stdout; a first or last
# residue without a ResidueTable entry aborts the process, printing the
# sequence.
#
# NOTE(review): n_term is read from column 0 and c_term from column 1 of
# ResidueTable -- confirm the column meaning against the table's source.
#
# str     - peptide sequence (any letter case)
# returns - the populated PepCharges structure
def identify_potential_charges(str)
  string = str.upcase
  first = string[0]
  last = string[-1]
  puts string if first.nil? || last.nil?

  begin
    n_term = ResidueTable[first.to_sym][0]
    c_term = ResidueTable[last.to_sym][1]
    # Eleven zeroes: ten residue counters plus the pi slot.
    out = PepCharges.new(string, n_term, c_term, *Array.new(11, 0))
  rescue NoMethodError
    abort string
  end

  string.each_char do |letter|
    field = ResidueCounterFields[letter]
    out[field] += 1 if field
  end
  out
end # Returns the PepCharges structure
|
69
|
+
|
70
|
+
# Net charge of a peptide at the given pH.
#
# Negative contributions come from the c_term value and the D, E, C and Y
# counts; positive contributions from the n_term value and the H, K and R
# counts. Each group is weighted by its count and uses the side-chain value
# (index 2) from ResidueTable. The u_num counter is not used here.
#
# pep_charges - a PepCharges record (see identify_potential_charges)
# pH          - pH at which to evaluate the charge
# returns     - the summed charge
def charge_at_pH(pep_charges, pH)
  side_chain = lambda { |residue| ResidueTable[residue][2] }

  negative_groups = [
    [1, pep_charges.c_term],
    [pep_charges.d_num, side_chain.call(:D)],
    [pep_charges.e_num, side_chain.call(:E)],
    [pep_charges.c_num, side_chain.call(:C)],
    [pep_charges.y_num, side_chain.call(:Y)]
  ]
  positive_groups = [
    [1, pep_charges.n_term],
    [pep_charges.h_num, side_chain.call(:H)],
    [pep_charges.k_num, side_chain.call(:K)],
    [pep_charges.r_num, side_chain.call(:R)]
  ]

  terms = negative_groups.map { |count, pk| -count / (1 + 10**(pk - pH)) } +
          positive_groups.map { |count, pk| count / (1 + 10**(pH - pk)) }

  # Summed left to right, negative terms first, starting from zero.
  terms.inject(0) { |total, term| total + term }
end
|
83
|
+
|
84
|
+
|
85
|
+
# Estimates the pH at which charge_at_pH crosses zero, by bisection.
#
# The search starts at pH 8 inside the bracket [0.0, 14.0]. A negative charge
# moves the estimate halfway towards the lower bound, otherwise halfway
# towards the upper bound. The loop stops once the estimate is within
# Precision of either bound.
#
# pep_charges - a PepCharges record
# returns     - the final pH estimate
def calc_PI(pep_charges)
  lower = 0.0
  upper = 14.0
  estimate = 8
  net_charge = charge_at_pH(pep_charges, estimate)

  while estimate - lower > Precision && upper - estimate > Precision
    if net_charge < 0.0
      # Too basic: the current estimate becomes the new upper bound.
      upper = estimate
      estimate -= (estimate - lower) / 2
    else
      # Too acidic (or exactly neutral): raise the lower bound instead.
      lower = estimate
      estimate += (upper - estimate) / 2
    end
    net_charge = charge_at_pH(pep_charges, estimate)
  end

  estimate
end
|
104
|
+
#pepcharges =[]
|
105
|
+
=begin
|
106
|
+
# RUN the ENTRY FILE HERE
|
107
|
+
pi = []
|
108
|
+
io = File.open(ARGV.shift, 'r')
|
109
|
+
io.each_line do |line|
|
110
|
+
pi << calc_PI(identify_potential_charges(line[/^([A-Z]+):.*/]))
|
111
|
+
end
|
112
|
+
=end
|
113
|
+
=begin
|
114
|
+
pIes = []
|
115
|
+
pepcharges.each do |a|
|
116
|
+
pIes << [a, calc_PI(a)]
|
117
|
+
end
|
118
|
+
=end
|
119
|
+
#out_pi = pepcharges.map {|a| calc_PI(a)}
|
120
|
+
|
121
|
+
#require 'yaml'
|
122
|
+
#File.open('pi_list.yml', 'w') {|f| YAML.dump( pi, f) }
|
data/lib/ms/merger.rb
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
require_relative '../progress'
|
2
|
+
|
3
|
+
# Merges peaks of a spectrum whose m/z values lie within a given distance of
# each other. Spectra are hashes of rt => [[mzs], [ints]].
#
# A merged peak is held as an intermediate record until Merger.compact runs:
# its m/z entry is a Hash { [weighted_mz, frac1, frac2] => [mz_a, mz_b] } and
# its intensity entry is the Array [int_a, int_b].
class Merger
  # Returns the m/z of a peak entry: the weighted m/z stored in the key of an
  # intermediate Hash record, or the value itself for a plain number.
  def self.mz_value(arr)
    arr.is_a?(Hash) ? arr.keys[0][0] : arr
  end

  # Returns the total intensity of an entry. Nested [inner, last] Arrays are
  # summed recursively; plain numbers are returned unchanged.
  def self.int_value(arr)
    arr.is_a?(Array) ? arr.last + int_value(arr.first) : arr
  end

  # Intensity-weighted average of m/z values.
  #
  # values  - Array of m/z entries (numbers or intermediate Hash records), or
  #           a single intermediate Hash whose values are used
  # weights - Array of intensities, parallel to values
  def self.w_avg(values, weights)
    # The original compared against lowercase `hash` (Object#hash), so this
    # branch could never be taken.
    values = values.values.flatten if values.is_a?(Hash)
    weighted = values.each_with_index.map do |value, i|
      mz_value(value) * int_value(weights[i])
    end
    weighted.inject(:+) / weights.flatten.inject(:+)
  end

  # Merges each peak with its immediate successor when the successor lies
  # within half_range above it (peaks are first sorted by m/z). A peak that
  # is already the result of a merge is not merged again.
  #
  # spectra    - Hash rt => [[mzs], [ints]]
  # half_range - maximum m/z distance for two neighbours to be merged
  # returns    - new Hash rt => [[mzs], [ints]] holding intermediate records
  def self.merge(spectra, half_range)
    @start = Time.now
    new_data = {}
    total = spectra.size
    spectra.each_with_index do |(rt, val), k|
      # Float division: integer division would report 0% until the end.
      Progress.progress("Merging Overlaps:", ((k / total.to_f) * 100).to_i)

      peaks = val.transpose.sort_by { |peak| peak[0] }
      # Transposing an empty peak list yields [], not [[], []].
      mzs, ints = peaks.empty? ? [[], []] : peaks.transpose

      mzs.each_with_index do |mz, i|
        next if mz.is_a?(Hash)
        neighbour = mzs[i + 1]
        next if neighbour.nil? || !(mz..mz + half_range).include?(neighbour)

        pair_mzs = [mz, neighbour]
        pair_ints = [ints[i], ints[i + 1]]
        sum = pair_ints.flatten.inject(:+).to_f
        frac1 = (ints[i] / sum) * 100
        frac2 = (ints[i + 1] / sum) * 100
        merged = { [w_avg(pair_mzs, pair_ints), frac1, frac2] => pair_mzs }

        # Slot i is emptied (and compacted away below); the merged record
        # takes the place of its successor.
        mzs[i] = nil
        ints[i] = nil
        mzs[i + 1] = merged
        ints[i + 1] = pair_ints
      end
      new_data[rt] = [mzs.compact, ints.compact]
    end
    Progress.progress("Merging Overlaps:", 100, Time.now - @start)
    puts ''
    new_data
  end

  # Replaces every intermediate record produced by merge with its weighted
  # m/z and summed intensity. The given spectra Hash is modified in place
  # and returned.
  def self.compact(spectra)
    @start = Time.now
    total = spectra.size
    spectra.each_with_index do |(rt, val), k|
      Progress.progress("Merge Finishing:", ((k / total.to_f) * 100).to_i)
      mzs = val[0]
      ints = val[1]
      mzs.each_with_index do |m, i|
        next unless m.is_a?(Hash)
        mzs[i] = mz_value(m)
        ints[i] = ints[i].flatten.inject(:+)
      end
      spectra[rt] = [mzs, ints]
    end
    Progress.progress("Merge Finishing:", 100, Time.now - @start)
    puts ''
    spectra
  end
end
|
98
|
+
|
99
|
+
#test
|
100
|
+
#data = {1 => [[1.0,1.5,1.7,3.0,4.0,5.0,6.0,7.0,8.0,9.0],[10,9,8,7,6,5,4,3,2,1]], 2 => [[1,2,3,4,5,6,7,8,9],[9,8,7,6,5,4,3,2,1]]}
|
101
|
+
#p Merger.merge(data,0.5)
|
@@ -0,0 +1,67 @@
|
|
1
|
+
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'progress'
|
4
|
+
require 'mspire/mzml'
|
5
|
+
|
6
|
+
# Wraps simulated spectra in an Mspire::Mzml document.
class Mzml_Wrapper

  # Builds the mzML document, reporting progress while spectra are converted.
  #
  # spectra - Hash rt => [[mzs], [ints]]
  def initialize(spectra)
    @start = Time.now

    total = spectra.size
    specs = []
    spectra.each_with_index do |(rt, data), index|
      Progress.progress("Converting to mzml:", (((index.to_f / total) * 100).to_i))
      # Scan ids are numbered from 1.
      specs << build_spectrum(index + 1, rt, data)
    end

    @mzml = build_document(specs)

    Progress.progress("Converting to mzml:", 100, Time.now - @start)
    puts ''
  end

  # Delegates to the wrapped document's to_xml with the given file.
  def to_xml(file)
    @mzml.to_xml(file)
  end

  private

  # Creates one spectrum with id "scan=<scan_number>" holding the two data
  # arrays from data and a single scan described with rt.
  def build_spectrum(scan_number, rt, data)
    Mspire::Mzml::Spectrum.new("scan=#{scan_number}") do |spec|
      spec.describe_many!(['MS:1000127', ['MS:1000511', 1]])
      spec.data_arrays = [
        Mspire::Mzml::DataArray.new(data[0]).describe!('MS:1000514'),
        Mspire::Mzml::DataArray.new(data[1]).describe!('MS:1000515')
      ]
      spec.scan_list = Mspire::Mzml::ScanList.new do |scan_list|
        scan_list << Mspire::Mzml::Scan.new do |scan|
          scan.describe! 'MS:1000016', rt, 'UO:0000010'
        end
      end
    end
  end

  # Assembles the document around the given spectra, with default file
  # description, instrument configuration, software and data processing.
  def build_document(specs)
    Mspire::Mzml.new do |mzml|
      mzml.id = 'ms1'
      mzml.cvs = Mspire::Mzml::CV::DEFAULT_CVS
      mzml.file_description = Mspire::Mzml::FileDescription.new do |fd|
        fd.file_content = Mspire::Mzml::FileContent.new
        fd.source_files << Mspire::Mzml::SourceFile.new
      end

      instrument = Mspire::Mzml::InstrumentConfiguration.new("IC").describe!('MS:1000031')
      mzml.instrument_configurations << instrument

      mzml.software_list << Mspire::Mzml::Software.new

      processing = Mspire::Mzml::DataProcessing.new("did_nothing")
      mzml.data_processing_list << processing

      mzml.run = Mspire::Mzml::Run.new("simulated_run", instrument) do |run|
        run.spectrum_list = Mspire::Mzml::SpectrumList.new(processing, specs)
      end
    end
  end

end
|
67
|
+
|
data/lib/ms/noise.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
|
2
|
+
require 'progress'
|
3
|
+
require 'ms/rt/rt_helper'
|
4
|
+
|
5
|
+
module MS
  # Generates random noise peaks and random scan drop-outs for the scan
  # times held by Sim_Spectra.r_times.
  module Noise
    module_function

    # Creates `density` random peaks for every retention time.
    #
    # Each peak's m/z comes from RThelper.RandomFloat(0.0, max_mz) and its
    # intensity from RThelper.RandomFloat(50, 1000).
    #
    # density - number of noise peaks per retention time
    # max_mz  - upper m/z bound handed to RandomFloat
    # returns - Hash rt => [[mzs], [ints]]
    def noiseify(density, max_mz)
      @start = Time.now
      @noise = {}
      r_times = Sim_Spectra.r_times

      r_times.each_with_index do |rt, count|
        Progress.progress("Adding noise:", (((count.to_f / r_times.size) * 100).to_i))

        nmzs = []
        nints = []
        density.times do
          nmzs << RThelper.RandomFloat(0.0, max_mz)
          nints << RThelper.RandomFloat(50, 1000)
        end
        @noise[rt] = [nmzs, nints]
      end

      Progress.progress("Adding noise:", 100, Time.now - @start)
      puts ''

      @noise
    end

    # Removes a random fraction of the retention times.
    #
    # The array returned by Sim_Spectra.r_times is modified in place and
    # returned. Indexes are drawn from the *current* length: the original
    # drew from 0..original_length, so draws that landed past the end of the
    # shrinking array removed nothing and fewer scans were dropped than
    # requested.
    #
    # drop_percentage - fraction (0.0..1.0) of scans to drop
    # returns         - the remaining retention times
    def spec_drops(drop_percentage)
      r_times = Sim_Spectra.r_times
      num_drops = (drop_percentage * r_times.length).to_i
      num_drops.times do
        break if r_times.empty?
        r_times.delete_at(rand(r_times.length))
      end
      r_times
    end

  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
|
2
|
+
# Numeric helpers: Gaussian curves and a random float generator.
module RThelper

  module_function

  # Normal probability density with mean mu and standard deviation sd,
  # evaluated at x:
  #
  #   1 / sqrt(2*pi*sd^2) * exp(-(x - mu)^2 / (2 * sd^2))
  #
  # The original divided the exponent by (2*sd)**2 -- i.e. 4*sd^2 -- which
  # does not match the 1/sqrt(2*pi*sd^2) prefactor, so the curve did not
  # integrate to 1.
  def normalized_gaussian(x, mu, sd)
    x = x.to_f
    mu = mu.to_f
    sd = sd.to_f
    variance = sd**2
    Math.exp(-((x - mu)**2) / (2 * variance)) / Math.sqrt(2 * Math::PI * variance)
  end

  # Bell curve of height h centred on mu: h * exp(-(x - mu)^2 / sd^2).
  # Note that the exponent is divided by sd^2, not 2*sd^2.
  def gaussian(x, mu, sd, h)
    x = x.to_f
    mu = mu.to_f
    sd = sd.to_f
    h = h.to_f
    h * Math.exp(-((x - mu)**2) / (sd**2))
  end

  # Random float between a and b, computed as a + fraction * (b - a), where
  # fraction is a random integer below 2147483647 divided by 2147483647.0.
  def RandomFloat(a, b)
    a = a.to_f
    b = b.to_f
    fraction = rand(2147483647.0) / 2147483647.0
    a + (fraction * (b - a))
  end
end
|
31
|
+
|
@@ -0,0 +1,81 @@
|
|
1
|
+
|
2
|
+
require 'time'
|
3
|
+
require 'progress'
|
4
|
+
require 'ms/sim_feature'
|
5
|
+
require 'ms/rt/weka'
|
6
|
+
require 'ms/sim_peptide'
|
7
|
+
require 'ms/rt/rt_helper'
|
8
|
+
|
9
|
+
module MS
  # Assigns predicted retention times and elution windows to peptides.
  module Rtgenerator

    module_function

    # Predicts retention times and intensities via MS::Weka, snaps each
    # prediction onto the scan times in Sim_Spectra.r_times and hands every
    # peptide the index range of its elution window through pep.set_rts.
    #
    # The predicted value is scaled by 10**-2 and treated as a fraction of
    # the longest scan time; fractions above 1 are clamped to the last scan.
    # The window reaches head_length before and tail_length after the snapped
    # time (300.0 / 701 when one_d is truthy, otherwise 100.0 / 300).
    #
    # Both window bounds are indexes into r_times. When no scan time is at or
    # beyond a bound, the first / last *index* is used. The original fell
    # back to the first / last retention-time *value*, mixing values with
    # indexes. set_rts is defined elsewhere and is assumed to expect indexes,
    # since that is what the non-fallback path passes -- TODO confirm.
    #
    # peptides - peptide objects responding to p_rt, p_rt=, p_rt_i=, set_rts
    # one_d    - selects the longer window lengths when truthy
    # returns  - the peptides as returned by MS::Weka.predict_rts
    def generateRT(peptides, one_d)
      @start = Time.now
      @r_times = Sim_Spectra.r_times

      # Gets retention times from the weka model
      peptides = MS::Weka.predict_rts(peptides)
      MS::Weka.predict_ints(peptides)

      # Loop-invariant values, computed once.
      max_rt = @r_times.max
      last_index = @r_times.length - 1
      head_length, tail_length = one_d ? [300.0, 701] : [100.0, 300]

      peptides.each_with_index do |pep, ind|
        Progress.progress("Generating retention times:", (((ind + 1) / peptides.size.to_f) * 100).to_i)

        # Fit retention times into scan times
        p_rt = pep.p_rt * 10**-2
        pep.p_rt = p_rt > 1 ? max_rt : @r_times.find { |t| t >= (p_rt * max_rt) }
        pep.p_rt_i = @r_times.index(pep.p_rt)

        if pep.p_rt.nil?
          puts "\n\n\t#{pep} TIME-> #{p_rt*max_rt} :: Peptide not predicted in time range: try increasing run time\n\n."
          next
        end

        # Give peptide retention times
        window_start = @r_times.index { |t| t >= (pep.p_rt - head_length) } || 0
        window_end = @r_times.index { |t| t >= (pep.p_rt + tail_length) } || last_index
        pep.set_rts(window_start, window_end)
      end

      Progress.progress("Generating retention times:", 100, Time.now - @start)
      puts ""

      peptides
    end
  end
end
|
data/lib/ms/rt/weka.rb
ADDED
@@ -0,0 +1,150 @@
|
|
1
|
+
|
2
|
+
require 'csv'
|
3
|
+
|
4
|
+
module MS
  # Feeds peptide features to pre-trained Weka models through the `java`
  # command line and reads the predictions back.
  module Weka
    # One ARFF attribute per residue letter, in column order.
    AMINO_ACID_ATTRIBUTES = %w[A R N D B C E Q Z G H I L K M F P S T W Y V].freeze
    # Columns of the retention-time ARFF file.
    RT_ATTRIBUTES = (AMINO_ACID_ATTRIBUTES + %w[J rt]).freeze
    # Columns of the intensity ARFF file.
    INT_ATTRIBUTES = (%w[mz charge mass rt] + AMINO_ACID_ATTRIBUTES + %w[intensity]).freeze
    # First decimal number on a line, truncated to three decimals.
    PREDICTION_PATTERN = /(\d*\.\d{0,3}){1}/

    # James Dalg
    module_function

    # Predicts a retention time for every peptide and stores it in p_rt.
    #
    # peptides - objects responding to aa_counts and p_rt=
    # returns  - the same peptides
    def predict_rts(peptides)
      rows = peptides.map { |pep| pep.aa_counts }
      arff = make_rt_arff(Time.now.nsec.to_s, rows)
      values = run_model("java weka.classifiers.functions.MultilayerPerceptron -T #{arff} -l bin/weka/M5Rules.model -p 24", arff)
      peptides.zip(values) { |pep, value| pep.p_rt = value unless value.nil? }
      peptides
    end

    # Predicts an intensity for every peptide and stores it in p_int.
    #
    # peptides - objects responding to mono_mz, charge, mass, p_rt,
    #            aa_counts and p_int=
    # returns  - the same peptides
    def predict_ints(peptides)
      rows = peptides.map do |pep|
        [pep.mono_mz, pep.charge, pep.mass, pep.p_rt].concat(pep.aa_counts)
      end
      arff = make_int_arff(Time.now.nsec.to_s, rows)
      values = run_model("java weka.classifiers.trees.M5P -T #{arff} -l bin/weka/M5P.model -p 27", arff)
      peptides.zip(values) { |pep, value| pep.p_int = value unless value.nil? }
      peptides
    end

    # James Dalg
    # Writes the retention-time ARFF file. ".arff" is appended to sourcefile
    # in place and the resulting file name is returned.
    def make_rt_arff(sourcefile, training)
      write_arff(sourcefile << ".arff", RT_ATTRIBUTES, training)
    end

    # James Dalg
    # Writes the intensity ARFF file. ".arff" is appended to sourcefile in
    # place and the resulting file name is returned.
    def make_int_arff(sourcefile, training)
      write_arff(sourcefile << ".arff", INT_ATTRIBUTES, training)
    end

    # Runs a Weka command with stdout redirected to "<arff>.out" and returns
    # the predictions found there. Both the ARFF input and the output file
    # are removed afterwards, even when reading fails.
    def run_model(command, arff)
      output = "#{arff}.out"
      warn "Weka command failed: #{command}" unless system("#{command} > #{output}")
      File.exist?(output) ? read_predictions(output) : []
    ensure
      [arff, output].each { |path| File.delete(path) if path && File.exist?(path) }
    end

    # Returns, as floats, the first decimal number of every line that
    # contains one. The file is closed once it has been read.
    def read_predictions(path)
      File.foreach(path).map { |line| line[PREDICTION_PATTERN] }.compact.map { |text| text.to_f }
    end

    # Writes the ARFF header for the given numeric attributes, then appends
    # the rows as CSV, opening the file once for all rows.
    def write_arff(path, attributes, rows)
      header = ["%", "%", "@RELATION molecularinfo"]
      header.concat(attributes.map { |name| "@ATTRIBUTE #{name} NUMERIC" })
      header.push("@DATA", "%", "% ")
      File.open(path, "wb") { |f| f.puts header }
      CSV.open(path, "a") { |csv| rows.each { |row| csv << row } }
      path
    end
  end
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
|
2
|
+
module MS
  # Digests the proteins of a FASTA file into peptides and turns them into
  # MS::Peptide objects with charges derived from the configured pH.
  class Sim_Digester
    # Upper bound on the number of peptides kept after digestion.
    MAX_PEPTIDES = 50000

    # Path of the temporary file that holds one digested peptide per line.
    attr_accessor :digested_file

    # digestor - key used to look up the enzyme in Mspire::Digester
    # pH       - pH at which peptide charges are evaluated
    def initialize(digestor, pH)
      @digestor = digestor
      @pH = pH
      @digested_file = ".#{Time.now.nsec.to_s}"
    end

    # Digests every protein in `file` and writes the unique peptides, one per
    # line, to digested_file.
    #
    # When more than MAX_PEPTIDES unique peptides result, a random subset of
    # exactly MAX_PEPTIDES distinct peptides is kept. The original drew with
    # replacement, which reintroduced duplicates after de-duplication.
    #
    # file    - path of the FASTA input
    # returns - number of peptides written
    def create_digested_file(file)
      enzyme = Mspire::Digester[@digestor]
      digested = read_proteins(file).flat_map { |prot| enzyme.digest(prot).to_a }.uniq
      digested = digested.sample(MAX_PEPTIDES) if digested.length > MAX_PEPTIDES

      File.open(@digested_file, "w") do |d_file|
        digested.each { |dig| d_file.puts(dig) }
      end

      num_digested = digested.size
      puts "Number of peptides: #{num_digested}"
      num_digested
    end

    # Digests `file` and builds the peptide objects.
    #
    # For every peptide the charge at the configured pH is computed via
    # identify_potential_charges and charge_at_pH, and one MS::Peptide is
    # created for its floor and one for its ceiling, skipping charge 0. When
    # floor and ceiling coincide only one peptide is created. The temporary
    # digested file is removed afterwards, even on error.
    #
    # file    - path of the FASTA input
    # returns - Array of MS::Peptide
    def digest(file)
      start = Time.now
      num_digested = create_digested_file(file)
      peptides = []

      begin
        File.foreach(@digested_file).each_with_index do |line, i|
          peptide_seq = line.chomp
          Progress.progress("Creating peptides '#{file}':", ((i / num_digested.to_f) * 100.0).to_i)

          charge_ratio = charge_at_pH(identify_potential_charges(peptide_seq), @pH)
          [charge_ratio.floor, charge_ratio.ceil].uniq.each do |charge|
            peptides << MS::Peptide.new(peptide_seq, charge) if charge != 0
          end
        end
      ensure
        File.delete(@digested_file) if File.exist?(@digested_file)
      end

      Progress.progress("Creating peptides '#{file}':", 100, Time.now - start)
      puts ''
      peptides
    end

    private

    # Reads protein sequences from a FASTA file: header lines (containing
    # ">") and blank lines separate records, and the remaining lines of a
    # record are concatenated.
    def read_proteins(file)
      joined = String.new
      File.foreach(file) do |line|
        separator = (line =~ />/) || line.strip.empty?
        joined << (separator ? ";" : line.chomp)
      end
      joined.split(/;/).reject { |seq| seq.empty? }
    end
  end
end
|