mspire-simulator 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.txt +22 -0
- data/README.rdoc +17 -0
- data/Rakefile +51 -0
- data/VERSION +1 -0
- data/bin/mspire-simulator +125 -0
- data/bin/sim_mail.rb +26 -0
- data/bin/weka/M5P.model +0 -0
- data/bin/weka/M5Rules.model +0 -0
- data/bin/weka/weka.jar +0 -0
- data/lib/ms/curvefit/curve_fit_helper.rb +152 -0
- data/lib/ms/curvefit/fit_graph.rb +84 -0
- data/lib/ms/curvefit/mzml_reader.rb +28 -0
- data/lib/ms/curvefit.rb +120 -0
- data/lib/ms/isoelectric_calc.rb +122 -0
- data/lib/ms/merger.rb +101 -0
- data/lib/ms/mzml_wrapper.rb +67 -0
- data/lib/ms/noise.rb +51 -0
- data/lib/ms/rt/rt_helper.rb +31 -0
- data/lib/ms/rt/rtgenerator.rb +81 -0
- data/lib/ms/rt/weka.rb +150 -0
- data/lib/ms/sim_digester.rb +92 -0
- data/lib/ms/sim_feature.rb +175 -0
- data/lib/ms/sim_peptide.rb +182 -0
- data/lib/ms/sim_spectra.rb +70 -0
- data/lib/ms/sim_trollop.rb +68 -0
- data/lib/ms/tr_file_writer.rb +175 -0
- data/lib/progress.rb +24 -0
- data/mspire-simulator.gemspec +103 -0
- data/spec/file_writer_spec.rb +74 -0
- data/spec/merger_spec.rb +23 -0
- data/spec/ms-simulate_spec.rb +9 -0
- data/spec/peptide_spec.rb +16 -0
- data/spec/progress_spec.rb +22 -0
- data/spec/spec_helper.rb +11 -0
- data/spec/spectra_spec.rb +111 -0
- data/testFiles/contam/hum_keratin.fasta +11 -0
- metadata +246 -0
@@ -0,0 +1,122 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# http://isoelectric.ovh.org/files/practise-isoelectric-point.html#mozTocId496531
|
3
|
+
# Taken from Ryan's github repo
|
4
|
+
|
5
|
+
# Convergence tolerance (in pH units) for the isoelectric-point search.
Precision = 0.001

# pK values per one-letter residue code, three numbers per residue.
# Index 2 is the side-chain pK (nil when the residue has none listed).
# NOTE(review): indices 0 and 1 look like the alpha-carboxyl and alpha-amino
# pKs, in that order -- confirm against the table this was copied from.
ResidueTable = {
  K: [2.18, 8.95, 10.53],
  E: [2.19, 9.67, 4.25],
  D: [1.88, 9.60, 3.65],
  H: [1.82, 9.17, 6.00],
  R: [2.17, 9.04, 12.48],
  Q: [2.17, 9.13, nil],
  N: [2.02, 8.80, nil],
  C: [1.96, 10.28, 8.18],
  T: [2.11, 9.62, nil],
  S: [2.21, 9.15, nil],
  W: [2.38, 9.39, nil],
  Y: [2.20, 9.11, 10.07],
  F: [1.83, 9.13, nil],
  M: [2.28, 9.21, nil],
  I: [2.36, 9.68, nil],
  L: [2.36, 9.60, nil],
  V: [2.32, 9.62, nil],
  P: [1.99, 10.96, nil],
  A: [2.34, 9.69, nil],
  G: [2.34, 9.60, nil],
  # Fringe cases: the ambiguity codes B and Z (and X), whose pIs are harder
  # to calculate.
  B: [1.95, 9.20, 3.65],
  Z: [2.18, 9.40, 4.25],
  X: [2.20, 9.40, nil],
  # Only the side-chain pK was found for U, so the first two values are
  # Cysteine's.
  U: [1.96, 10.28, 5.20]
}

# Per-peptide record: the sequence, the two terminal pKs, one counter per
# residue class, and a slot for the computed pI.
PepCharges = Struct.new(
  :seq, :n_term, :c_term,
  :y_num, :c_num, :k_num, :h_num, :r_num, :d_num, :e_num, :u_num,
  :polar_num, :hydrophobic_num, :pi
)
|
34
|
+
# Builds the PepCharges record for a peptide sequence.
#
# The sequence is upper-cased, the terminal pKs are looked up in
# ResidueTable, and every residue is tallied into the matching counter
# (Y, C, K, H, R, D, E, U individually; S/T/N/Q as polar; A/V/I/L/M/F/W/G/P
# as hydrophobic). Residues outside those groups are not counted.
#
# Fix: the N-terminus carries the alpha-amino group, so n_term is taken from
# the second table column (pK ~9) of the first residue, and c_term from the
# first column (alpha-carboxyl pK ~2) of the last residue. These two columns
# were previously swapped.
#
# @param str [String] peptide sequence in one-letter codes (any case)
# @return [PepCharges] record with seq, n_term, c_term and counters set;
#   pi is initialised to 0
# Aborts the process (printing the sequence) when the sequence is empty or
# its first/last residue is not in ResidueTable.
def identify_potential_charges(str)
  string = str.upcase
  first = string[0]
  last = string[-1]
  puts string if first.nil? || last.nil?

  # Explicit lookups instead of rescuing NoMethodError on a nil receiver.
  first_pks = first && ResidueTable[first.to_sym]
  last_pks = last && ResidueTable[last.to_sym]
  abort string if first_pks.nil? || last_pks.nil?

  out = PepCharges.new(string, first_pks[1], last_pks[0], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

  string.each_char do |letter|
    case letter
    when "Y" then out.y_num += 1
    when "C" then out.c_num += 1
    when "K" then out.k_num += 1
    when "H" then out.h_num += 1
    when "R" then out.r_num += 1
    when "D" then out.d_num += 1
    when "E" then out.e_num += 1
    when "U" then out.u_num += 1
    when "S", "T", "N", "Q"
      out.polar_num += 1
    when "A", "V", "I", "L", "M", "F", "W", "G", "P"
      out.hydrophobic_num += 1
    end
  end
  out
end # Returns the PepCharges structure
|
69
|
+
|
70
|
+
# Net charge of a peptide at a given pH.
#
# Every ionizable group contributes count / (1 + 10**(+/- (pK - pH))):
# negative for the C-terminus and D/E/C/Y side chains, positive for the
# N-terminus and H/K/R side chains. Side-chain pKs come from column 2 of
# ResidueTable; terminal pKs come from the record itself.
#
# @param pep_charges [PepCharges] record from identify_potential_charges
# @param pH [Numeric] pH at which to evaluate the charge
# @return [Numeric] the summed charge
def charge_at_pH(pep_charges, pH)
  acidic = lambda { |count, pk| -count / (1 + 10**(pk - pH)) }
  basic = lambda { |count, pk| count / (1 + 10**(pH - pk)) }

  contributions = [
    acidic.call(1, pep_charges.c_term),
    acidic.call(pep_charges.d_num, ResidueTable[:D][2]),
    acidic.call(pep_charges.e_num, ResidueTable[:E][2]),
    acidic.call(pep_charges.c_num, ResidueTable[:C][2]),
    acidic.call(pep_charges.y_num, ResidueTable[:Y][2]),
    basic.call(1, pep_charges.n_term),
    basic.call(pep_charges.h_num, ResidueTable[:H][2]),
    basic.call(pep_charges.k_num, ResidueTable[:K][2]),
    basic.call(pep_charges.r_num, ResidueTable[:R][2])
  ]

  # Plain left-to-right accumulation starting from integer 0.
  contributions.inject(0) { |total, part| total + part }
end
|
83
|
+
|
84
|
+
|
85
|
+
# Estimates the isoelectric point of a peptide by interval halving.
#
# The search starts at pH 8 inside the bracket [0, 14]. A negative charge
# moves the guess halfway toward the lower bound and makes the old guess the
# upper bound; otherwise it moves halfway toward the upper bound and the old
# guess becomes the lower bound. It stops once the guess is within Precision
# of either bound.
#
# @param pep_charges [PepCharges] record passed through to charge_at_pH
# @return [Numeric] the final pH guess
def calc_PI(pep_charges)
  lower = 0.0
  upper = 14.0
  guess = 8
  net = charge_at_pH(pep_charges, guess)

  until guess - lower <= Precision || upper - guess <= Precision
    previous_guess = guess
    if net < 0.0
      guess -= (guess - lower) / 2
      upper = previous_guess
    else
      guess += (upper - guess) / 2
      lower = previous_guess
    end
    net = charge_at_pH(pep_charges, guess)
  end

  guess
end
|
104
|
+
#pepcharges =[]
|
105
|
+
=begin
|
106
|
+
# RUN the ENTRY FILE HERE
|
107
|
+
pi = []
|
108
|
+
io = File.open(ARGV.shift, 'r')
|
109
|
+
io.each_line do |line|
|
110
|
+
pi << calc_PI(identify_potential_charges(line[/^([A-Z]+):.*/]))
|
111
|
+
end
|
112
|
+
=end
|
113
|
+
=begin
|
114
|
+
pIes = []
|
115
|
+
pepcharges.each do |a|
|
116
|
+
pIes << [a, calc_PI(a)]
|
117
|
+
end
|
118
|
+
=end
|
119
|
+
#out_pi = pepcharges.map {|a| calc_PI(a)}
|
120
|
+
|
121
|
+
#require 'yaml'
|
122
|
+
#File.open('pi_list.yml', 'w') {|f| YAML.dump( pi, f) }
|
data/lib/ms/merger.rb
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
require_relative '../progress'
|
2
|
+
|
3
|
+
# Merges peaks that lie close together in m/z within each spectrum.
#
# Spectra are hashes of the form rt => [[mzs], [ints]]. `merge` replaces
# each pair of neighbouring peaks closer than `half_range` with a
# placeholder; `compact` then collapses the placeholders into plain numbers.
class Merger
  # Returns the m/z represented by a peak entry.
  #
  # @param arr [Numeric, Hash] plain m/z, or a merged-peak Hash whose single
  #   key is [weighted_mz, frac1, frac2]
  # @return [Numeric] the plain m/z, or the first element of the Hash's key
  def self.mz_value(arr)
    arr.is_a?(Hash) ? arr.keys[0][0] : arr
  end

  # Returns the total intensity of an entry; a two-element Array is summed
  # recursively as last + int_value(first).
  #
  # @param arr [Numeric, Array]
  # @return [Numeric]
  def self.int_value(arr)
    arr.is_a?(Array) ? arr.last + int_value(arr.first) : arr
  end

  # Intensity-weighted average m/z.
  #
  # @param values [Array, Hash] m/z entries; a merged-peak Hash is expanded
  #   to its member m/z values
  # @param weights [Array] intensities, parallel to the values
  # @return [Float]
  def self.w_avg(values, weights)
    # Fix: this used to compare against `hash` (Object#hash, an Integer), so
    # Hash input was never expanded.
    values = values.values.flatten if values.is_a?(Hash)

    weighted = values.each_with_index.map do |v, i|
      mz_value(v) * int_value(weights[i])
    end
    # to_f keeps all-integer input from being truncated by integer division.
    weighted.inject(:+) / weights.flatten.inject(:+).to_f
  end

  # Merges neighbouring peaks within `half_range` m/z of each other.
  #
  # A merged pair becomes a Hash {[weighted_mz, frac1, frac2] => [mz1, mz2]}
  # in the m/z list and [int1, int2] in the intensity list, where frac1 and
  # frac2 are each peak's percentage of the summed intensity. A peak that was
  # just produced by a merge is not merged again.
  #
  # @param spectra [Hash] rt => [[mzs], [ints]]
  # @param half_range [Numeric] maximum m/z distance for merging
  # @return [Hash] new hash rt => [[mzs or merge Hashes], [ints or pairs]]
  def self.merge(spectra, half_range)
    @start = Time.now
    new_data = {}
    total = spectra.size
    spectra.each_with_index do |(rt, val), k|
      # Multiply before dividing; (k/total)*100 was always 0 for integers.
      Progress.progress("Merging Overlaps:", (k * 100) / total)

      mzs, ints = val.transpose.sort_by { |peak| peak[0] }.transpose
      # An empty spectrum transposes to [], leaving both lists nil.
      mzs ||= []
      ints ||= []

      mzs.each_with_index do |mz, i|
        next if mz.is_a?(Hash) # produced by the previous merge
        neighbour = mzs[i + 1]
        next if neighbour.nil? # last peak has nothing to merge with
        next unless (mz..mz + half_range).cover?(neighbour)

        pair_mz = [mz, neighbour]
        pair_int = [ints[i], ints[i + 1]]
        sum = pair_int.flatten.inject(:+).to_f
        first_int = ints[i].is_a?(Array) ? ints[i].flatten.inject(:+) : ints[i]
        frac1 = (first_int / sum) * 100
        frac2 = (ints[i + 1] / sum) * 100
        merged = { [w_avg(pair_mz, pair_int), frac1, frac2] => pair_mz }

        mzs[i] = nil
        mzs[i + 1] = merged
        ints[i] = nil
        ints[i + 1] = pair_int
      end
      new_data[rt] = [mzs.compact, ints.compact]
    end
    Progress.progress("Merging Overlaps:", 100, Time.now - @start)
    puts ''
    new_data
  end

  # Collapses merge placeholders into plain values, in place: each merged
  # m/z Hash becomes its weighted m/z and its intensity pair becomes the sum.
  #
  # @param spectra [Hash] output of merge; modified and returned
  # @return [Hash] the same hash, rt => [[mzs], [ints]]
  def self.compact(spectra)
    @start = Time.now
    total = spectra.size
    k = 0
    spectra.each do |rt, val|
      Progress.progress("Merge Finishing:", (k * 100) / total)
      mzs = val[0]
      ints = val[1]
      mzs.each_with_index do |m, i|
        next unless m.is_a?(Hash)
        mzs[i] = m.keys[0][0]
        ints[i] = ints[i].flatten.inject(:+)
      end
      spectra[rt] = [mzs, ints]
      k += 1
    end
    Progress.progress("Merge Finishing:", 100, Time.now - @start)
    puts ''
    spectra
  end
end
|
98
|
+
|
99
|
+
#test
|
100
|
+
#data = {1 => [[1.0,1.5,1.7,3.0,4.0,5.0,6.0,7.0,8.0,9.0],[10,9,8,7,6,5,4,3,2,1]], 2 => [[1,2,3,4,5,6,7,8,9],[9,8,7,6,5,4,3,2,1]]}
|
101
|
+
#p Merger.merge(data,0.5)
|
@@ -0,0 +1,67 @@
|
|
1
|
+
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'progress'
|
4
|
+
require 'mspire/mzml'
|
5
|
+
|
6
|
+
# Wraps simulated spectra in an Mspire::Mzml document.
class Mzml_Wrapper

  # Builds one Mspire::Mzml::Spectrum per retention time and assembles them
  # into an mzML object stored in @mzml.
  #
  # @param spectra [Hash] rt => [[mzs], [ints]]
  def initialize(spectra)
    @start = Time.now
    spectrum_total = spectra.size

    specs = spectra.each_with_index.map do |(rt, data), index|
      Progress.progress("Converting to mzml:", ((index.to_f / spectrum_total) * 100).to_i)

      # Scan ids are 1-based.
      Mspire::Mzml::Spectrum.new("scan=#{index + 1}") do |spec|
        spec.describe_many!(['MS:1000127', ['MS:1000511', 1]])
        mz_array = Mspire::Mzml::DataArray.new(data[0]).describe!('MS:1000514')
        intensity_array = Mspire::Mzml::DataArray.new(data[1]).describe!('MS:1000515')
        spec.data_arrays = [mz_array, intensity_array]
        spec.scan_list = Mspire::Mzml::ScanList.new do |sl|
          sl << Mspire::Mzml::Scan.new { |scan| scan.describe! 'MS:1000016', rt, 'UO:0000010' }
        end
      end
    end

    @mzml = Mspire::Mzml.new do |mzml|
      mzml.id = 'ms1'
      mzml.cvs = Mspire::Mzml::CV::DEFAULT_CVS
      mzml.file_description = Mspire::Mzml::FileDescription.new do |fd|
        fd.file_content = Mspire::Mzml::FileContent.new
        fd.source_files << Mspire::Mzml::SourceFile.new
      end

      instrument = Mspire::Mzml::InstrumentConfiguration.new("IC").describe!('MS:1000031')
      mzml.instrument_configurations << instrument

      mzml.software_list << Mspire::Mzml::Software.new

      processing = Mspire::Mzml::DataProcessing.new("did_nothing")
      mzml.data_processing_list << processing

      mzml.run = Mspire::Mzml::Run.new("simulated_run", instrument) do |run|
        run.spectrum_list = Mspire::Mzml::SpectrumList.new(processing, specs)
      end
    end

    Progress.progress("Converting to mzml:", 100, Time.now - @start)
    puts ''
    @mzml
  end

  # Delegates serialisation to the wrapped mzML object.
  #
  # @param file passed straight through to Mspire::Mzml#to_xml
  def to_xml(file)
    @mzml.to_xml(file)
  end

end
|
67
|
+
|
data/lib/ms/noise.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
|
2
|
+
require 'progress'
|
3
|
+
require 'ms/rt/rt_helper'
|
4
|
+
|
5
|
+
module MS
  # Generators for simulated noise and dropped scans.
  module Noise
    module_function

    # Builds a noise spectrum for every retention time in
    # Sim_Spectra.r_times.
    #
    # @param density [Integer] number of noise peaks per retention time
    # @param max_mz [Numeric] upper bound for the random m/z values
    # @return [Hash] rt => [[mzs], [ints]]; m/z values are drawn from
    #   RThelper.RandomFloat(0.0, max_mz) and intensities from
    #   RThelper.RandomFloat(50, 1000)
    def noiseify(density, max_mz)
      @start = Time.now
      @noise = {}
      r_times = Sim_Spectra.r_times

      count = 0.0
      r_times.each do |rt|
        Progress.progress("Adding noise:", (((count / r_times.size) * 100).to_i))

        nmzs = []
        nints = []
        density.times do
          nmzs << RThelper.RandomFloat(0.0, max_mz)
          nints << RThelper.RandomFloat(50, 1000)
        end

        @noise[rt] = [nmzs, nints]
        count += 1
      end

      Progress.progress("Adding noise:", 100, Time.now - @start)
      puts ''

      @noise
    end

    # Removes randomly chosen retention times from Sim_Spectra.r_times.
    # The array is modified in place.
    #
    # @param drop_percentage [Numeric] fraction of scans to drop; it is
    #   multiplied directly by the number of scans, so 0.5 drops half
    # @return [Array] the same r_times array after the removals
    def spec_drops(drop_percentage)
      r_times = Sim_Spectra.r_times
      num_drops = (drop_percentage * r_times.length).to_i
      num_drops.times do
        break if r_times.empty?
        # Fix: draw from the current length. The old rand(l + 1) used the
        # original length plus one, so many draws pointed past the end and
        # deleted nothing.
        r_times.delete_at(rand(r_times.length))
      end
      r_times
    end

  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
|
2
|
+
# Small numeric helpers: Gaussian curves and random floats.
module RThelper

  module_function

  # Value at x of the normal density with mean mu and standard deviation sd:
  # 1 / sqrt(2*pi*sd^2) * exp(-(x - mu)^2 / (2*sd^2)).
  #
  # Fix: the exponent used to be divided by (2*sd)**2 = 4*sd^2, which is
  # wider than the stated normalisation and does not integrate to 1.
  #
  # @param x [Numeric] evaluation point
  # @param mu [Numeric] mean
  # @param sd [Numeric] standard deviation
  # @return [Float]
  def normalized_gaussian(x, mu, sd)
    x = x.to_f
    mu = mu.to_f
    sd = sd.to_f
    variance = sd**2
    (1 / Math.sqrt(2 * Math::PI * variance)) * Math.exp(-((x - mu)**2) / (2 * variance))
  end

  # Bell curve of height h centred on mu: h * exp(-(x - mu)^2 / sd^2).
  # Note the divisor is sd^2 (not 2*sd^2); it is left as written.
  #
  # @param x [Numeric] evaluation point
  # @param mu [Numeric] centre
  # @param sd [Numeric] width parameter
  # @param h [Numeric] peak height
  # @return [Float]
  def gaussian(x, mu, sd, h)
    x = x.to_f
    mu = mu.to_f
    sd = sd.to_f
    h = h.to_f
    h * Math.exp(-(x - mu)**2 / (sd**2))
  end

  # Random float between a and b.
  #
  # @param a [Numeric] lower bound
  # @param b [Numeric] upper bound
  # @return [Float] a + r * (b - a) with r drawn as rand(2147483647.0) / 2147483647.0
  def RandomFloat(a, b)
    a = a.to_f
    b = b.to_f
    random = rand(2147483647.0) / 2147483647.0
    a + random * (b - a)
  end
end
|
31
|
+
|
@@ -0,0 +1,81 @@
|
|
1
|
+
|
2
|
+
require 'time'
|
3
|
+
require 'progress'
|
4
|
+
require 'ms/sim_feature'
|
5
|
+
require 'ms/rt/weka'
|
6
|
+
require 'ms/sim_peptide'
|
7
|
+
require 'ms/rt/rt_helper'
|
8
|
+
|
9
|
+
module MS
  # Assigns retention times and elution windows to simulated peptides.
  module Rtgenerator

    module_function

    # Predicts a retention time for each peptide, snaps it onto the scan
    # times in Sim_Spectra.r_times, and gives the peptide the index window
    # over which it elutes.
    #
    # The predicted value is scaled by 10**-2 and treated as a fraction of
    # the largest scan time; values above 1 are clamped to that time.
    #
    # @param peptides [Array] peptides exposing p_rt, p_rt=, p_rt_i= and
    #   set_rts(start_index, end_index)
    # @param one_d [Boolean] truthy selects the longer window (300 before /
    #   701 after the apex); otherwise 100 before / 300 after
    # @return [Array] the peptides returned by MS::Weka.predict_rts
    def generateRT(peptides, one_d)

      @start = Time.now
      @r_times = Sim_Spectra.r_times

      # Gets retention times (and intensities) from the weka models
      peptides = MS::Weka.predict_rts(peptides)
      MS::Weka.predict_ints(peptides)

      # Loop invariants, hoisted out of the per-peptide loop.
      max_rt = @r_times.max
      last_index = @r_times.length - 1
      head_length, tail_length = one_d ? [300.0, 701] : [100.0, 300]

      #-----------------------------------------------------------------
      peptides.each_with_index do |pep, ind|
        Progress.progress("Generating retention times:", (((ind + 1) / peptides.size.to_f) * 100).to_i)

        # Fit retention times into scan times
        p_rt = pep.p_rt * 10**-2
        pep.p_rt = p_rt > 1 ? max_rt : @r_times.find { |t| t >= (p_rt * max_rt) }
        pep.p_rt_i = @r_times.index(pep.p_rt)

        if pep.p_rt.nil?
          puts "\n\n\t#{pep} TIME-> #{p_rt*max_rt} :: Peptide not predicted in time range: try increasing run time\n\n."
          next
        end

        # Give peptide retention times. The window is expressed as indices
        # into @r_times.
        # Fix: when a bound falls outside the scan range the fallback is now
        # the first/last INDEX; it used to be the first/last retention-time
        # value, mixing values with indices.
        a = @r_times.index { |t| t >= (pep.p_rt - head_length) } || 0
        b = @r_times.index { |t| t >= (pep.p_rt + tail_length) } || last_index

        pep.set_rts(a, b)
      end
      #-----------------------------------------------------------------

      Progress.progress("Generating retention times:", 100, Time.now - @start)
      puts ""

      peptides
    end
  end
end
|
data/lib/ms/rt/weka.rb
ADDED
@@ -0,0 +1,150 @@
|
|
1
|
+
|
2
|
+
require 'csv'
|
3
|
+
|
4
|
+
module MS
  # Runs the bundled weka models through the `java` command line and reads
  # their predictions back.
  module Weka
    # Attribute columns of the retention-time ARFF file, in order.
    RT_ATTRIBUTES = %w[A R N D B C E Q Z G H I L K M F P S T W Y V J rt].freeze
    # Attribute columns of the intensity ARFF file, in order.
    INT_ATTRIBUTES = %w[mz charge mass rt A R N D B C E Q Z G H I L K M F P S T W Y V intensity].freeze
    # First number with a decimal point on a line of weka output.
    PREDICTION_PATTERN = /(\d*\.\d{0,3}){1}/

    #James Dalg
    module_function

    # Predicts retention times and stores them in each peptide's p_rt.
    #
    # @param peptides [Array] peptides exposing aa_counts and p_rt=
    # @return [Array] the same peptides
    def predict_rts(peptides)
      #mz,charge,intensity,rt,A,R,N,D,B,C,E,Q,Z,G,H,I,L,K,M,F,P,S,T,W,Y,V,J,mass,hydro,pi
      #make arff file to feed weka model
      data = peptides.map { |pep| pep.aa_counts }
      arff = make_rt_arff(Time.now.nsec.to_s, data)
      predictions = run_weka("java weka.classifiers.functions.MultilayerPerceptron -T #{arff} -l bin/weka/M5Rules.model -p 24 > #{arff}.out", arff)
      predictions.each_with_index { |value, i| peptides[i].p_rt = value }
      peptides
    end

    # Predicts intensities and stores them in each peptide's p_int.
    #
    # @param peptides [Array] peptides exposing mono_mz, charge, mass, p_rt,
    #   aa_counts and p_int=
    # @return [Array] the same peptides
    def predict_ints(peptides)
      data = peptides.map do |pep|
        [pep.mono_mz, pep.charge, pep.mass, pep.p_rt].concat(pep.aa_counts)
      end
      arff = make_int_arff(Time.now.nsec.to_s, data)
      predictions = run_weka("java weka.classifiers.trees.M5P -T #{arff} -l bin/weka/M5P.model -p 27 > #{arff}.out", arff)
      predictions.each_with_index { |value, i| peptides[i].p_int = value }
      peptides
    end

    #James Dalg
    # Writes the retention-time ARFF file.
    #
    # @param sourcefile [String] path without extension; ".arff" is appended
    #   (the argument itself is no longer modified)
    # @param training [Array<Array>] one row of attribute values per peptide
    # @return [String] path of the file written
    def make_rt_arff(sourcefile, training)
      write_arff("#{sourcefile}.arff", RT_ATTRIBUTES, training)
    end

    #James Dalg
    # Writes the intensity ARFF file.
    #
    # @param sourcefile [String] path without extension; ".arff" is appended
    #   (the argument itself is no longer modified)
    # @param training [Array<Array>] one row of attribute values per peptide
    # @return [String] path of the file written
    def make_int_arff(sourcefile, training)
      write_arff("#{sourcefile}.arff", INT_ATTRIBUTES, training)
    end

    # Runs a weka command, deletes its ARFF input, and returns the predicted
    # values parsed from "<arff>.out". The output file is removed afterwards,
    # including when parsing fails.
    #
    # @param command [String] full shell command, redirecting to "<arff>.out"
    # @param arff [String] path of the ARFF input file
    # @return [Array<Float>] one value per output line containing a decimal
    def run_weka(command, arff)
      out_path = "#{arff}.out"
      system(command)
      File.delete(arff) if File.exist?(arff)

      #extract what was predicted by weka model
      predictions = []
      File.foreach(out_path) do |line|
        found = line.match(PREDICTION_PATTERN)
        predictions << found[0].to_f if found
      end
      predictions
    ensure
      File.delete(out_path) if out_path && File.exist?(out_path)
    end

    # Writes the ARFF header for `attributes`, then appends the rows as CSV.
    # The CSV file is opened once for all rows.
    #
    # @param path [String] file to create
    # @param attributes [Array<String>] attribute names, all NUMERIC
    # @param rows [Array<Array>] data rows
    # @return [String] path
    def write_arff(path, attributes, rows)
      header = ["%", "%", "@RELATION molecularinfo"]
      header.concat(attributes.map { |name| "@ATTRIBUTE #{name} NUMERIC" })
      header.concat(["@DATA", "%", "% "])

      File.open(path, "wb") { |f| f.puts header.join("\n") }
      CSV.open(path, "a") do |csv|
        rows.each { |row| csv << row }
      end
      path
    end
  end
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
|
2
|
+
module MS
  # Digests the proteins of a FASTA file and turns the resulting peptide
  # sequences into charged MS::Peptide objects.
  class Sim_Digester
    # Largest number of distinct peptides kept after digestion.
    MAX_PEPTIDES = 50000

    # Path of the temporary file holding one digested sequence per line.
    attr_accessor :digested_file

    # @param digestor key used to look up the enzyme in Mspire::Digester
    # @param pH [Numeric] pH at which peptide charge is evaluated
    def initialize(digestor, pH)
      @digestor = digestor
      @pH = pH
      @digested_file = ".#{Time.now.nsec.to_s}"
    end

    # Reads a FASTA file, digests every protein, and writes the unique
    # peptide sequences to @digested_file, one per line.
    #
    # Lines containing ">" and blank lines separate proteins; all other
    # lines are concatenated into the current protein sequence.
    #
    # @param file [String] path of the FASTA file
    # @return [Integer] number of peptide sequences written
    def create_digested_file(file)
      seq = ""
      File.foreach(file) do |sequence|
        seq << ((sequence =~ />/ or sequence == "\n") ? ";" : sequence.chomp)
      end
      proteins = seq.split(/;/).delete_if { |str| str == "" }

      trypsin = Mspire::Digester[@digestor]

      digested = []
      proteins.each do |prot|
        trypsin.digest(prot).each { |d| digested << d }
      end
      digested.uniq!

      # Fix: sample WITHOUT replacement. Drawing MAX_PEPTIDES random indices
      # could pick the same peptide several times, undoing the uniq! above.
      digested = digested.sample(MAX_PEPTIDES) if digested.length > MAX_PEPTIDES

      # Block form closes the file even if a write fails.
      File.open(@digested_file, "w") do |d_file|
        digested.each { |dig| d_file.puts(dig) }
      end

      num_digested = digested.size
      puts "Number of peptides: #{num_digested}"
      num_digested
    end

    # Digests `file` and builds the peptides.
    #
    # Each sequence's charge at @pH is computed with charge_at_pH, and one
    # peptide is created for the floor and one for the ceiling of that
    # charge, skipping a charge of 0. The temporary digest file is always
    # removed, including when a sequence makes the charge code abort.
    #
    # @param file [String] path of the FASTA file
    # @return [Array<MS::Peptide>]
    def digest(file)
      start = Time.now

      num_digested = create_digested_file(file)
      peptides = []

      begin
        File.foreach(@digested_file).with_index do |line, i|
          peptide_seq = line.chomp
          Progress.progress("Creating peptides '#{file}':", ((i / num_digested.to_f) * 100.0).to_i)

          charge_ratio = charge_at_pH(identify_potential_charges(peptide_seq), @pH)

          # uniq: a whole-number charge has floor == ceil and must yield a
          # single peptide rather than two identical ones.
          [charge_ratio.floor, charge_ratio.ceil].uniq.each do |charge|
            peptides << MS::Peptide.new(peptide_seq, charge) if charge != 0
          end
        end
      ensure
        File.delete(@digested_file) if File.exist?(@digested_file)
      end

      Progress.progress("Creating peptides '#{file}':", 100, Time.now - start)
      puts ''
      peptides
    end
  end
end
|