mspire 0.3.1 → 0.3.9
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +2 -2
- data/bin/bioworks_to_pepxml.rb +15 -3
- data/bin/ms_to_lmat.rb +2 -1
- data/bin/sqt_group.rb +26 -0
- data/changelog.txt +36 -0
- data/lib/ms/msrun.rb +3 -1
- data/lib/ms/parser/mzdata/dom.rb +14 -14
- data/lib/ms/scan.rb +3 -3
- data/lib/mspire.rb +1 -1
- data/lib/sample_enzyme.rb +39 -0
- data/lib/spec_id.rb +18 -0
- data/lib/spec_id/aa_freqs.rb +6 -9
- data/lib/spec_id/digestor.rb +16 -17
- data/lib/spec_id/mass.rb +63 -1
- data/lib/spec_id/parser/proph.rb +101 -2
- data/lib/spec_id/precision/filter.rb +3 -2
- data/lib/spec_id/precision/filter/cmdline.rb +3 -1
- data/lib/spec_id/precision/filter/output.rb +1 -0
- data/lib/spec_id/precision/prob.rb +88 -21
- data/lib/spec_id/precision/prob/cmdline.rb +28 -16
- data/lib/spec_id/precision/prob/output.rb +8 -2
- data/lib/spec_id/proph/pep_summary.rb +25 -12
- data/lib/spec_id/sequest.rb +28 -0
- data/lib/spec_id/sequest/pepxml.rb +142 -197
- data/lib/spec_id/sqt.rb +349 -0
- data/lib/spec_id/srf.rb +33 -23
- data/lib/validator.rb +40 -57
- data/lib/validator/aa.rb +3 -90
- data/lib/validator/aa_est.rb +112 -0
- data/lib/validator/cmdline.rb +163 -31
- data/lib/validator/decoy.rb +15 -7
- data/lib/validator/digestion_based.rb +5 -4
- data/lib/validator/q_value.rb +32 -0
- data/script/peps_per_bin.rb +67 -0
- data/script/sqt_to_meta.rb +24 -0
- data/specs/bin/bioworks_to_pepxml_spec.rb +3 -3
- data/specs/bin/fasta_shaker_spec.rb +2 -2
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +7 -10
- data/specs/bin/filter_and_validate_spec.rb +25 -6
- data/specs/bin/ms_to_lmat_spec.rb +2 -2
- data/specs/bin/prob_validate_spec.rb +5 -3
- data/specs/sample_enzyme_spec.rb +86 -1
- data/specs/spec_helper.rb +11 -9
- data/specs/spec_id/bioworks_spec.rb +2 -1
- data/specs/spec_id/precision/filter_spec.rb +5 -5
- data/specs/spec_id/precision/prob_spec.rb +0 -67
- data/specs/spec_id/proph/pep_summary_spec.rb +42 -87
- data/specs/spec_id/protein_summary_spec.rb +4 -4
- data/specs/spec_id/sequest/pepxml_spec.rb +1 -79
- data/specs/spec_id/sequest_spec.rb +38 -0
- data/specs/spec_id/sqt_spec.rb +111 -3
- data/specs/spec_id_spec.rb +2 -0
- data/specs/transmem/phobius_spec.rb +3 -1
- data/specs/transmem/toppred_spec.rb +1 -1
- data/specs/validator/aa_est_spec.rb +66 -0
- data/specs/validator/aa_spec.rb +1 -68
- data/specs/validator/background_spec.rb +2 -0
- data/specs/validator/bias_spec.rb +3 -27
- data/specs/validator/decoy_spec.rb +2 -2
- data/specs/validator/transmem_spec.rb +2 -1
- data/test_files/small.sqt +87 -0
- metadata +312 -293
data/Rakefile
CHANGED
@@ -238,8 +238,8 @@ spec = Gem::Specification.new do |s|
|
|
238
238
|
s.rdoc_options = rdoc_options
|
239
239
|
s.extra_rdoc_files = rdoc_extra_includes
|
240
240
|
s.executables = FL["bin/*"].map {|file| File.basename(file) }
|
241
|
-
s.add_dependency('libjtp', '~> 0.2.
|
242
|
-
s.add_dependency('axml')
|
241
|
+
s.add_dependency('libjtp', '~> 0.2.13')
|
242
|
+
s.add_dependency('axml', '~> 0.0.0')
|
243
243
|
s.requirements << '"libxml" is the prefered xml parser right now. libxml, xmlparser, REXML and regular expressions are used as fallback in some routines.'
|
244
244
|
s.requirements << 'some plotting functions will not be available without the "gnuplot" gem (and underlying gnuplot binary)'
|
245
245
|
s.requirements << 'the "t2x" binary (in archive) or readw.exe is required to convert .RAW files to mzXML in some applications'
|
data/bin/bioworks_to_pepxml.rb
CHANGED
@@ -43,14 +43,26 @@ opt_obj = OptionParser.new do |op|
|
|
43
43
|
op.separator "Options:"
|
44
44
|
op.on('-h', '--help', "display this and more notes and exit") {|v| opt.help = v }
|
45
45
|
op.on('-o', '--outdir path', "output directory d: '#{DEFAULT_OUTDIR}'") {|v| opt.outdir = v }
|
46
|
+
op.on('--sample_enzyme <type>', "For digested samples run with no enzymatic",
|
47
|
+
"search constraint, the enzyme used for",
|
48
|
+
"digestion, options: 'Trypsin_KR_P'") {|v|
|
49
|
+
case v
|
50
|
+
when 'Trypsin_KR_P'
|
51
|
+
opt.sample_enzyme = SampleEnzyme.new("trypsin")
|
52
|
+
else
|
53
|
+
raise ArgumentError, "Don't recognize enzyme: #{v}"
|
54
|
+
end
|
55
|
+
}
|
56
|
+
op.on('-a', '--all_hits', "includes all hits, not just top xcorr") {|v| opt.all_hits = v }
|
57
|
+
op.on('--deltacn_orig', "top hit deltacn = 0.0, (no deltacnstar att)") {|v| opt.deltacn_orig = v }
|
58
|
+
op.on('-m', '--mspath path', "path to MS files d: '#{DEFAULT_MZ_PATH}'") {|v| opt.mspath = v }
|
59
|
+
op.on('--copy_mzxml', "copies mzXML files to outdir path"){|v| opt.copy_mzxml = v }
|
46
60
|
|
47
61
|
op.separator ""
|
48
62
|
op.separator "bioworks.xml files may require additional options:"
|
49
63
|
op.separator ""
|
50
64
|
op.on('-p', '--params file', "sequest params file d: '#{DEFAULT_PARAMS_FILE}'") {|v| opt.params = v }
|
51
65
|
op.on('-d', '--dbpath path', "path to databases d: '#{DEFAULT_DATABASE_PATH}'") {|v| opt.dbpath = v }
|
52
|
-
op.on('-m', '--mspath path', "path to MS files d: '#{DEFAULT_MZ_PATH}'") {|v| opt.mspath = v }
|
53
|
-
op.on('--copy_mzxml', "copies mzXML files to outdir path"){|v| opt.copy_mzxml = v }
|
54
66
|
op.on('--model <LCQ|Orbi|string>', "MS model (xml) d: '#{DEFAULT_MS_MODEL}'") {|v| opt.model = v }
|
55
67
|
op.on('--mass_analyzer <string>', "Mass Analyzer (xml) d: '#{DEFAULT_MASS_ANALYZER}'") {|v| opt.mass_analyzer = v }
|
56
68
|
|
@@ -131,5 +143,5 @@ opt.params ||= DEFAULT_PARAMS_FILE
|
|
131
143
|
opt.mass_analyzer ||= DEFAULT_MASS_ANALYZER
|
132
144
|
opt.model ||= DEFAULT_MS_MODEL
|
133
145
|
|
134
|
-
xml_objs = Sequest::PepXML.set_from_bioworks(bioworks_file, {:params => opt.params, :ms_data => opt.mspath, :out_path => opt.outdir, :model => model, :backup_db_path => opt.dbpath, :copy_mzxml => opt.copy_mzxml, :ms_mass_analyzer => opt.mass_analyzer, :print => true})
|
146
|
+
xml_objs = Sequest::PepXML.set_from_bioworks(bioworks_file, {:params => opt.params, :ms_data => opt.mspath, :out_path => opt.outdir, :model => model, :backup_db_path => opt.dbpath, :copy_mzxml => opt.copy_mzxml, :ms_mass_analyzer => opt.mass_analyzer, :print => true, :all_hits => opt.all_hits, :deltacn_orig => opt.deltacn_orig, :sample_enzyme => opt.sample_enzyme})
|
135
147
|
|
data/bin/ms_to_lmat.rb
CHANGED
@@ -47,7 +47,8 @@ ARGV.each do |file|
|
|
47
47
|
}
|
48
48
|
args.merge!(opt)
|
49
49
|
lmat = LMat.new.from_times_and_spectra(times, spectra, args)
|
50
|
-
|
50
|
+
ext = File.extname(file)
|
51
|
+
outfile = file.sub(/#{Regexp.escape(ext)}$/, opt[:newext])
|
51
52
|
if args[:ascii]
|
52
53
|
outfile << "a"
|
53
54
|
lmat.print(outfile)
|
data/bin/sqt_group.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
require 'spec_id/sqt'
|
5
|
+
|
6
|
+
$OUTFILE = 'bioworks.sqg'
|
7
|
+
|
8
|
+
opts = OptionParser.new do |op|
|
9
|
+
op.banner = "usage: #{File.basename(__FILE__)} <file1>.sqt <file2>.sqt ..."
|
10
|
+
op.separator "outputs: 'bioworks.sqg'"
|
11
|
+
op.separator ""
|
12
|
+
op.separator " A '.sqg' file is an ascii text file with a list"
|
13
|
+
op.separator " of the sqt files (full path names) in that group."
|
14
|
+
op.separator ""
|
15
|
+
op.on('-o', '--output <filename>', 'a different output name') {|v| $OUTFILE }
|
16
|
+
end
|
17
|
+
|
18
|
+
if ARGV.size == 0
|
19
|
+
puts opts
|
20
|
+
exit
|
21
|
+
end
|
22
|
+
|
23
|
+
obj = SQTGroup.new
|
24
|
+
obj.filenames = ARGV.to_a
|
25
|
+
obj.to_sqg($OUTFILE)
|
26
|
+
|
data/changelog.txt
CHANGED
@@ -126,3 +126,39 @@ interfaces and implementations (using ArrayClass)
|
|
126
126
|
## version 0.3.1
|
127
127
|
|
128
128
|
1. Bug fix in srf filtering (num_hits adjusted)
|
129
|
+
|
130
|
+
## version 0.3.2
|
131
|
+
|
132
|
+
1. Uses sequest peptide_mass_tolerance filter on srf group files by default
|
133
|
+
now.
|
134
|
+
|
135
|
+
## version 0.3.3
|
136
|
+
|
137
|
+
1. Worked out minor kinks in prob_precision.rb
|
138
|
+
|
139
|
+
## version 0.3.4
|
140
|
+
|
141
|
+
1. filters >= +3 charged ions now.
|
142
|
+
|
143
|
+
## version 0.3.5
|
144
|
+
|
145
|
+
1. fixed creation of background distribution in validators (hash_by base_name,
|
146
|
+
first_scan, charge now)
|
147
|
+
|
148
|
+
## version 0.3.6
|
149
|
+
|
150
|
+
1. split off bad_aa_est from bad_aa
|
151
|
+
|
152
|
+
## version 0.3.7
|
153
|
+
|
154
|
+
1. can deal with No_Enzyme searches now (while still capable of setting
|
155
|
+
sample_enzyme)
|
156
|
+
|
157
|
+
## version 0.3.8
|
158
|
+
|
159
|
+
1. can set a decoy to target ratio for decoy validation
|
160
|
+
2. added mass calculator in Mass::Calculator
|
161
|
+
|
162
|
+
## version 0.3.9
|
163
|
+
|
164
|
+
1. doesn't clobber mzdata filename in ms_to_lmat.rb conversion
|
data/lib/ms/msrun.rb
CHANGED
@@ -30,7 +30,9 @@ class MS::MSRun
|
|
30
30
|
myopts = opts.dup ; myopts[:msrun] = self
|
31
31
|
if file
|
32
32
|
filetype_and_version = MS::Parser.filetype_and_version(file)
|
33
|
-
MS::Parser.new(filetype_and_version, :msrun)
|
33
|
+
parser = MS::Parser.new(filetype_and_version, :msrun)
|
34
|
+
parser.parse(file, myopts)
|
35
|
+
#MS::Parser.new(filetype_and_version, :msrun).parse(file, myopts)
|
34
36
|
(@filetype, @version) = filetype_and_version
|
35
37
|
end
|
36
38
|
end
|
data/lib/ms/parser/mzdata/dom.rb
CHANGED
@@ -51,23 +51,20 @@ class MS::Parser::MzData::DOM
|
|
51
51
|
# %w(num msLevel retentionTime startMz endMz precursors spectrum)
|
52
52
|
|
53
53
|
root = get_root_node_from_file(file)
|
54
|
-
scan_count = 0
|
55
54
|
description = root.find_first('child::description')
|
56
55
|
bioworks33 = is_bioworks33?(description)
|
57
56
|
spectrum_list = description.next
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
end
|
57
|
+
|
58
|
+
scans = []
|
59
|
+
|
60
|
+
# bioworks 33 gives incorrect scan count
|
61
|
+
stated_num_scans = spectrum_list['count'].to_i
|
64
62
|
|
65
63
|
# if I move from node to node, it means I've checked that it's a sequence
|
66
64
|
# and that the elements are req'd
|
67
65
|
if spectrum_list.child?
|
68
66
|
spectrum_n = spectrum_list.child
|
69
67
|
loop do
|
70
|
-
scan_count += 1
|
71
68
|
scan = MS::Scan.new(9)
|
72
69
|
id = spectrum_n["id"].to_i
|
73
70
|
id_to_scan_hash[id] = scan
|
@@ -81,11 +78,9 @@ class MS::Parser::MzData::DOM
|
|
81
78
|
spec_inst_n = spec_settings_n.find_first('child::spectrumInstrument')
|
82
79
|
scan[1] = spec_inst_n['msLevel'].to_i
|
83
80
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
scans[scan_count] = scan
|
88
|
-
end
|
81
|
+
# we could use a scan_count, but in bioworks 33, we can't trust the
|
82
|
+
# scan count! So, we just collect them
|
83
|
+
scans << scan
|
89
84
|
|
90
85
|
scan[3] = spec_inst_n['mzRangeStart'].to_f
|
91
86
|
scan[4] = spec_inst_n['mzRangeStop'].to_f
|
@@ -149,7 +144,12 @@ class MS::Parser::MzData::DOM
|
|
149
144
|
MS::MSRun.add_parent_scan(scans, opts[:spectra])
|
150
145
|
end
|
151
146
|
msrun_obj.scans = scans
|
152
|
-
msrun_obj.scan_count =
|
147
|
+
msrun_obj.scan_count = scans.size
|
148
|
+
unless bioworks33 # we know the scan count is off here
|
149
|
+
if msrun_obj.scan_count != stated_num_scans
|
150
|
+
warn "num collected scans (#{scans.size}) does not agree with stated num scans (#{stated_num_scans})!"
|
151
|
+
end
|
152
|
+
end
|
153
153
|
msrun_obj.start_time = msrun_obj.scans.first.time
|
154
154
|
msrun_obj.end_time = msrun_obj.scans.last.time
|
155
155
|
end
|
data/lib/ms/scan.rb
CHANGED
@@ -28,7 +28,7 @@ class MS::Scan
|
|
28
28
|
atts = %w(num ms_level time start_mz end_mz)
|
29
29
|
display = atts.map do |att|
|
30
30
|
if val = send(att.to_sym)
|
31
|
-
"
|
31
|
+
"#{att}=#{val}"
|
32
32
|
else
|
33
33
|
nil
|
34
34
|
end
|
@@ -38,9 +38,9 @@ class MS::Scan
|
|
38
38
|
if spectrum
|
39
39
|
spectrum.mz.size
|
40
40
|
else
|
41
|
-
nil
|
41
|
+
'nil'
|
42
42
|
end
|
43
|
-
"<MS::Scan:#{__id__} " + display.join(", ") + "
|
43
|
+
"<MS::Scan:#{__id__} " + display.join(", ") + " precursors=#{precursors.inspect}" + " spectrum(size)=#{spec_display}" + " >"
|
44
44
|
end
|
45
45
|
|
46
46
|
# returns the string (space delimited): "ms_level num time [prec_mz prec_inten]"
|
data/lib/mspire.rb
CHANGED
data/lib/sample_enzyme.rb
CHANGED
@@ -23,6 +23,7 @@ class SampleEnzyme
|
|
23
23
|
# For other enzymes, you must set :cut, :no_cut, :name, and :sense
|
24
24
|
# will yield the object if you want to set the values that way
|
25
25
|
def initialize(name=nil)
|
26
|
+
@num_missed_cleavages_regex = nil
|
26
27
|
@sense = nil
|
27
28
|
@cut = nil
|
28
29
|
@no_cut = nil
|
@@ -62,6 +63,44 @@ class SampleEnzyme
|
|
62
63
|
self.new.from_pepxml_node(node)
|
63
64
|
end
|
64
65
|
|
66
|
+
# takes an amino acid sequence (e.g., -.PEPTIDK.L)
|
67
|
+
# returns the number of missed cleavages
|
68
|
+
def num_missed_cleavages(aaseq)
|
69
|
+
raise NotImplementedError, 'need to implement for N terminal sense' if sense == 'N'
|
70
|
+
@num_missed_cleavages_regex =
|
71
|
+
if @num_missed_cleavages_regex ; @num_missed_cleavages_regex
|
72
|
+
else
|
73
|
+
regex_string = "[#{@cut}]"
|
74
|
+
if @no_cut and @no_cut != ''
|
75
|
+
regex_string << "[^#{@no_cut}]"
|
76
|
+
end
|
77
|
+
/#{regex_string}/
|
78
|
+
end
|
79
|
+
arr = aaseq.scan(@num_missed_cleavages_regex)
|
80
|
+
num = arr.size
|
81
|
+
if aaseq[-1,1] =~ @num_missed_cleavages_regex
|
82
|
+
num -= 1
|
83
|
+
end
|
84
|
+
num
|
85
|
+
end
|
86
|
+
|
87
|
+
# requires full sequence (with heads and tails)
|
88
|
+
def num_tol_term(sequence)
|
89
|
+
raise NotImplementedError, 'need to implement for N terminal sense' if sense == 'N'
|
90
|
+
no_cut = @no_cut || ''
|
91
|
+
num_tol = 0
|
92
|
+
first, middle, last = SpecID::Pep.split_sequence(sequence)
|
93
|
+
last_of_middle = middle[-1,1]
|
94
|
+
first_of_middle = middle[0,1]
|
95
|
+
if ( @cut.include?(first) && !no_cut.include?(first_of_middle) ) || first == '-'
|
96
|
+
num_tol += 1
|
97
|
+
end
|
98
|
+
if @cut.include?(last_of_middle) && !no_cut.include?(last) || last == '-'
|
99
|
+
num_tol += 1
|
100
|
+
end
|
101
|
+
num_tol
|
102
|
+
end
|
103
|
+
|
65
104
|
# returns all peptides of missed cleavages <= 'missed_cleavages'
|
66
105
|
# so 2 missed cleavages will return all no missed cleavage peptides
|
67
106
|
# all 1 missed cleavages and all 2 missed cleavages.
|
data/lib/spec_id.rb
CHANGED
@@ -7,6 +7,7 @@ require 'spec_id/bioworks'
|
|
7
7
|
require 'spec_id/sequest'
|
8
8
|
require 'spec_id/proph/prot_summary'
|
9
9
|
require 'spec_id_xml'
|
10
|
+
require 'spec_id/sqt'
|
10
11
|
require 'spec_id/mass'
|
11
12
|
require 'fasta'
|
12
13
|
|
@@ -71,6 +72,10 @@ module SpecID
|
|
71
72
|
Proph::ProtSummary.new(file)
|
72
73
|
when 'pepproph'
|
73
74
|
Proph::PepSummary.new(file)
|
75
|
+
when 'sqg'
|
76
|
+
SQTGroup.new(file)
|
77
|
+
when 'sqt'
|
78
|
+
SQTGroup.new([file])
|
74
79
|
else
|
75
80
|
abort "UNRECOGNIZED file type for #{file}"
|
76
81
|
end
|
@@ -447,6 +452,8 @@ module SpecID
|
|
447
452
|
def self.file_type(file)
|
448
453
|
if file =~ /\.srg$/
|
449
454
|
return 'srg'
|
455
|
+
elsif file =~ /\.sqg$/
|
456
|
+
return 'sqg'
|
450
457
|
end
|
451
458
|
if IO.read(file, 7,438) == 'Enzyme:'
|
452
459
|
return 'srf'
|
@@ -461,6 +468,17 @@ module SpecID
|
|
461
468
|
elsif lines =~ /<msms_pipeline_analysis.*<peptideprophet_summary/m
|
462
469
|
return 'pepproph'
|
463
470
|
end
|
471
|
+
# assumes the header of a sqt file is less than 200 lines ...
|
472
|
+
200.times do
|
473
|
+
line = fh.gets
|
474
|
+
if line
|
475
|
+
lines << line
|
476
|
+
else ; break
|
477
|
+
end
|
478
|
+
end
|
479
|
+
if lines =~ /^H\tDatabase/ and lines =~ /^H\tSQTGenerator/
|
480
|
+
return 'sqt'
|
481
|
+
end
|
464
482
|
end
|
465
483
|
end
|
466
484
|
|
data/lib/spec_id/aa_freqs.rb
CHANGED
@@ -3,30 +3,27 @@ require 'fasta'
|
|
3
3
|
module SpecID ; end
|
4
4
|
|
5
5
|
class SpecID::AAFreqs
|
6
|
-
# a fasta object
|
7
|
-
attr_accessor :fasta
|
8
6
|
# hash by capital one-letter amino acid symbols giving the frequency of
|
9
7
|
# seeing that amino acid. Frequencies should add to 1.
|
10
8
|
attr_accessor :aafreqs
|
11
9
|
|
12
10
|
# fasta is fasta object!
|
13
11
|
def initialize(fasta=nil)
|
14
|
-
|
15
|
-
|
16
|
-
@aafreqs = calculate_frequencies(@fasta)
|
12
|
+
if fasta
|
13
|
+
@aafreqs = calculate_frequencies(fasta.prots)
|
17
14
|
end
|
18
15
|
end
|
19
16
|
|
20
|
-
# creates an aafreqs hash
|
21
|
-
def calculate_frequencies(
|
17
|
+
# takes an enumerable of objects responding to :aaseq and creates an aafreqs hash
|
18
|
+
def calculate_frequencies(objs)
|
22
19
|
hash = {}
|
23
20
|
total_aas = 0
|
24
21
|
('A'..'Z').each do |x|
|
25
22
|
hash[x] = 0
|
26
23
|
end
|
27
24
|
hash['*'] = 0
|
28
|
-
|
29
|
-
aaseq =
|
25
|
+
objs.each do |obj|
|
26
|
+
aaseq = obj.aaseq
|
30
27
|
total_aas += aaseq.size
|
31
28
|
aaseq.split('').each do |x|
|
32
29
|
hash[x] += 1
|
data/lib/spec_id/digestor.rb
CHANGED
@@ -100,38 +100,37 @@ class Digestor
|
|
100
100
|
# The prot_aaseq is used if the mass_hash contains the keys
|
101
101
|
# :add_C_term_protein or :add_N_term_protein
|
102
102
|
#
|
103
|
+
# mass_hash requires the key :h_plus or :h depending on h_plus option.
|
103
104
|
# prot_aaseqs is parallel to pep_aaseqs_ar where each is a group of
|
104
105
|
# peptides matching a protein aaseq
|
105
|
-
# returns another parallel array of passing
|
106
|
+
# returns another parallel array of passing peptides per protein
|
106
107
|
def limit_sizes(prot_aaseqs, pep_aaseqs_ar, min_mh, max_mh, mass_hash, h_plus=false)
|
107
108
|
if mass_hash.key?(:add_C_term_protein) or mass_hash.key?(:add_N_term_protein)
|
108
109
|
raise NotImplementedError, "need to add ability to change weights of peptides from the ends of proteins"
|
109
110
|
else
|
110
111
|
# figure out how much must be added to each peptide
|
111
112
|
# include the h2o, the h, and N and C terminal static mods
|
112
|
-
|
113
|
-
|
113
|
+
h_plus_key = h_plus ? :h_plus : :h
|
114
|
+
extra_add = mass_hash[h_plus_key]
|
114
115
|
[:add_N_term_peptide, :add_C_term_peptide].each do |sym|
|
115
116
|
if mass_hash.key?(sym)
|
116
|
-
|
117
|
+
extra_add += mass_hash[sym]
|
117
118
|
end
|
118
119
|
end
|
119
|
-
|
120
|
-
|
120
|
+
mc = Mass::Calculator.new(mass_hash, extra_add)
|
121
|
+
|
122
|
+
masses_per_group = pep_aaseqs_ar.map do |pep_aaseqs|
|
123
|
+
mc.masses(pep_aaseqs)
|
124
|
+
end
|
121
125
|
|
122
|
-
pep_aaseqs_ar.map do
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
puts 'NOT FOUND'
|
128
|
-
p let
|
129
|
-
end
|
130
|
-
sum += hash_by_aa_string[let]
|
126
|
+
masses_per_group.zip(pep_aaseqs_ar).map do |masses, aaseqs|
|
127
|
+
passing = []
|
128
|
+
aaseqs.zip(masses) do |aaseq, mh_plus|
|
129
|
+
if ( (mh_plus >= min_mh) and (mh_plus <= max_mh) )
|
130
|
+
passing << aaseq
|
131
131
|
end
|
132
|
-
mh_plus = sum + final_add
|
133
|
-
( (mh_plus >= min_mh) and (mh_plus <= max_mh) )
|
134
132
|
end
|
133
|
+
passing
|
135
134
|
end
|
136
135
|
end
|
137
136
|
end
|
data/lib/spec_id/mass.rb
CHANGED
@@ -29,13 +29,13 @@ class Mass
|
|
29
29
|
:U => 150.95364, # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
|
30
30
|
:X => 118.805716, # the average of the mono masses of the 20 amino acids
|
31
31
|
:* => 118.805716, # same as X
|
32
|
+
:Z => (129.04259 + 128.05858) / 2, # average glutamic acid and glutamine
|
32
33
|
|
33
34
|
# elements etc.
|
34
35
|
:h => 1.00783,
|
35
36
|
:h_plus => 1.00728,
|
36
37
|
:o => 15.9949146,
|
37
38
|
:h2o => 18.01056,
|
38
|
-
|
39
39
|
}
|
40
40
|
AVG = {
|
41
41
|
:A => 71.0788,
|
@@ -64,6 +64,7 @@ class Mass
|
|
64
64
|
:U => 150.03, # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
|
65
65
|
:X => 118.88603, # the average of the masses of the 20 amino acids
|
66
66
|
:* => 118.88603, # same as X
|
67
|
+
:Z => (129.1155+ 128.1307) / 2, # average glutamic acid and glutamine
|
67
68
|
|
68
69
|
# elements etc.
|
69
70
|
:h => 1.00794,
|
@@ -112,5 +113,66 @@ class Mass
|
|
112
113
|
end
|
113
114
|
copy_hash
|
114
115
|
end
|
116
|
+
|
117
|
+
# returns an array of masses parallel to array passed in
|
118
|
+
# If you want the mass with H+, pass the mass of H+ as the h_plus argument
|
119
|
+
# The mass hash must respond to
|
120
|
+
# :h2o (water)
|
121
|
+
# and at least the twenty amino acids (by string or symbol)
|
122
|
+
# The mass hash may respond to :add_N_term_peptide or :add_C_term_peptide
|
123
|
+
# in which case these will be added to the final mass
|
124
|
+
def self.masses(aaseqs, mass_hash=Mass::MONO, h_plus=0.0)
|
125
|
+
final_add = mass_hash[:h2o] + h_plus
|
126
|
+
[:add_N_term_peptide, :add_C_term_peptide].each do |sym|
|
127
|
+
if mass_hash.key?(sym)
|
128
|
+
final_add += mass_hash[sym]
|
129
|
+
end
|
130
|
+
end
|
131
|
+
hash_by_aa_string = {}
|
132
|
+
mass_hash.each {|k,v| hash_by_aa_string[k.to_s] = mass_hash[k] }
|
133
|
+
|
134
|
+
aaseqs.map do |pep_aaseqs|
|
135
|
+
sum = 0.0
|
136
|
+
aaseq.split('').each do |let|
|
137
|
+
sum += hash_by_aa_string[let]
|
138
|
+
end
|
139
|
+
mh_plus = sum + final_add
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
|
144
|
+
end
|
145
|
+
|
146
|
+
class Mass::Calculator
|
147
|
+
|
148
|
+
# mass_hash must respond to :h2o or 'h2o'. This is added to represent the
|
149
|
+
# tails of the peptide. add_extra is outside of that (e.g., an H+)
|
150
|
+
def initialize(mass_hash, add_extra=0.0)
|
151
|
+
@mass_hash = mass_hash_to_s(mass_hash)
|
152
|
+
@final_add = @mass_hash['h2o'] + add_extra
|
153
|
+
end
|
154
|
+
|
155
|
+
def mass_hash_to_s(mass_hash)
|
156
|
+
new_hash = {}
|
157
|
+
mass_hash.each do |k,v|
|
158
|
+
new_hash[k.to_s] = v
|
159
|
+
end
|
160
|
+
new_hash
|
161
|
+
end
|
162
|
+
|
163
|
+
def masses(aaseqs)
|
164
|
+
aaseqs.map do |aaseq|
|
165
|
+
sum = @final_add # <- add in the initialization
|
166
|
+
aaseq.split('').each do |let|
|
167
|
+
if @mass_hash.key? let
|
168
|
+
sum += @mass_hash[let]
|
169
|
+
else
|
170
|
+
abort "LETTER not found in mass_hash: #{let}"
|
171
|
+
end
|
172
|
+
end
|
173
|
+
sum
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
115
177
|
end
|
116
178
|
|