mspire 0.3.1 → 0.3.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +2 -2
- data/bin/bioworks_to_pepxml.rb +15 -3
- data/bin/ms_to_lmat.rb +2 -1
- data/bin/sqt_group.rb +26 -0
- data/changelog.txt +36 -0
- data/lib/ms/msrun.rb +3 -1
- data/lib/ms/parser/mzdata/dom.rb +14 -14
- data/lib/ms/scan.rb +3 -3
- data/lib/mspire.rb +1 -1
- data/lib/sample_enzyme.rb +39 -0
- data/lib/spec_id.rb +18 -0
- data/lib/spec_id/aa_freqs.rb +6 -9
- data/lib/spec_id/digestor.rb +16 -17
- data/lib/spec_id/mass.rb +63 -1
- data/lib/spec_id/parser/proph.rb +101 -2
- data/lib/spec_id/precision/filter.rb +3 -2
- data/lib/spec_id/precision/filter/cmdline.rb +3 -1
- data/lib/spec_id/precision/filter/output.rb +1 -0
- data/lib/spec_id/precision/prob.rb +88 -21
- data/lib/spec_id/precision/prob/cmdline.rb +28 -16
- data/lib/spec_id/precision/prob/output.rb +8 -2
- data/lib/spec_id/proph/pep_summary.rb +25 -12
- data/lib/spec_id/sequest.rb +28 -0
- data/lib/spec_id/sequest/pepxml.rb +142 -197
- data/lib/spec_id/sqt.rb +349 -0
- data/lib/spec_id/srf.rb +33 -23
- data/lib/validator.rb +40 -57
- data/lib/validator/aa.rb +3 -90
- data/lib/validator/aa_est.rb +112 -0
- data/lib/validator/cmdline.rb +163 -31
- data/lib/validator/decoy.rb +15 -7
- data/lib/validator/digestion_based.rb +5 -4
- data/lib/validator/q_value.rb +32 -0
- data/script/peps_per_bin.rb +67 -0
- data/script/sqt_to_meta.rb +24 -0
- data/specs/bin/bioworks_to_pepxml_spec.rb +3 -3
- data/specs/bin/fasta_shaker_spec.rb +2 -2
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +7 -10
- data/specs/bin/filter_and_validate_spec.rb +25 -6
- data/specs/bin/ms_to_lmat_spec.rb +2 -2
- data/specs/bin/prob_validate_spec.rb +5 -3
- data/specs/sample_enzyme_spec.rb +86 -1
- data/specs/spec_helper.rb +11 -9
- data/specs/spec_id/bioworks_spec.rb +2 -1
- data/specs/spec_id/precision/filter_spec.rb +5 -5
- data/specs/spec_id/precision/prob_spec.rb +0 -67
- data/specs/spec_id/proph/pep_summary_spec.rb +42 -87
- data/specs/spec_id/protein_summary_spec.rb +4 -4
- data/specs/spec_id/sequest/pepxml_spec.rb +1 -79
- data/specs/spec_id/sequest_spec.rb +38 -0
- data/specs/spec_id/sqt_spec.rb +111 -3
- data/specs/spec_id_spec.rb +2 -0
- data/specs/transmem/phobius_spec.rb +3 -1
- data/specs/transmem/toppred_spec.rb +1 -1
- data/specs/validator/aa_est_spec.rb +66 -0
- data/specs/validator/aa_spec.rb +1 -68
- data/specs/validator/background_spec.rb +2 -0
- data/specs/validator/bias_spec.rb +3 -27
- data/specs/validator/decoy_spec.rb +2 -2
- data/specs/validator/transmem_spec.rb +2 -1
- data/test_files/small.sqt +87 -0
- metadata +312 -293
data/Rakefile
CHANGED
@@ -238,8 +238,8 @@ spec = Gem::Specification.new do |s|
|
|
238
238
|
s.rdoc_options = rdoc_options
|
239
239
|
s.extra_rdoc_files = rdoc_extra_includes
|
240
240
|
s.executables = FL["bin/*"].map {|file| File.basename(file) }
|
241
|
-
s.add_dependency('libjtp', '~> 0.2.
|
242
|
-
s.add_dependency('axml')
|
241
|
+
s.add_dependency('libjtp', '~> 0.2.13')
|
242
|
+
s.add_dependency('axml', '~> 0.0.0')
|
243
243
|
s.requirements << '"libxml" is the prefered xml parser right now. libxml, xmlparser, REXML and regular expressions are used as fallback in some routines.'
|
244
244
|
s.requirements << 'some plotting functions will not be available without the "gnuplot" gem (and underlying gnuplot binary)'
|
245
245
|
s.requirements << 'the "t2x" binary (in archive) or readw.exe is required to convert .RAW files to mzXML in some applications'
|
data/bin/bioworks_to_pepxml.rb
CHANGED
@@ -43,14 +43,26 @@ opt_obj = OptionParser.new do |op|
|
|
43
43
|
op.separator "Options:"
|
44
44
|
op.on('-h', '--help', "display this and more notes and exit") {|v| opt.help = v }
|
45
45
|
op.on('-o', '--outdir path', "output directory d: '#{DEFAULT_OUTDIR}'") {|v| opt.outdir = v }
|
46
|
+
op.on('--sample_enzyme <type>', "For digested samples run with no enzymatic",
|
47
|
+
"search constraint, the enzyme used for",
|
48
|
+
"digestion, options: 'Trypsin_KR_P'") {|v|
|
49
|
+
case v
|
50
|
+
when 'Trypsin_KR_P'
|
51
|
+
opt.sample_enzyme = SampleEnzyme.new("trypsin")
|
52
|
+
else
|
53
|
+
raise ArgumentError, "Don't recognize enzyme: #{v}"
|
54
|
+
end
|
55
|
+
}
|
56
|
+
op.on('-a', '--all_hits', "includes all hits, not just top xcorr") {|v| opt.all_hits = v }
|
57
|
+
op.on('--deltacn_orig', "top hit deltacn = 0.0, (no deltacnstar att)") {|v| opt.deltacn_orig = v }
|
58
|
+
op.on('-m', '--mspath path', "path to MS files d: '#{DEFAULT_MZ_PATH}'") {|v| opt.mspath = v }
|
59
|
+
op.on('--copy_mzxml', "copies mzXML files to outdir path"){|v| opt.copy_mzxml = v }
|
46
60
|
|
47
61
|
op.separator ""
|
48
62
|
op.separator "bioworks.xml files may require additional options:"
|
49
63
|
op.separator ""
|
50
64
|
op.on('-p', '--params file', "sequest params file d: '#{DEFAULT_PARAMS_FILE}'") {|v| opt.params = v }
|
51
65
|
op.on('-d', '--dbpath path', "path to databases d: '#{DEFAULT_DATABASE_PATH}'") {|v| opt.dbpath = v }
|
52
|
-
op.on('-m', '--mspath path', "path to MS files d: '#{DEFAULT_MZ_PATH}'") {|v| opt.mspath = v }
|
53
|
-
op.on('--copy_mzxml', "copies mzXML files to outdir path"){|v| opt.copy_mzxml = v }
|
54
66
|
op.on('--model <LCQ|Orbi|string>', "MS model (xml) d: '#{DEFAULT_MS_MODEL}'") {|v| opt.model = v }
|
55
67
|
op.on('--mass_analyzer <string>', "Mass Analyzer (xml) d: '#{DEFAULT_MASS_ANALYZER}'") {|v| opt.mass_analyzer = v }
|
56
68
|
|
@@ -131,5 +143,5 @@ opt.params ||= DEFAULT_PARAMS_FILE
|
|
131
143
|
opt.mass_analyzer ||= DEFAULT_MASS_ANALYZER
|
132
144
|
opt.model ||= DEFAULT_MS_MODEL
|
133
145
|
|
134
|
-
xml_objs = Sequest::PepXML.set_from_bioworks(bioworks_file, {:params => opt.params, :ms_data => opt.mspath, :out_path => opt.outdir, :model => model, :backup_db_path => opt.dbpath, :copy_mzxml => opt.copy_mzxml, :ms_mass_analyzer => opt.mass_analyzer, :print => true})
|
146
|
+
xml_objs = Sequest::PepXML.set_from_bioworks(bioworks_file, {:params => opt.params, :ms_data => opt.mspath, :out_path => opt.outdir, :model => model, :backup_db_path => opt.dbpath, :copy_mzxml => opt.copy_mzxml, :ms_mass_analyzer => opt.mass_analyzer, :print => true, :all_hits => opt.all_hits, :deltacn_orig => opt.deltacn_orig, :sample_enzyme => opt.sample_enzyme})
|
135
147
|
|
data/bin/ms_to_lmat.rb
CHANGED
@@ -47,7 +47,8 @@ ARGV.each do |file|
|
|
47
47
|
}
|
48
48
|
args.merge!(opt)
|
49
49
|
lmat = LMat.new.from_times_and_spectra(times, spectra, args)
|
50
|
-
|
50
|
+
ext = File.extname(file)
|
51
|
+
outfile = file.sub(/#{Regexp.escape(ext)}$/, opt[:newext])
|
51
52
|
if args[:ascii]
|
52
53
|
outfile << "a"
|
53
54
|
lmat.print(outfile)
|
data/bin/sqt_group.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/ruby

# Command line tool that groups several .sqt files into one '.sqg' file.
#
# usage: sqt_group.rb [-o <filename>] <file1>.sqt <file2>.sqt ...
#
# With no file arguments the usage text is printed and the script exits.

require 'optparse'
require 'spec_id/sqt'

# name of the group file to write; replaced by the value given to -o/--output
$OUTFILE = 'bioworks.sqg'

opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} <file1>.sqt <file2>.sqt ..."
  op.separator "outputs: 'bioworks.sqg'"
  op.separator ""
  op.separator "   A '.sqg' file is an ascii text file with a list"
  op.separator "   of the sqt files (full path names) in that group."
  op.separator ""
  # the handler has to store the value it is given; merely mentioning
  # $OUTFILE (as before) left the default name in place
  op.on('-o', '--output <filename>', 'a different output name') {|v| $OUTFILE = v }
end

# parse! runs the option handlers and strips the recognized options out of
# ARGV, so that only the sqt file names remain below
opts.parse!

if ARGV.size == 0
  puts opts
  exit
end

obj = SQTGroup.new
obj.filenames = ARGV.to_a
obj.to_sqg($OUTFILE)
|
26
|
+
|
data/changelog.txt
CHANGED
@@ -126,3 +126,39 @@ interfaces and implementations (using ArrayClass)
|
|
126
126
|
## version 0.3.1
|
127
127
|
|
128
128
|
1. Bug fix in srf filtering (num_hits adjusted)
|
129
|
+
|
130
|
+
## version 0.3.2
|
131
|
+
|
132
|
+
1. Uses sequest peptide_mass_tolerance filter on srf group files by default
|
133
|
+
now.
|
134
|
+
|
135
|
+
## version 0.3.3
|
136
|
+
|
137
|
+
1. Worked out minor kinks in prob_precision.rb
|
138
|
+
|
139
|
+
## version 0.3.4
|
140
|
+
|
141
|
+
1. filters >= +3 charged ions now.
|
142
|
+
|
143
|
+
## version 0.3.5
|
144
|
+
|
145
|
+
1. fixed creation of background distribution in validators (hash_by base_name,
|
146
|
+
first_scan, charge now)
|
147
|
+
|
148
|
+
## version 0.3.6
|
149
|
+
|
150
|
+
1. split off bad_aa_est from bad_aa
|
151
|
+
|
152
|
+
## version 0.3.7
|
153
|
+
|
154
|
+
1. can deal with No_Enzyme searches now (while still capable of setting
|
155
|
+
sample_enzyme)
|
156
|
+
|
157
|
+
## version 0.3.8
|
158
|
+
|
159
|
+
1. can set a decoy to target ratio for decoy validation
|
160
|
+
2. added mass calculator in Mass::Calculator
|
161
|
+
|
162
|
+
## version 0.3.9
|
163
|
+
|
164
|
+
1. doesn't clobber mzdata filename in ms_to_lmat.rb conversion
|
data/lib/ms/msrun.rb
CHANGED
@@ -30,7 +30,9 @@ class MS::MSRun
|
|
30
30
|
myopts = opts.dup ; myopts[:msrun] = self
|
31
31
|
if file
|
32
32
|
filetype_and_version = MS::Parser.filetype_and_version(file)
|
33
|
-
MS::Parser.new(filetype_and_version, :msrun)
|
33
|
+
parser = MS::Parser.new(filetype_and_version, :msrun)
|
34
|
+
parser.parse(file, myopts)
|
35
|
+
#MS::Parser.new(filetype_and_version, :msrun).parse(file, myopts)
|
34
36
|
(@filetype, @version) = filetype_and_version
|
35
37
|
end
|
36
38
|
end
|
data/lib/ms/parser/mzdata/dom.rb
CHANGED
@@ -51,23 +51,20 @@ class MS::Parser::MzData::DOM
|
|
51
51
|
# %w(num msLevel retentionTime startMz endMz precursors spectrum)
|
52
52
|
|
53
53
|
root = get_root_node_from_file(file)
|
54
|
-
scan_count = 0
|
55
54
|
description = root.find_first('child::description')
|
56
55
|
bioworks33 = is_bioworks33?(description)
|
57
56
|
spectrum_list = description.next
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
end
|
57
|
+
|
58
|
+
scans = []
|
59
|
+
|
60
|
+
# bioworks 33 gives incorrect scan count
|
61
|
+
stated_num_scans = spectrum_list['count'].to_i
|
64
62
|
|
65
63
|
# if I move from node to node, it means I've checked that it's a sequence
|
66
64
|
# and that the elements are req'd
|
67
65
|
if spectrum_list.child?
|
68
66
|
spectrum_n = spectrum_list.child
|
69
67
|
loop do
|
70
|
-
scan_count += 1
|
71
68
|
scan = MS::Scan.new(9)
|
72
69
|
id = spectrum_n["id"].to_i
|
73
70
|
id_to_scan_hash[id] = scan
|
@@ -81,11 +78,9 @@ class MS::Parser::MzData::DOM
|
|
81
78
|
spec_inst_n = spec_settings_n.find_first('child::spectrumInstrument')
|
82
79
|
scan[1] = spec_inst_n['msLevel'].to_i
|
83
80
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
scans[scan_count] = scan
|
88
|
-
end
|
81
|
+
# we could use a scan_count, but in bioworks 33, we can't trust the
|
82
|
+
# scan count! So, we just collect them
|
83
|
+
scans << scan
|
89
84
|
|
90
85
|
scan[3] = spec_inst_n['mzRangeStart'].to_f
|
91
86
|
scan[4] = spec_inst_n['mzRangeStop'].to_f
|
@@ -149,7 +144,12 @@ class MS::Parser::MzData::DOM
|
|
149
144
|
MS::MSRun.add_parent_scan(scans, opts[:spectra])
|
150
145
|
end
|
151
146
|
msrun_obj.scans = scans
|
152
|
-
msrun_obj.scan_count =
|
147
|
+
msrun_obj.scan_count = scans.size
|
148
|
+
unless bioworks33 # we know the scan count is off here
|
149
|
+
if msrun_obj.scan_count != stated_num_scans
|
150
|
+
warn "num collected scans (#{scans.size}) does not agree with stated num scans (#{stated_num_scans})!"
|
151
|
+
end
|
152
|
+
end
|
153
153
|
msrun_obj.start_time = msrun_obj.scans.first.time
|
154
154
|
msrun_obj.end_time = msrun_obj.scans.last.time
|
155
155
|
end
|
data/lib/ms/scan.rb
CHANGED
@@ -28,7 +28,7 @@ class MS::Scan
|
|
28
28
|
atts = %w(num ms_level time start_mz end_mz)
|
29
29
|
display = atts.map do |att|
|
30
30
|
if val = send(att.to_sym)
|
31
|
-
"
|
31
|
+
"#{att}=#{val}"
|
32
32
|
else
|
33
33
|
nil
|
34
34
|
end
|
@@ -38,9 +38,9 @@ class MS::Scan
|
|
38
38
|
if spectrum
|
39
39
|
spectrum.mz.size
|
40
40
|
else
|
41
|
-
nil
|
41
|
+
'nil'
|
42
42
|
end
|
43
|
-
"<MS::Scan:#{__id__} " + display.join(", ") + "
|
43
|
+
"<MS::Scan:#{__id__} " + display.join(", ") + " precursors=#{precursors.inspect}" + " spectrum(size)=#{spec_display}" + " >"
|
44
44
|
end
|
45
45
|
|
46
46
|
# returns the string (space delimited): "ms_level num time [prec_mz prec_inten]"
|
data/lib/mspire.rb
CHANGED
data/lib/sample_enzyme.rb
CHANGED
@@ -23,6 +23,7 @@ class SampleEnzyme
|
|
23
23
|
# For other enzymes, you must set :cut, :no_cut, :name, and :sense
|
24
24
|
# will yield the object if you want to set the values that way
|
25
25
|
def initialize(name=nil)
|
26
|
+
@num_missed_cleavages_regex = nil
|
26
27
|
@sense = nil
|
27
28
|
@cut = nil
|
28
29
|
@no_cut = nil
|
@@ -62,6 +63,44 @@ class SampleEnzyme
|
|
62
63
|
self.new.from_pepxml_node(node)
|
63
64
|
end
|
64
65
|
|
66
|
+
# takes an amino acid sequence (e.g., -.PEPTIDK.L)
|
67
|
+
# returns the number of missed cleavages
|
68
|
+
# Counts the cleavage sites inside +aaseq+ that were left uncut.
#
# A site is a residue listed in @cut; when @no_cut is set (and not empty)
# the site only counts if the residue following it is not listed in @no_cut.
# The last residue of +aaseq+ is never counted.
#
# Every character of +aaseq+ is treated as a residue.
# NOTE(review): presumably callers pass the bare peptide sequence, without
# flanking residues or '.' separators -- confirm against callers.
#
# The regular expression is built on first use and cached in
# @num_missed_cleavages_regex; it is not rebuilt if @cut or @no_cut are
# changed afterwards.
#
# Raises NotImplementedError when sense is 'N'.
def num_missed_cleavages(aaseq)
  raise NotImplementedError, 'need to implement for N terminal sense' if sense == 'N'
  @num_missed_cleavages_regex ||=
    begin
      regex_string = "[#{@cut}]"
      if @no_cut and @no_cut != ''
        # a lookahead checks the following residue without consuming it, so
        # that two adjacent cut residues (e.g. 'KK') are both counted
        regex_string << "(?=[^#{@no_cut}])"
      end
      /#{regex_string}/
    end
  num = aaseq.scan(@num_missed_cleavages_regex).size
  # with no @no_cut the final residue matches on its own; it is the end of
  # the peptide and not a missed site (with the lookahead it never matches)
  num -= 1 if aaseq[-1,1] =~ @num_missed_cleavages_regex
  num
end
|
86
|
+
|
87
|
+
# requires full sequence (with heads and tails)
|
88
|
+
# Returns 0, 1 or 2: the number of peptide ends that agree with this
# enzyme's cleavage rule.
#
# +sequence+ is handed to SpecID::Pep.split_sequence, which yields three
# pieces (presumably the residue before the peptide, the peptide itself and
# the residue after it -- confirm against that helper).
#
# An end counts when its flanking piece is '-', or when the residue on the
# N-terminal side of the junction is in @cut and the residue on the
# C-terminal side is not in @no_cut.
#
# Raises NotImplementedError when sense is 'N'.
def num_tol_term(sequence)
  raise NotImplementedError, 'need to implement for N terminal sense' if sense == 'N'
  blocked = @no_cut || ''
  before, peptide, after = SpecID::Pep.split_sequence(sequence)
  pep_last = peptide[-1,1]
  pep_first = peptide[0,1]

  n_term_ok = ( @cut.include?(before) && !blocked.include?(pep_first) ) || before == '-'
  c_term_ok = ( @cut.include?(pep_last) && !blocked.include?(after) ) || after == '-'

  (n_term_ok ? 1 : 0) + (c_term_ok ? 1 : 0)
end
|
103
|
+
|
65
104
|
# returns all peptides of missed cleavages <= 'missed_cleavages'
|
66
105
|
# so 2 missed cleavages will return all no missed cleavage peptides
|
67
106
|
# all 1 missed cleavages and all 2 missed cleavages.
|
data/lib/spec_id.rb
CHANGED
@@ -7,6 +7,7 @@ require 'spec_id/bioworks'
|
|
7
7
|
require 'spec_id/sequest'
|
8
8
|
require 'spec_id/proph/prot_summary'
|
9
9
|
require 'spec_id_xml'
|
10
|
+
require 'spec_id/sqt'
|
10
11
|
require 'spec_id/mass'
|
11
12
|
require 'fasta'
|
12
13
|
|
@@ -71,6 +72,10 @@ module SpecID
|
|
71
72
|
Proph::ProtSummary.new(file)
|
72
73
|
when 'pepproph'
|
73
74
|
Proph::PepSummary.new(file)
|
75
|
+
when 'sqg'
|
76
|
+
SQTGroup.new(file)
|
77
|
+
when 'sqt'
|
78
|
+
SQTGroup.new([file])
|
74
79
|
else
|
75
80
|
abort "UNRECOGNIZED file type for #{file}"
|
76
81
|
end
|
@@ -447,6 +452,8 @@ module SpecID
|
|
447
452
|
def self.file_type(file)
|
448
453
|
if file =~ /\.srg$/
|
449
454
|
return 'srg'
|
455
|
+
elsif file =~ /\.sqg$/
|
456
|
+
return 'sqg'
|
450
457
|
end
|
451
458
|
if IO.read(file, 7,438) == 'Enzyme:'
|
452
459
|
return 'srf'
|
@@ -461,6 +468,17 @@ module SpecID
|
|
461
468
|
elsif lines =~ /<msms_pipeline_analysis.*<peptideprophet_summary/m
|
462
469
|
return 'pepproph'
|
463
470
|
end
|
471
|
+
# assumes the header of a sqt file is less than 200 lines ...
|
472
|
+
200.times do
|
473
|
+
line = fh.gets
|
474
|
+
if line
|
475
|
+
lines << line
|
476
|
+
else ; break
|
477
|
+
end
|
478
|
+
end
|
479
|
+
if lines =~ /^H\tDatabase/ and lines =~ /^H\tSQTGenerator/
|
480
|
+
return 'sqt'
|
481
|
+
end
|
464
482
|
end
|
465
483
|
end
|
466
484
|
|
data/lib/spec_id/aa_freqs.rb
CHANGED
@@ -3,30 +3,27 @@ require 'fasta'
|
|
3
3
|
module SpecID ; end
|
4
4
|
|
5
5
|
class SpecID::AAFreqs
|
6
|
-
# a fasta object
|
7
|
-
attr_accessor :fasta
|
8
6
|
# hash by capital one-letter amino acid symbols giving the frequency of
|
9
7
|
# seeing that amino acid. Frequencies should add to 1.
|
10
8
|
attr_accessor :aafreqs
|
11
9
|
|
12
10
|
# fasta is fasta object!
|
13
11
|
def initialize(fasta=nil)
|
14
|
-
|
15
|
-
|
16
|
-
@aafreqs = calculate_frequencies(@fasta)
|
12
|
+
if fasta
|
13
|
+
@aafreqs = calculate_frequencies(fasta.prots)
|
17
14
|
end
|
18
15
|
end
|
19
16
|
|
20
|
-
# creates an aafreqs hash
|
21
|
-
def calculate_frequencies(
|
17
|
+
# takes an enumerable of objects responding to :aaseq and creates an aafreqs hash
|
18
|
+
def calculate_frequencies(objs)
|
22
19
|
hash = {}
|
23
20
|
total_aas = 0
|
24
21
|
('A'..'Z').each do |x|
|
25
22
|
hash[x] = 0
|
26
23
|
end
|
27
24
|
hash['*'] = 0
|
28
|
-
|
29
|
-
aaseq =
|
25
|
+
objs.each do |obj|
|
26
|
+
aaseq = obj.aaseq
|
30
27
|
total_aas += aaseq.size
|
31
28
|
aaseq.split('').each do |x|
|
32
29
|
hash[x] += 1
|
data/lib/spec_id/digestor.rb
CHANGED
@@ -100,38 +100,37 @@ class Digestor
|
|
100
100
|
# The prot_aaseq is used if the mass_hash contains the keys
|
101
101
|
# :add_C_term_protein or :add_N_term_protein
|
102
102
|
#
|
103
|
+
# mass_hash requires the key :h_plus or :h depending on h_plus option.
|
103
104
|
# prot_aaseqs is parallel to pep_aaseqs_ar where each is a group of
|
104
105
|
# peptides matching a protein aaseq
|
105
|
-
# returns another parallel array of passing
|
106
|
+
# returns another parallel array of passing peptides per protein
|
106
107
|
def limit_sizes(prot_aaseqs, pep_aaseqs_ar, min_mh, max_mh, mass_hash, h_plus=false)
|
107
108
|
if mass_hash.key?(:add_C_term_protein) or mass_hash.key?(:add_N_term_protein)
|
108
109
|
raise NotImplementedError, "need to add ability to change weights of peptides from the ends of proteins"
|
109
110
|
else
|
110
111
|
# figure out how much must be added to each peptide
|
111
112
|
# include the h2o, the h, and N and C terminal static mods
|
112
|
-
|
113
|
-
|
113
|
+
h_plus_key = h_plus ? :h_plus : :h
|
114
|
+
extra_add = mass_hash[h_plus_key]
|
114
115
|
[:add_N_term_peptide, :add_C_term_peptide].each do |sym|
|
115
116
|
if mass_hash.key?(sym)
|
116
|
-
|
117
|
+
extra_add += mass_hash[sym]
|
117
118
|
end
|
118
119
|
end
|
119
|
-
|
120
|
-
|
120
|
+
mc = Mass::Calculator.new(mass_hash, extra_add)
|
121
|
+
|
122
|
+
masses_per_group = pep_aaseqs_ar.map do |pep_aaseqs|
|
123
|
+
mc.masses(pep_aaseqs)
|
124
|
+
end
|
121
125
|
|
122
|
-
pep_aaseqs_ar.map do
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
puts 'NOT FOUND'
|
128
|
-
p let
|
129
|
-
end
|
130
|
-
sum += hash_by_aa_string[let]
|
126
|
+
masses_per_group.zip(pep_aaseqs_ar).map do |masses, aaseqs|
|
127
|
+
passing = []
|
128
|
+
aaseqs.zip(masses) do |aaseq, mh_plus|
|
129
|
+
if ( (mh_plus >= min_mh) and (mh_plus <= max_mh) )
|
130
|
+
passing << aaseq
|
131
131
|
end
|
132
|
-
mh_plus = sum + final_add
|
133
|
-
( (mh_plus >= min_mh) and (mh_plus <= max_mh) )
|
134
132
|
end
|
133
|
+
passing
|
135
134
|
end
|
136
135
|
end
|
137
136
|
end
|
data/lib/spec_id/mass.rb
CHANGED
@@ -29,13 +29,13 @@ class Mass
|
|
29
29
|
:U => 150.95364, # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
|
30
30
|
:X => 118.805716, # the average of the mono masses of the 20 amino acids
|
31
31
|
:* => 118.805716, # same as X
|
32
|
+
:Z => (129.04259 + 128.05858) / 2, # average glutamic acid and glutamine
|
32
33
|
|
33
34
|
# elements etc.
|
34
35
|
:h => 1.00783,
|
35
36
|
:h_plus => 1.00728,
|
36
37
|
:o => 15.9949146,
|
37
38
|
:h2o => 18.01056,
|
38
|
-
|
39
39
|
}
|
40
40
|
AVG = {
|
41
41
|
:A => 71.0788,
|
@@ -64,6 +64,7 @@ class Mass
|
|
64
64
|
:U => 150.03, # (selenocysteine) http://www.matrix-science.com/help/aa_help.html
|
65
65
|
:X => 118.88603, # the average of the masses of the 20 amino acids
|
66
66
|
:* => 118.88603, # same as X
|
67
|
+
:Z => (129.1155+ 128.1307) / 2, # average glutamic acid and glutamine
|
67
68
|
|
68
69
|
# elements etc.
|
69
70
|
:h => 1.00794,
|
@@ -112,5 +113,66 @@ class Mass
|
|
112
113
|
end
|
113
114
|
copy_hash
|
114
115
|
end
|
116
|
+
|
117
|
+
# returns an array of masses parallel to array passed in
|
118
|
+
# If you want the mass with H+, then pass in the mass as h_plus
|
119
|
+
# The mass hash must respond to
|
120
|
+
# :h2o (water)
|
121
|
+
# and at least the twenty amino acids (by string or symbol)
|
122
|
+
# The mass hash may respond to :add_N_term_peptide or :add_C_term_peptide
|
123
|
+
# in which case these will be added to the final mass
|
124
|
+
# Computes one mass per amino acid sequence.
#
# aaseqs::    enumerable of amino acid strings
# mass_hash:: lookup of residue masses, keyed by string or symbol; must
#             answer to :h2o and may carry :add_N_term_peptide and
#             :add_C_term_peptide, which are added to every mass
# h_plus::    extra mass added to every sequence (e.g. the mass of an H+)
#
# Returns an array of masses parallel to aaseqs: the sum of the residue
# masses plus water, h_plus and any terminal additions.
#
# Raises ArgumentError if a sequence contains a letter missing from
# mass_hash.
def self.masses(aaseqs, mass_hash=Mass::MONO, h_plus=0.0)
  final_add = mass_hash[:h2o] + h_plus
  [:add_N_term_peptide, :add_C_term_peptide].each do |sym|
    final_add += mass_hash[sym] if mass_hash.key?(sym)
  end

  # sequences are split into one-letter strings, so key the lookup by string
  hash_by_aa_string = {}
  mass_hash.each {|k,v| hash_by_aa_string[k.to_s] = v }

  aaseqs.map do |aaseq|
    sum = 0.0
    aaseq.split('').each do |let|
      unless hash_by_aa_string.key?(let)
        raise ArgumentError, "no mass for residue '#{let}' in mass_hash"
      end
      sum += hash_by_aa_string[let]
    end
    sum + final_add
  end
end
|
142
|
+
|
143
|
+
|
144
|
+
end
|
145
|
+
|
146
|
+
class Mass::Calculator

  # mass_hash supplies the residue masses and must answer to :h2o or 'h2o';
  # the water mass stands for the two ends of the peptide. add_extra is
  # added on top of that for every sequence (e.g., the mass of an H+).
  def initialize(mass_hash, add_extra=0.0)
    @mass_hash = mass_hash_to_s(mass_hash)
    @final_add = @mass_hash['h2o'] + add_extra
  end

  # returns a new hash holding the same values under the string form of
  # each key
  def mass_hash_to_s(mass_hash)
    mass_hash.inject({}) do |by_string, (key, mass)|
      by_string[key.to_s] = mass
      by_string
    end
  end

  # returns an array of masses parallel to aaseqs. Each mass starts from the
  # constant worked out in initialize and then adds the residues in order.
  # Aborts the program if a letter is not a key of the mass hash.
  def masses(aaseqs)
    aaseqs.map do |aaseq|
      aaseq.split('').inject(@final_add) do |total, residue|
        abort "LETTER not found in mass_hash: #{residue}" unless @mass_hash.key?(residue)
        total + @mass_hash[residue]
      end
    end
  end

end
|
116
178
|
|