mspire 0.5.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +24 -0
- data/Rakefile +51 -0
- data/VERSION +1 -0
- data/lib/cv/description.rb +18 -0
- data/lib/cv/param.rb +33 -0
- data/lib/cv.rb +3 -0
- data/lib/io/bookmark.rb +13 -0
- data/lib/merge.rb +7 -0
- data/lib/ms/cvlist.rb +76 -0
- data/lib/ms/digester.rb +245 -0
- data/lib/ms/fasta.rb +86 -0
- data/lib/ms/ident/peptide/db.rb +243 -0
- data/lib/ms/ident/peptide.rb +72 -0
- data/lib/ms/ident/peptide_hit/qvalue.rb +56 -0
- data/lib/ms/ident/peptide_hit.rb +26 -0
- data/lib/ms/ident/pepxml/modifications.rb +83 -0
- data/lib/ms/ident/pepxml/msms_pipeline_analysis.rb +70 -0
- data/lib/ms/ident/pepxml/msms_run_summary.rb +82 -0
- data/lib/ms/ident/pepxml/parameters.rb +14 -0
- data/lib/ms/ident/pepxml/sample_enzyme.rb +165 -0
- data/lib/ms/ident/pepxml/search_database.rb +49 -0
- data/lib/ms/ident/pepxml/search_hit/modification_info.rb +79 -0
- data/lib/ms/ident/pepxml/search_hit.rb +144 -0
- data/lib/ms/ident/pepxml/search_result.rb +35 -0
- data/lib/ms/ident/pepxml/search_summary.rb +92 -0
- data/lib/ms/ident/pepxml/spectrum_query.rb +85 -0
- data/lib/ms/ident/pepxml.rb +112 -0
- data/lib/ms/ident/protein.rb +33 -0
- data/lib/ms/ident/protein_group.rb +80 -0
- data/lib/ms/ident/search.rb +114 -0
- data/lib/ms/ident.rb +37 -0
- data/lib/ms/isotope/aa.rb +59 -0
- data/lib/ms/mascot.rb +6 -0
- data/lib/ms/mass/aa.rb +79 -0
- data/lib/ms/mass.rb +55 -0
- data/lib/ms/mzml/index_list.rb +98 -0
- data/lib/ms/mzml/plms1.rb +34 -0
- data/lib/ms/mzml.rb +197 -0
- data/lib/ms/obo.rb +38 -0
- data/lib/ms/plms1.rb +156 -0
- data/lib/ms/quant/qspec/protein_group_comparison.rb +22 -0
- data/lib/ms/quant/qspec.rb +112 -0
- data/lib/ms/spectrum.rb +154 -8
- data/lib/ms.rb +3 -10
- data/lib/msplat.rb +2 -0
- data/lib/obo/ims.rb +5 -0
- data/lib/obo/ms.rb +7 -0
- data/lib/obo/ontology.rb +41 -0
- data/lib/obo/unit.rb +5 -0
- data/lib/openany.rb +23 -0
- data/lib/write_file_or_string.rb +18 -0
- data/obo/ims.obo +562 -0
- data/obo/ms.obo +11677 -0
- data/obo/unit.obo +2563 -0
- data/spec/ms/cvlist_spec.rb +60 -0
- data/spec/ms/digester_spec.rb +351 -0
- data/spec/ms/fasta_spec.rb +100 -0
- data/spec/ms/ident/peptide/db_spec.rb +108 -0
- data/spec/ms/ident/pepxml/sample_enzyme_spec.rb +181 -0
- data/spec/ms/ident/pepxml/search_hit/modification_info_spec.rb +37 -0
- data/spec/ms/ident/pepxml_spec.rb +442 -0
- data/spec/ms/ident/protein_group_spec.rb +68 -0
- data/spec/ms/mass_spec.rb +8 -0
- data/spec/ms/mzml/index_list_spec.rb +122 -0
- data/spec/ms/mzml/plms1_spec.rb +62 -0
- data/spec/ms/mzml_spec.rb +50 -0
- data/spec/ms/plms1_spec.rb +38 -0
- data/spec/ms/quant/qspec_spec.rb +25 -0
- data/spec/msplat_spec.rb +24 -0
- data/spec/obo_spec.rb +25 -0
- data/spec/spec_helper.rb +25 -0
- data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta +69 -0
- data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -0
- data/spec/testfiles/ms/mzml/j24z.idx_comp.3.mzML +271 -0
- data/spec/testfiles/ms/mzml/openms.noidx_nocomp.12.mzML +330 -0
- data/spec/testfiles/ms/quant/kill_extra_tabs.rb +13 -0
- data/spec/testfiles/ms/quant/max_quant_output.provenance.txt +15 -0
- data/spec/testfiles/ms/quant/max_quant_output.txt +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.txt +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.txt_qspecgp +0 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.CSV.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.txt +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt +134 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt_qspecgp +134 -0
- data/spec/testfiles/ms/quant/remove_rest_of_proteins.rb +13 -0
- data/spec/testfiles/ms/quant/unlog_transform.rb +13 -0
- data/spec/testfiles/plms1/output.key +0 -0
- metadata +157 -40
- data/README +0 -77
- data/changelog.txt +0 -196
- data/lib/ms/calc.rb +0 -32
- data/lib/ms/data/interleaved.rb +0 -60
- data/lib/ms/data/lazy_io.rb +0 -73
- data/lib/ms/data/lazy_string.rb +0 -15
- data/lib/ms/data/simple.rb +0 -59
- data/lib/ms/data/transposed.rb +0 -41
- data/lib/ms/data.rb +0 -57
- data/lib/ms/format/format_error.rb +0 -12
- data/lib/ms/support/binary_search.rb +0 -126
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
require 'optparse'
require 'yaml'

require 'ms/digester'
require 'ms/fasta'
|
|
4
|
+
|
|
5
|
+
module MS ; end
|
|
6
|
+
module MS::Ident ; end
|
|
7
|
+
module MS::Ident::Peptide ; end
|
|
8
|
+
|
|
9
|
+
# the object itself is a modified Hash.
|
|
10
|
+
# It is initialized with the database file and a protein array can be
|
|
11
|
+
# retrieved with the #[] method given an amino acid sequence. All other
|
|
12
|
+
# methods are untested at this time and should be avoided!
|
|
13
|
+
class MS::Ident::Peptide::Db < Hash
|
|
14
|
+
MAX_NUM_AA_EXPANSION = 3
|
|
15
|
+
|
|
16
|
+
# the twenty standard amino acids
|
|
17
|
+
STANDARD_AA = %w(A C D E F G H I K L M N P Q R S T V W Y)
|
|
18
|
+
|
|
19
|
+
DEFAULT_PEPTIDE_CENTRIC_DB = {:missed_cleavages => 2, :min_length => 4, :enzyme => MS::Digester[:trypsin], :id_regexp => nil, :remove_digestion_file => true, :cleave_initiator_methionine => true, :expand_aa => {'X' => STANDARD_AA}}
|
|
20
|
+
|
|
21
|
+
PROTEIN_DELIMITER = "\t"
|
|
22
|
+
KEY_VALUE_DELIMITER = ": "
|
|
23
|
+
|
|
24
|
+
# Command-line front end for peptide_centric_db.
#
# argv:: array of command-line arguments; recognized options are removed
#        from it (OptionParser#parse!) and every remaining entry is taken
#        to be a fasta file.
#
# returns an array holding the value of peptide_centric_db for each fasta
# file.  Prints the usage and exits when no fasta file is left in argv.
def self.cmdline(argv)
  # only what the user may override; every other setting is filled in from
  # DEFAULT_PEPTIDE_CENTRIC_DB by peptide_centric_db
  opt = {
    :remove_digestion_file => true,
    :enzyme => MS::Digester[:trypsin]
  }
  # opt holds no :missed_cleavages/:min_length yet, so the defaults shown in
  # the usage have to come from DEFAULT_PEPTIDE_CENTRIC_DB
  defaults = DEFAULT_PEPTIDE_CENTRIC_DB
  opts = OptionParser.new do |op|
    op.banner = "usage: #{File.basename($0)} <file>.fasta ..."
    op.separator "output: "
    op.separator " <file>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml"
    op.separator "format:"
    op.separator " PEPTIDE: ID1<tab>ID2<tab>ID3..."
    op.separator ""
    op.separator " Initiator Methionines - by default, will generate two peptides"
    op.separator " for any peptide found at the N-termini starting with 'M'"
    op.separator " (i.e., one with and one without the leading methionine)"
    op.separator ""
    op.on("--missed-cleavages <#{defaults[:missed_cleavages]}>", Integer, "max num of missed cleavages") {|v| opt[:missed_cleavages] = v }
    op.on("--min-length <#{defaults[:min_length]}>", Integer, "the minimum peptide aaseq length") {|v| opt[:min_length] = v }
    op.on("--no-cleaved-methionine", "does not cleave off initiator methionine") { opt[:cleave_initiator_methionine] = false }
    op.on("--no-expand-x", "don't enumerate aa 'X' possibilities") { opt[:expand_aa] = nil }
    op.on("-e", "--enzyme <name>", "enzyme for digestion") do |v|
      # looked up the same way as the :trypsin default above.
      # NOTE(review): assumes MS::Digester[] accepts the symbol form of the
      # names printed by --list-enzymes -- confirm against ms/digester
      opt[:enzyme] = MS::Digester[v.to_sym] || raise(OptionParser::InvalidArgument, v)
    end
    op.on("--list-enzymes", "lists approved enzymes and exits") do
      puts MS::Digester::ENZYMES.keys.join("\n")
      exit
    end
  end

  opts.parse!(argv)

  # nothing to digest: show the usage and stop
  if argv.empty?
    puts opts
    exit
  end

  argv.map do |file|
    MS::Ident::Peptide::Db.peptide_centric_db(file, opt)
  end
end
|
|
62
|
+
|
|
63
|
+
# Digests the proteins of +fasta_file+ and writes a peptide centric db: one
# line per peptide, "PEPTIDE<KEY_VALUE_DELIMITER>ID1<PROTEIN_DELIMITER>ID2...".
#
# Works in two passes:
# 1. a temporary digestion file ("<base>.msd_clvg<missed_cleavages>.peptides")
#    receives, per protein, the first word of its header and all of its
#    peptides (min_length is NOT applied to this file)
# 2. that file is regrouped by peptide, peptides shorter than min_length are
#    dropped, and the result is written to
#    "<base>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml"
#
# opts are merged over DEFAULT_PEPTIDE_CENTRIC_DB.  Options read here:
# :missed_cleavages, :min_length, :enzyme (must respond to
# digest(sequence, missed_cleavages)), :remove_digestion_file,
# :cleave_initiator_methionine and :expand_aa (nil or empty for no
# expansion).  :id_regexp is accepted but not used by this method.
#
# returns the full name of the written file.
def self.peptide_centric_db(fasta_file, opts={})
  opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)

  (missed_cleavages, min_length, enzyme, remove_digestion_file, cleave_initiator_methionine, expand_aa) = opts.values_at(:missed_cleavages, :min_length, :enzyme, :remove_digestion_file, :cleave_initiator_methionine, :expand_aa)
  # an empty mapping would build the invalid regexp "[]"
  expand_aa = nil if expand_aa && expand_aa.empty?

  start_time = Time.now
  print "Digesting #{fasta_file} ..." if $VERBOSE

  letters_to_expand_re = Regexp.new("[#{Regexp.escape(expand_aa.keys.join)}]") if expand_aa

  base = fasta_file.chomp(File.extname(fasta_file))
  digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides"
  File.open(digestion_file, "w") do |fh|
    MS::Fasta.open(fasta_file) do |fasta|
      fasta.each do |prot|
        peptides = enzyme.digest(prot.sequence, missed_cleavages)
        if cleave_initiator_methionine && (prot.sequence[0,1] == "M")
          # every peptide found at the very beginning of the protein gets a
          # twin lacking the leading methionine
          n_terminal = peptides.select {|pep| prot.sequence[0,pep.size] == pep }
          peptides.concat(n_terminal.map {|pep| pep[1..-1] })
        end
        if expand_aa
          # expand_peptides answers nil for a peptide with too many letters
          # to expand; such peptides are left out (compact)
          peptides = peptides.map do |pep|
            pep =~ letters_to_expand_re ? expand_peptides(pep, expand_aa) : pep
          end.flatten.compact
        end
        fh.puts( prot.header.split(/\s+/).first + "\t" + peptides.join(" ") )
      end
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE

  start_time = Time.now
  print "Organizing raw digestion #{digestion_file} ..." if $VERBOSE

  hash = Hash.new {|h,k| h[k] = [] }
  # ::IO, because a bare IO would be the nested Db::IO class
  ::IO.foreach(digestion_file) do |line|
    # chomp (not chomp!): chomp! answers nil for a line without a newline
    (prot, *peps) = line.chomp.split(/\s+/)
    # prot is something like this: "sp|P31946|1433B_HUMAN" in uniprot
    peps.each do |pep|
      hash[pep] << prot if pep.size >= min_length
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE

  base = digestion_file.chomp(File.extname(digestion_file))
  final_outfile = base + ".min_aaseq#{min_length}" + ".yml"

  start_time = Time.now
  print "Writing #{hash.size} peptides to #{final_outfile} ..." if $VERBOSE

  File.open(final_outfile, 'w') do |out|
    hash.each do |pep, prots|
      out.puts( [pep, prots.join(PROTEIN_DELIMITER)].join(KEY_VALUE_DELIMITER) )
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE

  File.unlink(digestion_file) if remove_digestion_file
  File.expand_path(final_outfile)
end
|
|
145
|
+
|
|
146
|
+
# does combinatorial expansion of all letters requesting it.
#
# peptide::   the amino acid sequence (a String; it is not modified)
# expand_aa:: hash like {'X'=>STANDARD_AA}: letter to expand => replacements
#
# returns an array with one peptide per combination of replacements
# returns [peptide] if the peptide holds none of the letters to expand
# returns nil if there are more than MAX_NUM_AA_EXPANSION amino acids to
# be expanded
def self.expand_peptides(peptide, expand_aa)
  # [index, letter] of every position that has to be expanded
  positions = []
  peptide.each_char.with_index do |char, i|
    positions << [i, char] if expand_aa.key?(char)
  end
  return nil if positions.size > MAX_NUM_AA_EXPANSION

  # one position at a time: every peptide built so far is replaced by its
  # variants at that position
  positions.inject([peptide]) do |expanded, (i, letter)|
    expanded.flat_map do |pep|
      expand_aa[letter].map {|aa| variant = pep.dup ; variant[i] = aa ; variant }
    end
  end
end
|
|
172
|
+
|
|
173
|
+
# db_file:: name of a peptide centric db file; it is parsed with
#           YAML.load_file and the parsed content becomes the content of
#           this hash.
# NOTE(review): YAML may turn some peptide-like keys into non-strings (e.g.
# "FALSE") -- confirm whether that matters for the databases in use.
def initialize(db_file)
  entries = YAML.load_file(db_file)
  replace(entries)
end
|
|
176
|
+
|
|
177
|
+
alias_method :old_bracket, '[]'.to_sym
|
|
178
|
+
|
|
179
|
+
# returns the protein id's as an array.
# returns nil if the peptide is not in the db (as Db::IO#[] does) instead of
# failing on the missing value.
def [](key)
  proteins = old_bracket(key)
  proteins && proteins.chomp.split(PROTEIN_DELIMITER)
end
|
|
183
|
+
|
|
184
|
+
# an object for on disk retrieval of db entries
# proteins are returned as an array.
# behaves much like a hash once it is opened.
#
# Only an index (key => position of its value in the file) is kept in
# memory; the protein lists are read from the io on demand.
class IO
  include Enumerable

  # opens +filename+ and calls the block with a Db::IO on top of it; the
  # file is closed when the block is done.
  # returns the value of the block; raises ArgumentError without a block.
  def self.open(filename, &block)
    raise ArgumentError unless block
    File.open(filename) do |io|
      block.call(self.new(io))
    end
  end

  # the underlying io object
  attr_accessor :io
  # hash of key => [byte position of the value, number of bytes up to and
  # including the end of the line]
  attr_accessor :index

  # reads +io+ once, from its current position to the end, to build the
  # index.  Lines that do not start with "<word characters><KEY_VALUE_DELIMITER>"
  # are skipped.
  def initialize(io)
    @io = io
    @index = {}
    re = /^(\w+)#{Regexp.escape(KEY_VALUE_DELIMITER)}/
    line_start = io.pos
    io.each_line do |line|
      line_end = io.pos
      if (md = re.match(line))
        key = md[1]
        # the value begins right behind "KEY" + delimiter
        value_start = line_start + key.bytesize + KEY_VALUE_DELIMITER.bytesize
        @index[key] = [value_start, line_end - value_start]
      end
      line_start = line_end
    end
  end

  # returns an array of proteins for the given key (peptide aaseq)
  # returns nil if the key is not in the index
  def [](key)
    (start, length) = @index[key]
    return nil unless start
    @io.seek(start)
    @io.read(length).chomp.split(PROTEIN_DELIMITER)
  end

  # number of entries
  def size ; @index.size end
  alias_method :length, :size

  # all the peptide aaseqs
  def keys
    @index.keys
  end

  # all the protein lists
  def values
    keys.map {|key| self[key] }
  end

  # yields a pair of aaseq and protein array
  # returns an enumerator if no block is given
  def each(&block)
    return to_enum(:each) unless block
    @index.each_key do |key|
      block.call([key, self[key]])
    end
  end
end
|
|
243
|
+
end
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
module MS ; end
module MS::Ident ; end

# A 'sequence' is a notation of a peptide that includes the leading and
# trailing amino acid after cleavage (e.g., K.PEPTIDER.E or -.STARTK.L )
# and may contain post-translational modification information.
#
# 'aaseq' is the amino acid sequence of just the peptide with no leading or
# trailing notation (e.g., PEPTIDER or LAKKLY)
module MS::Ident::Peptide
  # matches every character that is not an uppercase letter, '.' or '-'
  Nonstandard_AA_re = /[^A-Z\.\-]/

  class << self

    # Takes a peptide sequence of the form '-.PEPTIDE.R', removes the
    # characters matched by Nonstandard_AA_re, and returns the center piece.
    #
    # With only two pieces the one longer than a single character is
    # preferred (first piece if it has more than one character, else the
    # second); a sequence without any '.' is returned as it is after the
    # removal.
    #
    # NOTE: a sequence that is empty after the removal, or has more than
    # three pieces, ends the process through Kernel#abort.
    def sequence_to_aaseq(sequence)
      pieces = remove_non_amino_acids(sequence).split('.')
      case pieces.size
      when 3
        pieces[1]
      when 2
        # N termini : C termini
        pieces[0].size > 1 ? pieces[0] : pieces[1]
      when 1 ## this must be a parse error!
        pieces[0] ## which is the peptide itself
      else
        abort "bad peptide sequence: #{sequence.inspect}"
      end
    end

    # removes the characters specified by Nonstandard_AA_re and returns the
    # result as a new string
    def remove_non_amino_acids(sequence)
      sequence.gsub(Nonstandard_AA_re, '')
    end

    # remove non amino acids and split the sequence (see split_sequence)
    def prepare_sequence(sequence)
      split_sequence(remove_non_amino_acids(sequence))
    end

    # Returns prev, peptide, next from sequence. Parse errors (no '.' at
    # all, nothing left, or more than three pieces) return nil,nil,nil
    #   R.PEPTIDE.A  # -> R, PEPTIDE, A
    #   R.PEPTIDE.-  # -> R, PEPTIDE, -
    #   PEPTIDE.A    # -> -, PEPTIDE, A
    #   A.PEPTIDE    # -> A, PEPTIDE, -
    #   PEPTIDE      # -> nil,nil,nil
    def split_sequence(sequence)
      pieces = sequence.split('.')
      case pieces.size
      when 3
        pieces
      when 2
        if pieces[0].size > 1 ## N termini
          ['-', pieces[0], pieces[1]]
        else ## C termini
          [pieces[0], pieces[1], '-']
        end
      else
        # every other size is a parse error
        [nil,nil,nil]
      end
    end
  end
end
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
require 'ms/ident/search'
|
|
2
|
+
require 'ms/ident/peptide_hit'
|
|
3
|
+
|
|
4
|
+
module MS ; end
module MS::Ident ; end

class MS::Ident::PeptideHit
  # Writes and reads peptide hits, together with their q-values, as tab
  # delimited text (one header line, then one line per hit).
  module Qvalue
    FILE_EXTENSION = '.phq.tsv'
    FILE_DELIMITER = "\t"
    HEADER = %w(run_id id aaseq charge qvalue)

    class << self

      # writes to the file, adding an extension (FILE_EXTENSION) to base
      # returns the filename (see to_file).
      def to_phq(base, hits, qvalues=[])
        to_file(base + FILE_EXTENSION, hits, qvalues)
      end

      # writes the peptide hits to a phq.tsv file.  qvalues is a parallel array
      # to hits that can provide qvalues if not inherent to the hits (a nil
      # or missing entry falls back on the qvalue of the hit)
      # returns the filename.
      def to_file(filename, hits, qvalues=[])
        File.open(filename,'w') do |out|
          out.puts(HEADER.join(FILE_DELIMITER))
          hits.zip(qvalues) do |hit, qvalue|
            out.puts([hit.search.id, hit.id, hit.aaseq, hit.charge, qvalue || hit.qvalue].join(FILE_DELIMITER))
          end
        end
        filename
      end

      # returns an array of PeptideHit objects from a phq.tsv
      # Hits with the same run_id share one MS::Ident::Search object.
      # raises a RuntimeError if the first line is not HEADER; empty lines
      # are skipped.
      def from_file(filename)
        searches = Hash.new {|h,id| h[id] = MS::Ident::Search.new(id) }
        peptide_hits = []
        File.open(filename) do |io|
          header = io.readline.chomp.split(FILE_DELIMITER)
          raise "bad headers" unless header == HEADER
          io.each_line do |line|
            line = line.chomp
            next if line.empty?
            (run_id, id, aaseq, charge, qvalue) = line.split(FILE_DELIMITER)
            # PeptideHit#initialize takes a hash of attributes; it is
            # handed an empty one and the attributes are set one by one
            ph = MS::Ident::PeptideHit.new({})
            ph.search = searches[run_id]
            ph.id = id
            ph.aaseq = aaseq
            ph.charge = charge.to_i
            ph.qvalue = qvalue.to_f
            peptide_hits << ph
          end
        end
        peptide_hits
      end

      alias_method :from_phq, :from_file

    end
  end # Qvalue
end # Peptide Hit
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
require 'merge'
|
|
2
|
+
|
|
3
|
+
module MS ; end
|
|
4
|
+
module MS::Ident ; end
|
|
5
|
+
|
|
6
|
+
# The readable and writable attributes of a peptide hit, for mixing into
# any class that represents one.
module MS::Ident::PeptideHitLike
  attr_accessor :id, :search, :missed_cleavages, :aaseq, :charge
  # an array of MS::Ident::ProteinLike objects
  attr_accessor :proteins
  # relative to the set the hit is contained in!
  attr_accessor :qvalue
end
|
|
17
|
+
|
|
18
|
+
# A peptide hit carrying the attributes of MS::Ident::PeptideHitLike.
class MS::Ident::PeptideHit
  include MS::Ident::PeptideHitLike
  include Merge

  # hash:: attributes handed to Merge#merge!.  It defaults to an empty
  #        hash so that a hit can be created first and filled in through
  #        its accessors afterwards.
  def initialize(hash={})
    merge!(hash)
  end
end
|
|
26
|
+
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
require 'merge'
|
|
2
|
+
require 'nokogiri'
|
|
3
|
+
|
|
4
|
+
module MS ; end
|
|
5
|
+
module MS::Ident ; end
|
|
6
|
+
class MS::Ident::Pepxml ; end
|
|
7
|
+
|
|
8
|
+
# Modified aminoacid, static or variable
# unless otherwise stated, all attributes can be anything
class MS::Ident::Pepxml::AminoacidModification
  include Merge
  # The amino acid (one letter code)
  attr_accessor :aminoacid
  # Mass difference with respect to unmodified aminoacid, as a Float
  attr_accessor :massdiff
  # Mass of modified aminoacid, Float
  attr_accessor :mass
  # Y if both modified and unmodified aminoacid could be present in the
  # dataset, N if only modified aminoacid can be present
  attr_accessor :variable
  # whether modification can reside only at protein terminus (specified 'n',
  # 'c', or 'nc')
  attr_accessor :peptide_terminus
  # Symbol used by search engine to designate this modification
  attr_accessor :symbol
  # 'Y' if each peptide must have only modified or unmodified aminoacid, 'N' if a
  # peptide may contain both modified and unmodified aminoacid
  attr_accessor :binary

  # hash:: attribute values, handed to Merge#merge!
  def initialize(hash={})
    merge!(hash)
  end

  # Adds an aminoacid_modification element holding every attribute that is
  # set (not nil/false).
  # returns the builder or an xml string if no builder supplied
  def to_xml(builder=nil)
    xmlb = builder || Nokogiri::XML::Builder.new
    attrs = [:aminoacid, :massdiff, :mass, :variable, :peptide_terminus, :symbol, :binary].map {|at| v=send(at) ; [at,v] if v }.compact
    hash = Hash[attrs]
    # note massdiff: must begin with either + (nonnegative) or - [e.g.
    # +1.05446 or -2.3342] consider Numeric#to_plus_minus_string in
    # MS::Ident::Pepxml
    # (only converted when present: an unset massdiff is simply left out)
    hash[:massdiff] = hash[:massdiff].to_plus_minus_string if hash.key?(:massdiff)
    xmlb.aminoacid_modification(hash)
    builder || xmlb.doc.root.to_xml
  end
end
|
|
47
|
+
|
|
48
|
+
# Modified terminus, static or variable
class MS::Ident::Pepxml::TerminalModification
  include Merge
  # n for N-terminus, c for C-terminus
  attr_accessor :terminus
  # Mass difference with respect to unmodified terminus
  attr_accessor :massdiff
  # Mass of modified terminus
  attr_accessor :mass
  # Y if both modified and unmodified terminus could be present in the
  # dataset, N if only modified terminus can be present
  attr_accessor :variable
  # Special symbol used by search engine to designate this modification
  attr_accessor :symbol
  # whether modification can reside only at protein terminus (specified n or
  # c)
  attr_accessor :protein_terminus
  attr_accessor :description

  # hash:: attribute name => value; each pair is assigned through the
  #        matching writer
  def initialize(hash={})
    hash.each {|k,v| send("#{k}=", v) }
  end

  # Adds a terminal_modification element holding every listed attribute
  # that is set (not nil/false).
  # NOTE(review): :symbol is not among the attributes written -- confirm
  # whether that is intended.
  # returns the builder or an xml string if no builder supplied
  def to_xml(builder=nil)
    xmlb = builder || Nokogiri::XML::Builder.new
    # compact: unset attributes yield nil entries that Hash[] must not see
    attrs = [:terminus, :massdiff, :mass, :variable, :protein_terminus, :description].map {|at| v=send(at) ; [at,v] if v }.compact
    hash = Hash[attrs]
    # massdiff is written with a leading + or -; an unset massdiff is
    # simply left out
    hash[:massdiff] = hash[:massdiff].to_plus_minus_string if hash.key?(:massdiff)
    xmlb.terminal_modification(hash)
    builder || xmlb.doc.root.to_xml
  end
end
|
|
82
|
+
|
|
83
|
+
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
require 'merge'
|
|
2
|
+
|
|
3
|
+
require 'ms/ident/pepxml/msms_run_summary'
|
|
4
|
+
|
|
5
|
+
module MS ; end
|
|
6
|
+
module MS::Ident ; end
|
|
7
|
+
class MS::Ident::Pepxml; end
|
|
8
|
+
|
|
9
|
+
# The root element (msms_pipeline_analysis) of a pepXML document.
class MS::Ident::Pepxml::MsmsPipelineAnalysis
  include Merge
  XMLNS = "http://regis-web.systemsbiology.net/pepXML"
  XMLNS_XSI = "http://www.w3.org/2001/XMLSchema-instance"
  # (this doesn't actually exist), also, the space is supposed to be there
  XSI_SCHEMA_LOCATION_BASE = "http://regis-web.systemsbiology.net/pepXML /tools/bin/TPP/tpp/schema/pepXML_v"
  # the only additions from v18 that concern a writer are to the 'spectrum': retention_time_sec and activationMethodType
  PEPXML_VERSION = 115

  #include SpecIDXML
  # Version 1.2.3
  #attr_writer :date
  #attr_writer :xmlns, :xmlns_xsi, :xsi_schemaLocation
  #attr_accessor :summary_xml

  attr_accessor :xmlns
  attr_accessor :xmlns_xsi
  # overrides the location otherwise derived from pepxml_version (see the
  # xsi_schema_location reader)
  attr_writer :xsi_schema_location
  # an Integer
  attr_accessor :pepxml_version
  # self referential path to the outputfile
  attr_accessor :summary_xml
  attr_accessor :msms_run_summary
  attr_writer :date

  # what a block given to new receives: a new MsmsRunSummary, which also
  # becomes the msms_run_summary of this object
  def block_arg
    @msms_run_summary = MS::Ident::Pepxml::MsmsRunSummary.new
  end

  # hash:: attribute values; hash and block are handed to Merge#merge!
  # if block given, yields a new msms_run_summary to return value of block
  def initialize(hash={}, &block)
    @xmlns = XMLNS
    @xmlns_xsi = XMLNS_XSI
    @pepxml_version = PEPXML_VERSION
    merge!(hash, &block)
  end

  # returns the location that was assigned, else the location based on the
  # pepxml version number
  def xsi_schema_location
    @xsi_schema_location || (XSI_SCHEMA_LOCATION_BASE + pepxml_version.to_s + '.xsd')
  end

  # if no date string given, then it will answer the current time
  # (zero padded, e.g. 2011-03-07T09:05:03)
  def date
    @date || Time.now.strftime('%Y-%m-%dT%H:%M:%S')
  end

  # Adds the msms_pipeline_analysis element (and the msms_run_summary
  # inside it, if there is one).
  # NOTE(review): xmlns_xsi is never written although an xsi: prefixed
  # attribute is -- confirm whether 'xmlns:xsi' should be emitted as well.
  # returns the builder or an xml string if no builder supplied
  def to_xml(builder=nil)
    xmlb = builder || Nokogiri::XML::Builder.new
    xmlb.msms_pipeline_analysis(:date => date, :xmlns => xmlns, 'xsi:schemaLocation'.to_sym => xsi_schema_location, :summary_xml => summary_xml) do |xml|
      msms_run_summary.to_xml(xml) if msms_run_summary
    end
    builder || xmlb.doc.root.to_xml
  end
end
|
|
69
|
+
|
|
70
|
+
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
require 'merge'
|
|
2
|
+
require 'nokogiri'
|
|
3
|
+
|
|
4
|
+
require 'ms/ident/pepxml/sample_enzyme'
|
|
5
|
+
require 'ms/ident/pepxml/search_summary'
|
|
6
|
+
require 'ms/ident/pepxml/spectrum_query'
|
|
7
|
+
|
|
8
|
+
module MS ; end
|
|
9
|
+
module MS::Ident ; end
|
|
10
|
+
class MS::Ident::Pepxml; end
|
|
11
|
+
|
|
12
|
+
# The msms_run_summary element of a pepXML document: the description of one
# run plus its sample enzyme, search summary and spectrum queries.
class MS::Ident::Pepxml::MsmsRunSummary
  include Merge
  # The name of the pep xml file without any extension
  attr_accessor :base_name
  # The name of the mass spec manufacturer
  attr_accessor :ms_manufacturer
  attr_accessor :ms_model
  attr_accessor :ms_mass_analyzer
  attr_accessor :ms_detector
  attr_accessor :raw_data_type
  attr_accessor :raw_data
  attr_accessor :ms_ionization
  attr_accessor :pepxml_version

  # A SampleEnzyme object (responds to: name, cut, no_cut, sense)
  attr_accessor :sample_enzyme
  # A SearchSummary object
  attr_accessor :search_summary
  # An array of spectrum_queries
  attr_accessor :spectrum_queries

  # what a block given to new receives: a new SampleEnzyme, a new
  # SearchSummary (both are stored on this object) and the array of
  # spectrum queries
  def block_arg
    [@sample_enzyme = MS::Ident::Pepxml::SampleEnzyme.new,
      @search_summary = MS::Ident::Pepxml::SearchSummary.new,
      @spectrum_queries ]
  end

  # takes a hash of name, value pairs
  # if block given, yields a SampleEnzyme object, a SearchSummary and an array
  # for SpectrumQueries
  # NOTE(review): the block is handed to merge! and also called here; if
  # Merge#merge! calls it too, it runs twice -- confirm against merge.rb
  def initialize(hash={}, &block)
    @spectrum_queries = []
    merge!(hash, &block)
    block.call(block_arg) if block
  end

  # optionally takes an xml builder object and returns the builder, or the xml
  # string if no builder was given
  # sets the index attribute of each spectrum query if it is not already set
  def to_xml(builder=nil)
    xmlb = builder || Nokogiri::XML::Builder.new
    hash = {:base_name => base_name, :msManufacturer => ms_manufacturer, :msModel => ms_model, :msIonization => ms_ionization, :msMassAnalyzer => ms_mass_analyzer, :msDetector => ms_detector, :raw_data_type => raw_data_type, :raw_data => raw_data}
    # attributes that are not set are left out of the element
    hash.delete_if {|_,v| !v }
    xmlb.msms_run_summary(hash) do |xml|
      sample_enzyme.to_xml(xml) if sample_enzyme
      search_summary.to_xml(xml) if search_summary
      spectrum_queries.each_with_index do |sq,i|
        sq.index = i+1 unless sq.index
        sq.to_xml(xml)
      end
    end
    builder || xmlb.doc.root.to_xml
  end

  # returns a new object filled in from the node (see #from_pepxml_node)
  def self.from_pepxml_node(node)
    self.new.from_pepxml_node(node)
  end

  # sets the run description attributes from the attributes of a
  # msms_run_summary node (anything answering node['name'])
  # returns self
  def from_pepxml_node(node)
    @base_name = node['base_name']
    @ms_manufacturer = node['msManufacturer']
    @ms_model = node['msModel']
    @ms_ionization = node['msIonization']
    @ms_mass_analyzer = node['msMassAnalyzer']
    @ms_detector = node['msDetector']
    @raw_data_type = node['raw_data_type']
    @raw_data = node['raw_data']
    self
  end
end
|