mspire 0.5.0 → 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +24 -0
- data/Rakefile +51 -0
- data/VERSION +1 -0
- data/lib/cv/description.rb +18 -0
- data/lib/cv/param.rb +33 -0
- data/lib/cv.rb +3 -0
- data/lib/io/bookmark.rb +13 -0
- data/lib/merge.rb +7 -0
- data/lib/ms/cvlist.rb +76 -0
- data/lib/ms/digester.rb +245 -0
- data/lib/ms/fasta.rb +86 -0
- data/lib/ms/ident/peptide/db.rb +243 -0
- data/lib/ms/ident/peptide.rb +72 -0
- data/lib/ms/ident/peptide_hit/qvalue.rb +56 -0
- data/lib/ms/ident/peptide_hit.rb +26 -0
- data/lib/ms/ident/pepxml/modifications.rb +83 -0
- data/lib/ms/ident/pepxml/msms_pipeline_analysis.rb +70 -0
- data/lib/ms/ident/pepxml/msms_run_summary.rb +82 -0
- data/lib/ms/ident/pepxml/parameters.rb +14 -0
- data/lib/ms/ident/pepxml/sample_enzyme.rb +165 -0
- data/lib/ms/ident/pepxml/search_database.rb +49 -0
- data/lib/ms/ident/pepxml/search_hit/modification_info.rb +79 -0
- data/lib/ms/ident/pepxml/search_hit.rb +144 -0
- data/lib/ms/ident/pepxml/search_result.rb +35 -0
- data/lib/ms/ident/pepxml/search_summary.rb +92 -0
- data/lib/ms/ident/pepxml/spectrum_query.rb +85 -0
- data/lib/ms/ident/pepxml.rb +112 -0
- data/lib/ms/ident/protein.rb +33 -0
- data/lib/ms/ident/protein_group.rb +80 -0
- data/lib/ms/ident/search.rb +114 -0
- data/lib/ms/ident.rb +37 -0
- data/lib/ms/isotope/aa.rb +59 -0
- data/lib/ms/mascot.rb +6 -0
- data/lib/ms/mass/aa.rb +79 -0
- data/lib/ms/mass.rb +55 -0
- data/lib/ms/mzml/index_list.rb +98 -0
- data/lib/ms/mzml/plms1.rb +34 -0
- data/lib/ms/mzml.rb +197 -0
- data/lib/ms/obo.rb +38 -0
- data/lib/ms/plms1.rb +156 -0
- data/lib/ms/quant/qspec/protein_group_comparison.rb +22 -0
- data/lib/ms/quant/qspec.rb +112 -0
- data/lib/ms/spectrum.rb +154 -8
- data/lib/ms.rb +3 -10
- data/lib/msplat.rb +2 -0
- data/lib/obo/ims.rb +5 -0
- data/lib/obo/ms.rb +7 -0
- data/lib/obo/ontology.rb +41 -0
- data/lib/obo/unit.rb +5 -0
- data/lib/openany.rb +23 -0
- data/lib/write_file_or_string.rb +18 -0
- data/obo/ims.obo +562 -0
- data/obo/ms.obo +11677 -0
- data/obo/unit.obo +2563 -0
- data/spec/ms/cvlist_spec.rb +60 -0
- data/spec/ms/digester_spec.rb +351 -0
- data/spec/ms/fasta_spec.rb +100 -0
- data/spec/ms/ident/peptide/db_spec.rb +108 -0
- data/spec/ms/ident/pepxml/sample_enzyme_spec.rb +181 -0
- data/spec/ms/ident/pepxml/search_hit/modification_info_spec.rb +37 -0
- data/spec/ms/ident/pepxml_spec.rb +442 -0
- data/spec/ms/ident/protein_group_spec.rb +68 -0
- data/spec/ms/mass_spec.rb +8 -0
- data/spec/ms/mzml/index_list_spec.rb +122 -0
- data/spec/ms/mzml/plms1_spec.rb +62 -0
- data/spec/ms/mzml_spec.rb +50 -0
- data/spec/ms/plms1_spec.rb +38 -0
- data/spec/ms/quant/qspec_spec.rb +25 -0
- data/spec/msplat_spec.rb +24 -0
- data/spec/obo_spec.rb +25 -0
- data/spec/spec_helper.rb +25 -0
- data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta +69 -0
- data/spec/testfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -0
- data/spec/testfiles/ms/mzml/j24z.idx_comp.3.mzML +271 -0
- data/spec/testfiles/ms/mzml/openms.noidx_nocomp.12.mzML +330 -0
- data/spec/testfiles/ms/quant/kill_extra_tabs.rb +13 -0
- data/spec/testfiles/ms/quant/max_quant_output.provenance.txt +15 -0
- data/spec/testfiles/ms/quant/max_quant_output.txt +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.killedextratabs.tsv_qspecgp.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.txt +199 -0
- data/spec/testfiles/ms/quant/pdcd5_final.txt_qspecgp +0 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.CSV.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.oneprot.tsv_qspecgp.csv +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_qspec.txt +199 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt +134 -0
- data/spec/testfiles/ms/quant/pdcd5_lfq_tabdel.txt_qspecgp +134 -0
- data/spec/testfiles/ms/quant/remove_rest_of_proteins.rb +13 -0
- data/spec/testfiles/ms/quant/unlog_transform.rb +13 -0
- data/spec/testfiles/plms1/output.key +0 -0
- metadata +157 -40
- data/README +0 -77
- data/changelog.txt +0 -196
- data/lib/ms/calc.rb +0 -32
- data/lib/ms/data/interleaved.rb +0 -60
- data/lib/ms/data/lazy_io.rb +0 -73
- data/lib/ms/data/lazy_string.rb +0 -15
- data/lib/ms/data/simple.rb +0 -59
- data/lib/ms/data/transposed.rb +0 -41
- data/lib/ms/data.rb +0 -57
- data/lib/ms/format/format_error.rb +0 -12
- data/lib/ms/support/binary_search.rb +0 -126
@@ -0,0 +1,243 @@
|
|
1
|
+
require 'ms/digester'
require 'ms/fasta'
require 'optparse'
require 'yaml'
|
4
|
+
|
5
|
+
module MS ; end
|
6
|
+
module MS::Ident ; end
|
7
|
+
module MS::Ident::Peptide ; end
|
8
|
+
|
9
|
+
# the object itself is a modified Hash.
|
10
|
+
# It is initialized with the database file and a protein array can be
|
11
|
+
# retrieved with the #[] method given an amino acid sequence. All other
|
12
|
+
# methods are untested at this time and should be avoided!
|
13
|
+
class MS::Ident::Peptide::Db < Hash
|
14
|
+
MAX_NUM_AA_EXPANSION = 3
|
15
|
+
|
16
|
+
# the twenty standard amino acids
|
17
|
+
STANDARD_AA = %w(A C D E F G H I K L M N P Q R S T V W Y)
|
18
|
+
|
19
|
+
DEFAULT_PEPTIDE_CENTRIC_DB = {:missed_cleavages => 2, :min_length => 4, :enzyme => MS::Digester[:trypsin], :id_regexp => nil, :remove_digestion_file => true, :cleave_initiator_methionine => true, :expand_aa => {'X' => STANDARD_AA}}
|
20
|
+
|
21
|
+
PROTEIN_DELIMITER = "\t"
|
22
|
+
KEY_VALUE_DELIMITER = ": "
|
23
|
+
|
24
|
+
def self.cmdline(argv)
|
25
|
+
|
26
|
+
opt = {
|
27
|
+
:remove_digestion_file => true,
|
28
|
+
:enzyme => MS::Digester[:trypsin]
|
29
|
+
}
|
30
|
+
opts = OptionParser.new do |op|
|
31
|
+
op.banner = "usage: #{File.basename($0)} <file>.fasta ..."
|
32
|
+
op.separator "output: "
|
33
|
+
op.separator " <file>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml"
|
34
|
+
op.separator "format:"
|
35
|
+
op.separator " PEPTIDE: ID1<tab>ID2<tab>ID3..."
|
36
|
+
op.separator ""
|
37
|
+
op.separator " Initiator Methionines - by default, will generate two peptides"
|
38
|
+
op.separator " for any peptide found at the N-termini starting with 'M'"
|
39
|
+
op.separator " (i.e., one with and one without the leading methionine)"
|
40
|
+
op.separator ""
|
41
|
+
op.on("--missed-cleavages <#{opt[:missed_cleavages]}>", Integer, "max num of missed cleavages") {|v| opt[:missed_cleavages] = v }
|
42
|
+
op.on("--min-length <#{opt[:min_length]}>", Integer, "the minimum peptide aaseq length") {|v| opt[:min_length] = v }
|
43
|
+
op.on("--no-cleaved-methionine", "does not cleave off initiator methionine") { opt[:cleave_initiator_methionine] = false }
|
44
|
+
op.on("--no-expand-x", "don't enumerate aa 'X' possibilities") { opt[:expand_aa] = nil }
|
45
|
+
op.on("-e", "--enzyme <name>", "enzyme for digestion") {|v| opt[:enzyme] = MS::Insilico::Digester.const_get(v.upcase) }
|
46
|
+
op.on("--list-enzymes", "lists approved enzymes and exits") do
|
47
|
+
puts MS::Digester::ENZYMES.keys.join("\n")
|
48
|
+
exit
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
opts.parse!(argv)
|
53
|
+
|
54
|
+
if argv.size == 0
|
55
|
+
puts opts || exit
|
56
|
+
end
|
57
|
+
|
58
|
+
argv.map do |file|
|
59
|
+
MS::Ident::Peptide::Db.peptide_centric_db(file, opt)
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
# writes a new file with the added 'min_aaseq<Integer>'
|
64
|
+
# creates a temporary digestion file that contains all peptides digesting
|
65
|
+
# with certain missed_cleavages (i.e., min_seq_length is not applied to
|
66
|
+
# this file but on the final peptide centric db)
|
67
|
+
# returns the full name of the written file.
|
68
|
+
def self.peptide_centric_db(fasta_file, opts={})
|
69
|
+
opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
|
70
|
+
|
71
|
+
(missed_cleavages, min_length, enzyme, id_regexp, remove_digestion_file, cleave_initiator_methionine, expand_aa) = opts.values_at(:missed_cleavages, :min_length, :enzyme, :id_regexp, :remove_digestion_file, :cleave_initiator_methionine, :expand_aa)
|
72
|
+
start_time = Time.now
|
73
|
+
print "Digesting #{fasta_file} ..." if $VERBOSE
|
74
|
+
|
75
|
+
if expand_aa
|
76
|
+
letters_to_expand_re = Regexp.new("[" << Regexp.escape(expand_aa.keys.join) << "]")
|
77
|
+
end
|
78
|
+
|
79
|
+
base = fasta_file.chomp(File.extname(fasta_file))
|
80
|
+
digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides"
|
81
|
+
File.open(digestion_file, "w") do |fh|
|
82
|
+
MS::Fasta.open(fasta_file) do |fasta|
|
83
|
+
fasta.each do |prot|
|
84
|
+
peptides = enzyme.digest(prot.sequence, missed_cleavages)
|
85
|
+
if (cleave_initiator_methionine && (prot.sequence[0,1] == "M"))
|
86
|
+
m_peps = []
|
87
|
+
init_methionine_peps = []
|
88
|
+
peptides.each do |pep|
|
89
|
+
# if the peptide is at the beginning of the protein sequence
|
90
|
+
if prot.sequence[0,pep.size] == pep
|
91
|
+
m_peps << pep[1..-1]
|
92
|
+
end
|
93
|
+
end
|
94
|
+
peptides.push(*m_peps)
|
95
|
+
end
|
96
|
+
if expand_aa
|
97
|
+
peptides = peptides.map do |pep|
|
98
|
+
if pep =~ letters_to_expand_re
|
99
|
+
expand_peptides(pep, expand_aa)
|
100
|
+
else
|
101
|
+
pep
|
102
|
+
end
|
103
|
+
end.flatten
|
104
|
+
end
|
105
|
+
fh.puts( prot.header.split(/\s+/).first + "\t" + peptides.join(" ") )
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
109
|
+
puts "#{Time.now - start_time} sec" if $VERBOSE
|
110
|
+
|
111
|
+
|
112
|
+
start_time = Time.now
|
113
|
+
print "Organizing raw digestion #{digestion_file} ..." if $VERBOSE
|
114
|
+
|
115
|
+
hash = Hash.new {|h,k| h[k] = [] }
|
116
|
+
::IO.foreach(digestion_file) do |line|
|
117
|
+
(prot, *peps) = line.chomp!.split(/\s+/)
|
118
|
+
# prot is something like this: "sp|P31946|1433B_HUMAN" in uniprot
|
119
|
+
peps.each do |pep|
|
120
|
+
if pep.size >= min_length
|
121
|
+
hash[pep] << prot
|
122
|
+
end
|
123
|
+
end
|
124
|
+
end
|
125
|
+
puts "#{Time.now - start_time} sec" if $VERBOSE
|
126
|
+
|
127
|
+
base = digestion_file.chomp(File.extname(digestion_file))
|
128
|
+
final_outfile = base + ".min_aaseq#{min_length}" + ".yml"
|
129
|
+
|
130
|
+
start_time = Time.now
|
131
|
+
print "Writing #{hash.size} peptides to #{} ..." if $VERBOSE
|
132
|
+
|
133
|
+
File.open(final_outfile, 'w') do |out|
|
134
|
+
hash.each do |k,v|
|
135
|
+
out.puts( [k, v.join(PROTEIN_DELIMITER)].join(KEY_VALUE_DELIMITER) )
|
136
|
+
end
|
137
|
+
end
|
138
|
+
puts "#{Time.now - start_time} sec" if $VERBOSE
|
139
|
+
|
140
|
+
if remove_digestion_file
|
141
|
+
File.unlink(digestion_file)
|
142
|
+
end
|
143
|
+
File.expand_path(final_outfile)
|
144
|
+
end
|
145
|
+
|
146
|
+
# does combinatorial expansion of all letters requesting it.
|
147
|
+
# expand_aa is hash like: {'X'=>STANDARD_AA}
|
148
|
+
# returns nil if there are more than MAX_NUM_AA_EXPANSION amino acids to
|
149
|
+
# be expanded
|
150
|
+
# returns an empty array if there is no expansion
|
151
|
+
def self.expand_peptides(peptide, expand_aa)
|
152
|
+
letters_in_order = expand_aa.keys.sort
|
153
|
+
index_and_key = []
|
154
|
+
peptide.split('').each_with_index do |char,i|
|
155
|
+
if let_index = letters_in_order.index(char)
|
156
|
+
index_and_key << [i, letters_in_order[let_index]]
|
157
|
+
end
|
158
|
+
end
|
159
|
+
if index_and_key.size > MAX_NUM_AA_EXPANSION
|
160
|
+
return nil
|
161
|
+
end
|
162
|
+
to_expand = [peptide]
|
163
|
+
index_and_key.each do |i,letter|
|
164
|
+
new_peps = []
|
165
|
+
while current_pep = to_expand.shift do
|
166
|
+
new_peps << expand_aa[letter].map {|v| dp = current_pep.dup ; dp[i] = v ; dp }
|
167
|
+
end
|
168
|
+
to_expand = new_peps.flatten
|
169
|
+
end
|
170
|
+
to_expand
|
171
|
+
end
|
172
|
+
|
173
|
+
def initialize(db_file)
|
174
|
+
self.replace(YAML.load_file(db_file))
|
175
|
+
end
|
176
|
+
|
177
|
+
alias_method :old_bracket, '[]'.to_sym
|
178
|
+
|
179
|
+
# returns the protein id's as an array
|
180
|
+
def [](key)
|
181
|
+
old_bracket(key).chomp.split(PROTEIN_DELIMITER)
|
182
|
+
end
|
183
|
+
|
184
|
+
# an object for on disk retrieval of db entries
|
185
|
+
# proteins are returned as an array.
|
186
|
+
# behaves much like a hash once it is opened.
|
187
|
+
class IO
|
188
|
+
include Enumerable
|
189
|
+
def self.open(filename, &block)
|
190
|
+
raise ArgumentError unless block
|
191
|
+
File.open(filename) do |io|
|
192
|
+
block.call(self.new(io))
|
193
|
+
end
|
194
|
+
end
|
195
|
+
|
196
|
+
attr_accessor :io
|
197
|
+
attr_accessor :index
|
198
|
+
|
199
|
+
def initialize(io)
|
200
|
+
@io = io
|
201
|
+
@index = {}
|
202
|
+
re = /^(\w+)#{Regexp.escape(KEY_VALUE_DELIMITER)}/
|
203
|
+
prev_io_pos = io.pos
|
204
|
+
triplets = io.each_line.map do |line|
|
205
|
+
key = re.match(line)[1]
|
206
|
+
[key, prev_io_pos + key.bytesize+KEY_VALUE_DELIMITER.bytesize, prev_io_pos=io.pos]
|
207
|
+
end
|
208
|
+
triplets.each do |key, start, end_pos|
|
209
|
+
@index[key] = [start, end_pos-start]
|
210
|
+
end
|
211
|
+
end
|
212
|
+
|
213
|
+
# returns an array of proteins for the given key (peptide aaseq)
|
214
|
+
def [](key)
|
215
|
+
(start, length) = @index[key]
|
216
|
+
return nil unless start
|
217
|
+
@io.seek(start)
|
218
|
+
string = @io.read(length)
|
219
|
+
string.chomp!
|
220
|
+
string.split("\t")
|
221
|
+
end
|
222
|
+
|
223
|
+
# number of entries
|
224
|
+
def size ; @index.size end
|
225
|
+
alias_method :length, :size
|
226
|
+
|
227
|
+
def keys
|
228
|
+
@index.keys
|
229
|
+
end
|
230
|
+
|
231
|
+
# all the protein lists
|
232
|
+
def values
|
233
|
+
keys.map {|key| self[key] }
|
234
|
+
end
|
235
|
+
|
236
|
+
# yields a pair of aaseq and protein array
|
237
|
+
def each(&block)
|
238
|
+
@index.each do |key, start_length|
|
239
|
+
block.call([key, self[key]])
|
240
|
+
end
|
241
|
+
end
|
242
|
+
end
|
243
|
+
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
module MS ; end
|
2
|
+
module MS::Ident ; end
|
3
|
+
|
4
|
+
# A 'sequence' is a notation of a peptide that includes the leading and
|
5
|
+
# trailing amino acid after cleavage (e.g., K.PEPTIDER.E or -.STARTK.L )
|
6
|
+
# and may contain post-translational modification information.
|
7
|
+
#
|
8
|
+
# 'aaseq' is the amino acid sequence of just the peptide with no leading or
|
9
|
+
# trailing notation (e.g., PEPTIDER or LAKKLY)
|
10
|
+
module MS::Ident::Peptide
|
11
|
+
Nonstandard_AA_re = /[^A-Z\.\-]/
|
12
|
+
|
13
|
+
class << self
|
14
|
+
|
15
|
+
# Takes a peptide sequence of the form '-.PEPTIDE.R', removes non-standard
|
16
|
+
# amino acids, and returns the center piece
|
17
|
+
def sequence_to_aaseq(sequence)
|
18
|
+
after_removed = remove_non_amino_acids(sequence)
|
19
|
+
pieces = after_removed.split('.')
|
20
|
+
case pieces.size
|
21
|
+
when 3
|
22
|
+
pieces[1]
|
23
|
+
when 2
|
24
|
+
if pieces[0].size > 1 ## N termini
|
25
|
+
pieces[0]
|
26
|
+
else ## C termini
|
27
|
+
pieces[1]
|
28
|
+
end
|
29
|
+
when 1 ## this must be a parse error!
|
30
|
+
pieces[0] ## which is the peptide itself
|
31
|
+
else
|
32
|
+
abort "bad peptide sequence: #{sequence.inspect}"
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
# removes non standard amino acids specified by Nonstandard_AA_re
|
37
|
+
def remove_non_amino_acids(sequence)
|
38
|
+
sequence.gsub(Nonstandard_AA_re, '')
|
39
|
+
end
|
40
|
+
|
41
|
+
# remove non amino acids and split the sequence
|
42
|
+
def prepare_sequence(sequence)
|
43
|
+
nv = remove_non_amino_acids(sequence)
|
44
|
+
split_sequence(nv)
|
45
|
+
end
|
46
|
+
|
47
|
+
# Returns prev, peptide, next from sequence. Parse errors return
|
48
|
+
# nil,nil,nil
|
49
|
+
# R.PEPTIDE.A # -> R, PEPTIDE, A
|
50
|
+
# R.PEPTIDE.- # -> R, PEPTIDE, -
|
51
|
+
# PEPTIDE.A # -> -, PEPTIDE, A
|
52
|
+
# A.PEPTIDE # -> A, PEPTIDE, -
|
53
|
+
# PEPTIDE # -> nil,nil,nil
|
54
|
+
def split_sequence(sequence)
|
55
|
+
pieces = sequence.split('.')
|
56
|
+
case pieces.size
|
57
|
+
when 3
|
58
|
+
pieces
|
59
|
+
when 2
|
60
|
+
if pieces[0].size > 1 ## N termini
|
61
|
+
['-', pieces[0], pieces[1]]
|
62
|
+
else ## C termini
|
63
|
+
[pieces[0], pieces[1], '-']
|
64
|
+
end
|
65
|
+
when 1 ## this must be a parse error!
|
66
|
+
[nil,nil,nil]
|
67
|
+
when 0
|
68
|
+
[nil,nil,nil]
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'ms/ident/search'
|
2
|
+
require 'ms/ident/peptide_hit'
|
3
|
+
|
4
|
+
module MS ; end
|
5
|
+
module MS::Ident ; end
|
6
|
+
|
7
|
+
class MS::Ident::PeptideHit
|
8
|
+
module Qvalue
|
9
|
+
FILE_EXTENSION = '.phq.tsv'
|
10
|
+
FILE_DELIMITER = "\t"
|
11
|
+
HEADER = %w(run_id id aaseq charge qvalue)
|
12
|
+
|
13
|
+
class << self
|
14
|
+
|
15
|
+
# writes to the file, adding an extension
|
16
|
+
def to_phq(base, hits, qvalues=[])
|
17
|
+
to_file(base + FILE_EXTENSION, hits, qvalues)
|
18
|
+
end
|
19
|
+
|
20
|
+
# writes the peptide hits to a phq.tsv file. qvalues is a parallel array
|
21
|
+
# to hits that can provide qvalues if not inherent to the hits
|
22
|
+
# returns the filename.
|
23
|
+
def to_file(filename, hits, qvalues=[])
|
24
|
+
File.open(filename,'w') do |out|
|
25
|
+
out.puts HEADER.join(FILE_DELIMITER)
|
26
|
+
hits.zip(qvalues) do |hit, qvalue|
|
27
|
+
out.puts [hit.search.id, hit.id, hit.aaseq, hit.charge, qvalue || hit.qvalue].join(FILE_DELIMITER)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
filename
|
31
|
+
end
|
32
|
+
|
33
|
+
# returns an array of PeptideHit objects from a phq.tsv
|
34
|
+
def from_file(filename)
|
35
|
+
searches = Hash.new {|h,id| h[id] = MS::Ident::Search.new(id) }
|
36
|
+
peptide_hits = []
|
37
|
+
File.open(filename) do |io|
|
38
|
+
header = io.readline.chomp.split(FILE_DELIMITER)
|
39
|
+
raise "bad headers" unless header == HEADER
|
40
|
+
io.each do |line|
|
41
|
+
line.chomp!
|
42
|
+
(run_id, id, aaseq, charge, qvalue) = line.split(FILE_DELIMITER)
|
43
|
+
ph = MS::Ident::PeptideHit.new
|
44
|
+
ph.search = searches[run_id]
|
45
|
+
ph.id = id; ph.aaseq = aaseq ; ph.charge = charge.to_i ; ph.qvalue = qvalue.to_f
|
46
|
+
peptide_hits << ph
|
47
|
+
end
|
48
|
+
end
|
49
|
+
peptide_hits
|
50
|
+
end
|
51
|
+
|
52
|
+
alias_method :from_phq, :from_file
|
53
|
+
|
54
|
+
end
|
55
|
+
end # Qvalue
|
56
|
+
end # Peptide Hit
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'merge'
|
2
|
+
|
3
|
+
module MS ; end
|
4
|
+
module MS::Ident ; end
|
5
|
+
|
6
|
+
module MS::Ident::PeptideHitLike
|
7
|
+
attr_accessor :id
|
8
|
+
attr_accessor :search
|
9
|
+
attr_accessor :missed_cleavages
|
10
|
+
attr_accessor :aaseq
|
11
|
+
attr_accessor :charge
|
12
|
+
# an array of MS::Ident::ProteinLike objects
|
13
|
+
attr_accessor :proteins
|
14
|
+
# relative to the set the hit is contained in!
|
15
|
+
attr_accessor :qvalue
|
16
|
+
end
|
17
|
+
|
18
|
+
class MS::Ident::PeptideHit
|
19
|
+
include MS::Ident::PeptideHitLike
|
20
|
+
include Merge
|
21
|
+
|
22
|
+
def initialize(hash)
|
23
|
+
merge!(hash)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'merge'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
module MS ; end
|
5
|
+
module MS::Ident ; end
|
6
|
+
class MS::Ident::Pepxml ; end
|
7
|
+
|
8
|
+
# Modified aminoacid, static or variable
|
9
|
+
# unless otherwise stated, all attributes can be anything
|
10
|
+
class MS::Ident::Pepxml::AminoacidModification
|
11
|
+
include Merge
|
12
|
+
# The amino acid (one letter code)
|
13
|
+
attr_accessor :aminoacid
|
14
|
+
# Mass difference with respect to unmodified aminoacid, as a Float
|
15
|
+
attr_accessor :massdiff
|
16
|
+
# Mass of modified aminoacid, Float
|
17
|
+
attr_accessor :mass
|
18
|
+
# Y if both modified and unmodified aminoacid could be present in the
|
19
|
+
# dataset, N if only modified aminoacid can be present
|
20
|
+
attr_accessor :variable
|
21
|
+
# whether modification can reside only at protein terminus (specified 'n',
|
22
|
+
# 'c', or 'nc')
|
23
|
+
attr_accessor :peptide_terminus
|
24
|
+
# Symbol used by search engine to designate this modification
|
25
|
+
attr_accessor :symbol
|
26
|
+
# 'Y' if each peptide must have only modified or unmodified aminoacid, 'N' if a
|
27
|
+
# peptide may contain both modified and unmodified aminoacid
|
28
|
+
attr_accessor :binary
|
29
|
+
|
30
|
+
def initialize(hash={})
|
31
|
+
merge!(hash)
|
32
|
+
end
|
33
|
+
|
34
|
+
# returns the builder or an xml string if no builder supplied
|
35
|
+
def to_xml(builder=nil)
|
36
|
+
xmlb = builder || Nokogiri::XML::Builder.new
|
37
|
+
# note massdiff: must begin with either + (nonnegative) or - [e.g.
|
38
|
+
# +1.05446 or -2.3342] consider Numeric#to_plus_minus_string in
|
39
|
+
# MS::Ident::Pepxml
|
40
|
+
attrs = [:aminoacid, :massdiff, :mass, :variable, :peptide_terminus, :symbol, :binary].map {|at| v=send(at) ; [at,v] if v }.compact
|
41
|
+
hash = Hash[attrs]
|
42
|
+
hash[:massdiff] = hash[:massdiff].to_plus_minus_string
|
43
|
+
xmlb.aminoacid_modification(hash)
|
44
|
+
builder || xmlb.doc.root.to_xml
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
# Modified aminoacid, static or variable
|
49
|
+
class MS::Ident::Pepxml::TerminalModification
|
50
|
+
include Merge
|
51
|
+
# n for N-terminus, c for C-terminus
|
52
|
+
attr_accessor :terminus
|
53
|
+
# Mass difference with respect to unmodified terminus
|
54
|
+
attr_accessor :massdiff
|
55
|
+
# Mass of modified terminus
|
56
|
+
attr_accessor :mass
|
57
|
+
# Y if both modified and unmodified terminus could be present in the
|
58
|
+
# dataset, N if only modified terminus can be present
|
59
|
+
attr_accessor :variable
|
60
|
+
# MSial symbol used by search engine to designate this modification
|
61
|
+
attr_accessor :symbol
|
62
|
+
# whether modification can reside only at protein terminus (specified n or
|
63
|
+
# c)
|
64
|
+
attr_accessor :protein_terminus
|
65
|
+
attr_accessor :description
|
66
|
+
|
67
|
+
def initialize(hash={})
|
68
|
+
hash.each {|k,v| send("#{k}=", v) }
|
69
|
+
end
|
70
|
+
|
71
|
+
# returns the builder or an xml string if no builder supplied
|
72
|
+
def to_xml(builder=nil)
|
73
|
+
xmlb = builder || Nokogiri::XML::Builder.new
|
74
|
+
#short_element_xml_from_instance_vars("terminal_modification")
|
75
|
+
attrs = [:terminus, :massdiff, :mass, :variable, :protein_terminus, :description].map {|at| v=send(at) ; [at,v] if v }
|
76
|
+
hash = Hash[attrs]
|
77
|
+
hash[:massdiff] = hash[:massdiff].to_plus_minus_string
|
78
|
+
xmlb.terminal_modification(hash)
|
79
|
+
builder || xmlb.doc.root.to_xml
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'merge'
|
2
|
+
|
3
|
+
require 'ms/ident/pepxml/msms_run_summary'
|
4
|
+
|
5
|
+
module MS ; end
|
6
|
+
module MS::Ident ; end
|
7
|
+
class MS::Ident::Pepxml; end
|
8
|
+
|
9
|
+
class MS::Ident::Pepxml::MsmsPipelineAnalysis
|
10
|
+
include Merge
|
11
|
+
XMLNS = "http://regis-web.systemsbiology.net/pepXML"
|
12
|
+
XMLNS_XSI = "http://www.w3.org/2001/XMLSchema-instance"
|
13
|
+
# (this doesn't actually exist), also, the space is supposed to be there
|
14
|
+
XSI_SCHEMA_LOCATION_BASE = "http://regis-web.systemsbiology.net/pepXML /tools/bin/TPP/tpp/schema/pepXML_v"
|
15
|
+
# the only additions concerning a writer are from v18 are to the 'spectrum': retention_time_sec and activationMethodType
|
16
|
+
PEPXML_VERSION = 115
|
17
|
+
|
18
|
+
#include SpecIDXML
|
19
|
+
# Version 1.2.3
|
20
|
+
#attr_writer :date
|
21
|
+
#attr_writer :xmlns, :xmlns_xsi, :xsi_schemaLocation
|
22
|
+
#attr_accessor :summary_xml
|
23
|
+
|
24
|
+
attr_accessor :xmlns
|
25
|
+
attr_accessor :xmlns_xsi
|
26
|
+
attr_accessor :xsi_schema_location
|
27
|
+
# an Integer
|
28
|
+
attr_accessor :pepxml_version
|
29
|
+
# self referential path to the outputfile
|
30
|
+
attr_accessor :summary_xml
|
31
|
+
attr_accessor :msms_run_summary
|
32
|
+
attr_writer :date
|
33
|
+
|
34
|
+
def block_arg
|
35
|
+
@msms_run_summary = MS::Ident::Pepxml::MsmsRunSummary.new
|
36
|
+
end
|
37
|
+
|
38
|
+
# if block given, yields a new msms_run_summary to return value of block
|
39
|
+
def initialize(hash={}, &block)
|
40
|
+
@xmlns = XMLNS
|
41
|
+
@xmlns_xsi = XMLNS_XSI
|
42
|
+
@xsi_schema_location = xsi_schema_location
|
43
|
+
@pepxml_version = PEPXML_VERSION
|
44
|
+
merge!(hash, &block)
|
45
|
+
end
|
46
|
+
|
47
|
+
# returns the location based on the pepxml version number
|
48
|
+
def xsi_schema_location
|
49
|
+
XSI_SCHEMA_LOCATION_BASE + pepxml_version.to_s + '.xsd'
|
50
|
+
end
|
51
|
+
|
52
|
+
# if no date string given, then it will set to Time.now
|
53
|
+
def date
|
54
|
+
return @date if @date
|
55
|
+
tarr = Time.now.to_a
|
56
|
+
tarr[3..5].reverse.join('-') + "T#{tarr[0..2].reverse.join(':')}"
|
57
|
+
end
|
58
|
+
|
59
|
+
# uses the filename as summary_xml (if it is nil) attribute and builds a complete, valid xml document,
|
60
|
+
# writing it to the filename
|
61
|
+
def to_xml(builder)
|
62
|
+
xmlb = builder || Nokogiri::XML::Builder.new
|
63
|
+
xmlb.msms_pipeline_analysis(:date => date, :xmlns => xmlns, 'xsi:schemaLocation'.to_sym => xsi_schema_location, :summary_xml => summary_xml) do |xmlb|
|
64
|
+
msms_run_summary.to_xml(xmlb) if msms_run_summary
|
65
|
+
end
|
66
|
+
builder || xmlb.doc.root.to_xml
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
|
@@ -0,0 +1,82 @@
|
|
1
|
+
require 'merge'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
require 'ms/ident/pepxml/sample_enzyme'
|
5
|
+
require 'ms/ident/pepxml/search_summary'
|
6
|
+
require 'ms/ident/pepxml/spectrum_query'
|
7
|
+
|
8
|
+
module MS ; end
|
9
|
+
module MS::Ident ; end
|
10
|
+
class MS::Ident::Pepxml; end
|
11
|
+
|
12
|
+
class MS::Ident::Pepxml::MsmsRunSummary
|
13
|
+
include Merge
|
14
|
+
# The name of the pep xml file without any extension
|
15
|
+
attr_accessor :base_name
|
16
|
+
# The name of the mass spec manufacturer
|
17
|
+
attr_accessor :ms_manufacturer
|
18
|
+
attr_accessor :ms_model
|
19
|
+
attr_accessor :ms_mass_analyzer
|
20
|
+
attr_accessor :ms_detector
|
21
|
+
attr_accessor :raw_data_type
|
22
|
+
attr_accessor :raw_data
|
23
|
+
attr_accessor :ms_ionization
|
24
|
+
attr_accessor :pepxml_version
|
25
|
+
|
26
|
+
# A SampleEnzyme object (responds to: name, cut, no_cut, sense)
|
27
|
+
attr_accessor :sample_enzyme
|
28
|
+
# A SearchSummary object
|
29
|
+
attr_accessor :search_summary
|
30
|
+
# An array of spectrum_queries
|
31
|
+
attr_accessor :spectrum_queries
|
32
|
+
|
33
|
+
def block_arg
|
34
|
+
[@sample_enzyme = MS::Ident::Pepxml::SampleEnzyme.new,
|
35
|
+
@search_summary = MS::Ident::Pepxml::SearchSummary.new,
|
36
|
+
@spectrum_queries ]
|
37
|
+
end
|
38
|
+
|
39
|
+
# takes a hash of name, value pairs
|
40
|
+
# if block given, yields a SampleEnzyme object, a SearchSummary and an array
|
41
|
+
# for SpectrumQueries
|
42
|
+
def initialize(hash={}, &block)
|
43
|
+
@spectrum_queries = []
|
44
|
+
merge!(hash, &block)
|
45
|
+
block.call(block_arg) if block
|
46
|
+
end
|
47
|
+
|
48
|
+
# optionally takes an xml builder object and returns the builder, or the xml
|
49
|
+
# string if no builder was given
|
50
|
+
# sets the index attribute of each spectrum query if it is not already set
|
51
|
+
def to_xml(builder=nil)
|
52
|
+
xmlb = builder || Nokogiri::XML::Builder.new
|
53
|
+
hash = {:base_name => base_name, :msManufacturer => ms_manufacturer, :msModel => ms_model, :msIonization => ms_ionization, :msMassAnalyzer => ms_mass_analyzer, :msDetector => ms_detector, :raw_data_type => raw_data_type, :raw_data => raw_data}
|
54
|
+
hash.each {|k,v| hash.delete(k) unless v }
|
55
|
+
xmlb.msms_run_summary(hash) do |xmlb|
|
56
|
+
sample_enzyme.to_xml(xmlb) if sample_enzyme
|
57
|
+
search_summary.to_xml(xmlb) if search_summary
|
58
|
+
spectrum_queries.each_with_index do |sq,i|
|
59
|
+
sq.index = i+1 unless sq.index
|
60
|
+
sq.to_xml(xmlb)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
builder || xmlb.doc.root.to_xml
|
64
|
+
end
|
65
|
+
|
66
|
+
def self.from_pepxml_node(node)
|
67
|
+
self.new.from_pepxml_node(node)
|
68
|
+
end
|
69
|
+
|
70
|
+
# peps correspond to search_results
|
71
|
+
def from_pepxml_node(node)
|
72
|
+
@base_name = node['base_name']
|
73
|
+
@ms_manufacturer = node['msManufacturer']
|
74
|
+
@ms_model = node['msModel']
|
75
|
+
@ms_manufacturer = node['msIonization']
|
76
|
+
@ms_mass_analyzer = node['msMassAnalyzer']
|
77
|
+
@ms_detector = node['msDetector']
|
78
|
+
@raw_data_type = node['raw_data_type']
|
79
|
+
@raw_data = node['raw_data']
|
80
|
+
self
|
81
|
+
end
|
82
|
+
end
|