mspire 0.8.4 → 0.8.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +1 -1
- data/VERSION +1 -1
- data/lib/mspire/fasta.rb +5 -27
- data/lib/mspire/ident/peptide/db/creator.rb +248 -0
- data/lib/mspire/ident/peptide/db/io.rb +62 -0
- data/lib/mspire/ident/peptide/db.rb +18 -225
- data/lib/mspire/ident/peptide_hit/qvalue.rb +3 -2
- data/schema/peptide_hit_qvalues.pqh.tsv +5 -0
- data/script/mascot_dat_to_peptide_hit_qvalues.rb +118 -0
- data/spec/mspire/ident/peptide/db/creator_spec.rb +65 -0
- data/spec/mspire/ident/peptide/db/io_spec.rb +21 -0
- data/spec/mspire/ident/peptide/db_spec.rb +7 -97
- data/spec/testfiles/mspire/ident/peptide/db/uni_11_sp_tr.PEPTIDE_CENTRIC.yml +728 -0
- data/spec/testfiles/mspire/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -728
- metadata +9 -2
data/README.md
CHANGED
@@ -19,7 +19,7 @@ Prince JT, Marcotte EM. <b>mspire: mass spectrometry proteomics in Ruby.</b> *Bi
|
|
19
19
|
|
20
20
|
### imzml
|
21
21
|
|
22
|
-
Mspire is the *only* converter from mzml into imzml.
|
22
|
+
Mspire is the *only* command-line converter from mzml into imzml (also see [imzMLConverter](http://www.cs.bham.ac.uk/~ibs/imzMLConverter/)).
|
23
23
|
|
24
24
|
* handles both processed and continuous modes
|
25
25
|
* gracefully handles SIM data
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.
|
1
|
+
0.8.5
|
data/lib/mspire/fasta.rb
CHANGED
@@ -58,33 +58,11 @@ module Mspire
|
|
58
58
|
Bio::FlatFile.new(Bio::FastaFormat, io)
|
59
59
|
end
|
60
60
|
|
61
|
-
|
62
|
-
#
|
63
|
-
#
|
64
|
-
def self.
|
65
|
-
|
66
|
-
protid_to_length = {}
|
67
|
-
re = /^>([^\s]+) (.*)/
|
68
|
-
ids = []
|
69
|
-
lengths = []
|
70
|
-
current_length = nil
|
71
|
-
IO.foreach(file) do |line|
|
72
|
-
line.chomp!
|
73
|
-
if md=re.match(line)
|
74
|
-
lengths << current_length
|
75
|
-
current_id = md[1]
|
76
|
-
ids << current_id
|
77
|
-
current_length = 0
|
78
|
-
protid_to_description[current_id] = md[2]
|
79
|
-
else
|
80
|
-
current_length += line.size
|
81
|
-
end
|
82
|
-
end
|
83
|
-
lengths << current_length
|
84
|
-
lengths.shift # remove the first nil entry
|
85
|
-
[Hash[ids.zip(lengths).to_a], protid_to_description]
|
61
|
+
# Pulls the accession (the second pipe-delimited field) out of a
# uniprot-style fasta header string.
#
#     'sp|Q04917|1433F_HUMAN' #=> 'Q04917'
#
# Returns nil when the header does not match that layout.
def self.uniprot_id(header)
  match = header.match(/^[^\|]+\|([^\|]+)\|/)
  return nil unless match
  match[1]
end
|
87
|
-
=end
|
88
|
-
|
89
67
|
end
|
90
68
|
end
|
@@ -0,0 +1,248 @@
|
|
1
|
+
require 'optparse'
|
2
|
+
require 'mspire/digester'
|
3
|
+
require 'mspire/fasta'
|
4
|
+
require 'mspire/ident/peptide/db'
|
5
|
+
|
6
|
+
class Mspire::Ident::Peptide::Db::Creator
|
7
|
+
# peptides with more ambiguous residues than this are not expanded
# (see #expand_peptides)
MAX_NUM_AA_EXPANSION = 3

# the twenty standard amino acids
STANDARD_AA = %w(A C D E F G H I K L M N P Q R S T V W Y).freeze
# ambiguous letter => the residues it is enumerated into
EXPAND_AA = {'X' => STANDARD_AA}.freeze

# defaults that every public method merges the caller's options over.
# Frozen so that no caller can alter the shared defaults by accident
# (they are only ever combined with a non-mutating #merge).
DEFAULT_PEPTIDE_CENTRIC_DB = {
  missed_cleavages: 2,
  min_length: 4,
  enzyme: Mspire::Digester[:trypsin],
  remove_digestion_file: true,
  cleave_initiator_methionine: true,
  expand_aa: true,
  uniprot: true
}.freeze
|
22
|
+
|
23
|
+
# Command-line entry point: parses +argv+ and creates one peptide-centric
# db per fasta file named on the command line.
#
# argv - Array of command-line arguments. Recognized options are removed
#        from it in place (OptionParser#parse!); what remains is treated
#        as the list of fasta files.
#
# Prints the usage text and exits when no fasta file is given.
# Returns an Array holding the result of #create for each fasta file.
def self.cmdline(argv)
  # the help text advertises the real defaults (opt itself does not carry them)
  defaults = DEFAULT_PEPTIDE_CENTRIC_DB

  opt = {
    :remove_digestion_file => true,
    :enzyme => Mspire::Digester[:trypsin]
  }
  opts = OptionParser.new do |op|
    op.banner = "usage: #{File.basename($0)} <file>.fasta ..."
    op.separator "output: "
    op.separator " <file>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml"
    op.separator "format:"
    op.separator " PEPTIDE: ID1<tab>ID2<tab>ID3..."
    op.separator ""
    op.separator " Initiator Methionines - by default, will generate two peptides"
    op.separator " for any peptide found at the N-termini starting with 'M'"
    op.separator " (i.e., one with and one without the leading methionine)"
    op.separator ""
    op.on("--missed-cleavages <#{defaults[:missed_cleavages]}>", Integer, "max num of missed cleavages") {|v| opt[:missed_cleavages] = v }
    op.on("--min-length <#{defaults[:min_length]}>", Integer, "the minimum peptide aaseq length") {|v| opt[:min_length] = v }
    op.on("--no-cleaved-methionine", "does not cleave off initiator methionine") { opt[:cleave_initiator_methionine] = false }
    op.on("--no-expand-x", "don't enumerate aa possibilities", "(removes these peptides)") { opt[:expand_aa] = false }
    op.on("--no-uniprot", "use entire protid section of fasta header", "for non-uniprot fasta files") { opt[:uniprot] = false }
    op.on("--trie", "use a trie (for very large uniprot files)", "must have fast_trie gem installed") {|v| opt[:trie] = v }
    op.on("-e", "--enzyme <name>", "enzyme for digestion") do |v|
      # Looked up the same way the default enzyme is (Mspire::Digester[...]),
      # using the names that --list-enzymes prints.
      # NOTE(review): assumes the ENZYMES keys are symbols, as in
      # Mspire::Digester[:trypsin] -- confirm against mspire/digester.
      enzyme = Mspire::Digester[v.to_sym]
      raise OptionParser::InvalidArgument, v unless enzyme
      opt[:enzyme] = enzyme
    end
    op.on("--list-enzymes", "lists approved enzymes and exits") do
      puts Mspire::Digester::ENZYMES.keys.join("\n")
      exit
    end
  end

  opts.parse!(argv)

  # `puts opts || exit` parsed as `puts(opts || exit)` and therefore never
  # exited; print the usage and stop explicitly.
  if argv.size == 0
    puts opts
    exit
  end

  argv.map do |file|
    creator = Mspire::Ident::Peptide::Db::Creator.new
    creator.create(file, opt)
  end
end
|
64
|
+
|
65
|
+
# Digests every protein of +fasta_file+ and writes one line per protein to
# "<base>.msd_clvg<missed_cleavages>.peptides", formatted as
# "<protein id>\t<pep> <pep> ...".
#
# opts is merged over DEFAULT_PEPTIDE_CENTRIC_DB; the keys read here are
# :missed_cleavages, :enzyme, :cleave_initiator_methionine, :expand_aa and
# :uniprot.
#
# returns the name of the digestion file that was written
def create_digestion_file(fasta_file, opts={})
  opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)

  (missed_cleavages, enzyme, cleave_initiator_methionine, expand_aa) = opts.values_at(:missed_cleavages, :enzyme, :cleave_initiator_methionine, :expand_aa)
  start_time = Time.now
  print "Digesting #{fasta_file} ..." if $VERBOSE

  letters_to_expand_re = Regexp.new("[" << Regexp.escape(EXPAND_AA.keys.join) << "]")

  base = fasta_file.chomp(File.extname(fasta_file))
  digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides"
  File.open(digestion_file, "w") do |fh|
    Mspire::Fasta.open(fasta_file) do |fasta|
      fasta.each do |prot|
        sequence = prot.sequence
        peptides = enzyme.digest(sequence, missed_cleavages)
        if cleave_initiator_methionine && (sequence[0,1] == "M")
          # every peptide at the beginning of the protein sequence also
          # gets a copy without the leading methionine
          m_peps = peptides.select {|pep| sequence[0,pep.size] == pep }.map {|pep| pep[1..-1] }
          peptides.push(*m_peps)
        end
        peptides =
          if expand_aa
            peptides.flat_map do |pep|
              # expand_peptides returns nil when a peptide has too many
              # ambiguous residues; such peptides are dropped rather than
              # leaving a nil in the list
              (pep =~ letters_to_expand_re) ? (expand_peptides(pep, EXPAND_AA) || []) : pep
            end
          else
            # remove (rather than expand) peptides with ambiguous residues
            peptides.reject {|pep| pep =~ letters_to_expand_re }
          end
        header = prot.header
        protid = header.split(/\s+/).first
        # a header that is not in uniprot format falls back to the whole
        # protid section instead of failing on a nil id
        id = (opts[:uniprot] && Mspire::Fasta.uniprot_id(header)) || protid
        fh.puts( id + "\t" + peptides.join(" ") )
      end
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE
  digestion_file
end
|
109
|
+
|
110
|
+
# Organizes a raw digestion file into the final peptide-centric db, written
# next to it as "<base>.min_aaseq<min_length>.yml" with one
# "<peptide><KEY_VALUE_DELIMITER><proteins>" line per peptide.
#
# opts is merged over DEFAULT_PEPTIDE_CENTRIC_DB; the keys read here are
# :min_length, :trie and :remove_digestion_file (when truthy the digestion
# file is deleted once the db has been written).
#
# returns the full path of the created file
def db_from_fasta_digestion_file(digestion_file, opts={})
  opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)

  start_time = Time.now
  puts "Organizing raw digestion #{digestion_file} ..." if $VERBOSE
  hash_like = hash_like_from_digestion_file(digestion_file, opts[:min_length], opts[:trie])
  # the elapsed time is reported only after the organizing work is done
  puts "#{Time.now - start_time} sec" if $VERBOSE

  base = digestion_file.chomp(File.extname(digestion_file))
  final_outfile = base + ".min_aaseq#{opts[:min_length]}" + ".yml"

  start_time = Time.now
  print "Writing #{hash_like.size} peptides to #{final_outfile} ..." if $VERBOSE

  File.open(final_outfile, 'w') do |out|
    hash_like.each do |k,v|
      out.puts "#{k}#{Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER}#{v}"
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE

  File.unlink(digestion_file) if opts[:remove_digestion_file]
  File.expand_path(final_outfile)
end
|
139
|
+
|
140
|
+
# Builds an empty Trie that can be indexed like a hash. The 'trie' library
# is required here, so it is only needed when this method is called.
#
# The returned trie gets a singleton #[] which returns the value stored
# under the key, first adding an empty string when the key has no value.
def hash_like_tree
  require 'trie'
  Trie.new.tap do |trie|
    def trie.[](key)
      found = get(key)
      return found unless found.nil?
      add(key, "")
      get(key)
    end
  end
end
|
154
|
+
|
155
|
+
# Reads a digestion file (lines of "<protein id><whitespace><pep> <pep> ...")
# and builds the peptide-centric mapping.
#
# digestion_file - path of the digestion file to read
# min_length     - peptides shorter than this are left out
# use_trie       - trie-backed storage is not implemented; a truthy value
#                  raises NotImplementedError
#
# Returns a Hash of peptide aaseq => String of protein ids joined with
# Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER, in order of appearance.
def hash_like_from_digestion_file(digestion_file, min_length, use_trie=false)
  # NOTE: an earlier trie-based attempt (see #hash_like_tree) was disabled
  raise NotImplementedError, "trie-backed storage is not implemented" if use_trie

  delimiter = Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER
  hash = {}
  cnt = 0
  ::IO.foreach(digestion_file) do |line|
    # chomp rather than chomp!: chomp! returns nil when there is nothing to
    # remove (e.g. a last line without a trailing newline)
    (prot, *peps) = line.chomp.split(/\s+/)
    # prot is something like this: "P31946"
    peps.each do |pep|
      next if pep.size < min_length
      if (val = hash[pep])
        val << delimiter << prot
      else
        # store a copy: entries are appended to in place, so sharing the
        # one prot String between peptides would corrupt all of them
        hash[pep] = prot.dup
      end
    end
    cnt += 1
    # progress output, only when running verbosely like the other messages
    puts cnt if $VERBOSE && (cnt % 1000) == 0
  end
  hash
end
|
209
|
+
|
210
|
+
# Builds the peptide-centric db for +fasta_file+ in two steps: a temporary
# digestion file holding all peptides for the requested missed_cleavages,
# then the final db file carrying the added 'min_aaseq<Integer>' in its
# name (the minimum length is applied to the final db, not to the
# digestion file).
#
# opts is merged over DEFAULT_PEPTIDE_CENTRIC_DB and handed to both steps.
#
# returns the full name of the written file.
def create(fasta_file, opts={})
  merged = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
  db_from_fasta_digestion_file(create_digestion_file(fasta_file, merged), merged)
end
|
220
|
+
|
221
|
+
# Combinatorially expands every letter of +peptide+ that has an entry in
# expand_aa_hash, which is a hash like: {'X'=>STANDARD_AA}
#
# returns nil if more than MAX_NUM_AA_EXPANSION residues would need to be
# expanded
# returns [peptide] if there is nothing to expand
def expand_peptides(peptide, expand_aa_hash)
  expandable = expand_aa_hash.keys
  sites = []
  peptide.each_char.with_index do |residue, pos|
    sites << [pos, residue] if expandable.include?(residue)
  end
  return nil if sites.size > MAX_NUM_AA_EXPANSION

  # expand one site at a time, left to right, over all variants so far
  sites.reduce([peptide]) do |variants, (pos, letter)|
    variants.flat_map do |variant|
      expand_aa_hash[letter].map do |replacement|
        copy = variant.dup
        copy[pos] = replacement
        copy
      end
    end
  end
end
|
247
|
+
end
|
248
|
+
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'mspire/ident/peptide/db'
|
2
|
+
|
3
|
+
# An object for on-disk retrieval of db entries: the file is indexed once
# (peptide => byte range of its protein list) and the protein lists are read
# from the io on demand.
# proteins are returned as an array.
# behaves like a hash once it is opened.
class Mspire::Ident::Peptide::Db::IO
  include Enumerable

  # Opens +filename+, yields a new IO object wrapping the open file to the
  # block and returns the block's value. The file is closed when the block
  # returns. Raises ArgumentError if no block is given.
  def self.open(filename, &block)
    raise ArgumentError, "a block is required" unless block
    File.open(filename) do |io|
      block.call(self.new(io))
    end
  end

  # the underlying io object
  attr_accessor :io
  # Hash of key (peptide aaseq) => [byte offset, byte length] of its value
  attr_accessor :index

  # Indexes +io+ by reading every line from its current position. Lines
  # that do not start with "<word characters><KEY_VALUE_DELIMITER>" (e.g.
  # blank lines) are skipped instead of raising on a failed match.
  def initialize(io)
    @io = io
    @index = {}
    delimiter = Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER
    re = /^(\w+)#{Regexp.escape(delimiter)}/
    line_start = io.pos
    io.each_line do |line|
      line_end = io.pos
      if (md = re.match(line))
        key = md[1]
        # the value begins right after the key and the delimiter and runs
        # to the end of the line (including its line terminator)
        value_start = line_start + key.bytesize + delimiter.bytesize
        @index[key] = [value_start, line_end - value_start]
      end
      line_start = line_end
    end
  end

  # returns an array of proteins for the given key (peptide aaseq), or nil
  # if the key is not in the index
  def [](key)
    (start, length) = @index[key]
    return nil unless start
    @io.seek(start)
    @io.read(length).chomp.split(Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER)
  end

  # number of entries
  def size ; @index.size end
  alias_method :length, :size

  # all the keys (peptide aaseqs)
  def keys
    @index.keys
  end

  # all the protein lists
  def values
    keys.map {|key| self[key] }
  end

  # yields a pair of aaseq and protein array
  # returns an Enumerator when called without a block
  def each(&block)
    return enum_for(:each) unless block
    @index.each_key do |key|
      block.call([key, self[key]])
    end
  end
end
|
62
|
+
|
@@ -1,243 +1,36 @@
|
|
1
|
-
require '
|
2
|
-
require 'mspire/fasta'
|
3
|
-
require 'optparse'
|
1
|
+
require 'yaml'
|
4
2
|
|
5
3
|
module Mspire ; end
|
6
4
|
module Mspire::Ident ; end
|
7
5
|
module Mspire::Ident::Peptide ; end
|
8
6
|
|
9
|
-
#
|
10
|
-
#
|
11
|
-
|
12
|
-
# methods are untested at this time and should be avoided!
|
13
|
-
class Mspire::Ident::Peptide::Db < Hash
|
14
|
-
MAX_NUM_AA_EXPANSION = 3
|
15
|
-
|
16
|
-
# the twenty standard amino acids
|
17
|
-
STANDARD_AA = %w(A C D E F G H I K L M N P Q R S T V W Y)
|
18
|
-
|
19
|
-
DEFAULT_PEPTIDE_CENTRIC_DB = {:missed_cleavages => 2, :min_length => 4, :enzyme => Mspire::Digester[:trypsin], :id_regexp => nil, :remove_digestion_file => true, :cleave_initiator_methionine => true, :expand_aa => {'X' => STANDARD_AA}}
|
20
|
-
|
7
|
+
# Very simple object for protein retrieval from a peptide-centric database
|
8
|
+
# See Mspire::Ident::Peptide::Db::IO for an on-disc version for larger files.
|
9
|
+
class Mspire::Ident::Peptide::Db
|
21
10
|
PROTEIN_DELIMITER = "\t"
|
22
|
-
KEY_VALUE_DELIMITER =
|
23
|
-
|
24
|
-
def self.cmdline(argv)
|
25
|
-
|
26
|
-
opt = {
|
27
|
-
:remove_digestion_file => true,
|
28
|
-
:enzyme => Mspire::Digester[:trypsin]
|
29
|
-
}
|
30
|
-
opts = OptionParser.new do |op|
|
31
|
-
op.banner = "usage: #{File.basename($0)} <file>.fasta ..."
|
32
|
-
op.separator "output: "
|
33
|
-
op.separator " <file>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml"
|
34
|
-
op.separator "format:"
|
35
|
-
op.separator " PEPTIDE: ID1<tab>ID2<tab>ID3..."
|
36
|
-
op.separator ""
|
37
|
-
op.separator " Initiator Methionines - by default, will generate two peptides"
|
38
|
-
op.separator " for any peptide found at the N-termini starting with 'M'"
|
39
|
-
op.separator " (i.e., one with and one without the leading methionine)"
|
40
|
-
op.separator ""
|
41
|
-
op.on("--missed-cleavages <#{opt[:missed_cleavages]}>", Integer, "max num of missed cleavages") {|v| opt[:missed_cleavages] = v }
|
42
|
-
op.on("--min-length <#{opt[:min_length]}>", Integer, "the minimum peptide aaseq length") {|v| opt[:min_length] = v }
|
43
|
-
op.on("--no-cleaved-methionine", "does not cleave off initiator methionine") { opt[:cleave_initiator_methionine] = false }
|
44
|
-
op.on("--no-expand-x", "don't enumerate aa 'X' possibilities") { opt[:expand_aa] = nil }
|
45
|
-
op.on("-e", "--enzyme <name>", "enzyme for digestion") {|v| opt[:enzyme] = Mspire::Insilico::Digester.const_get(v.upcase) }
|
46
|
-
op.on("--list-enzymes", "lists approved enzymes and exits") do
|
47
|
-
puts Mspire::Digester::ENZYMES.keys.join("\n")
|
48
|
-
exit
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
opts.parse!(argv)
|
53
|
-
|
54
|
-
if argv.size == 0
|
55
|
-
puts opts || exit
|
56
|
-
end
|
57
|
-
|
58
|
-
argv.map do |file|
|
59
|
-
Mspire::Ident::Peptide::Db.peptide_centric_db(file, opt)
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
# writes a new file with the added 'min_aaseq<Integer>'
|
64
|
-
# creates a temporary digestion file that contains all peptides digesting
|
65
|
-
# with certain missed_cleavages (i.e., min_seq_length is not applied to
|
66
|
-
# this file but on the final peptide centric db)
|
67
|
-
# returns the full name of the written file.
|
68
|
-
def self.peptide_centric_db(fasta_file, opts={})
|
69
|
-
opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
|
70
|
-
|
71
|
-
(missed_cleavages, min_length, enzyme, id_regexp, remove_digestion_file, cleave_initiator_methionine, expand_aa) = opts.values_at(:missed_cleavages, :min_length, :enzyme, :id_regexp, :remove_digestion_file, :cleave_initiator_methionine, :expand_aa)
|
72
|
-
start_time = Time.now
|
73
|
-
print "Digesting #{fasta_file} ..." if $VERBOSE
|
74
|
-
|
75
|
-
if expand_aa
|
76
|
-
letters_to_expand_re = Regexp.new("[" << Regexp.escape(expand_aa.keys.join) << "]")
|
77
|
-
end
|
78
|
-
|
79
|
-
base = fasta_file.chomp(File.extname(fasta_file))
|
80
|
-
digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides"
|
81
|
-
File.open(digestion_file, "w") do |fh|
|
82
|
-
Mspire::Fasta.open(fasta_file) do |fasta|
|
83
|
-
fasta.each do |prot|
|
84
|
-
peptides = enzyme.digest(prot.sequence, missed_cleavages)
|
85
|
-
if (cleave_initiator_methionine && (prot.sequence[0,1] == "M"))
|
86
|
-
m_peps = []
|
87
|
-
init_methionine_peps = []
|
88
|
-
peptides.each do |pep|
|
89
|
-
# if the peptide is at the beginning of the protein sequence
|
90
|
-
if prot.sequence[0,pep.size] == pep
|
91
|
-
m_peps << pep[1..-1]
|
92
|
-
end
|
93
|
-
end
|
94
|
-
peptides.push(*m_peps)
|
95
|
-
end
|
96
|
-
if expand_aa
|
97
|
-
peptides = peptides.map do |pep|
|
98
|
-
if pep =~ letters_to_expand_re
|
99
|
-
expand_peptides(pep, expand_aa)
|
100
|
-
else
|
101
|
-
pep
|
102
|
-
end
|
103
|
-
end.flatten
|
104
|
-
end
|
105
|
-
fh.puts( prot.header.split(/\s+/).first + "\t" + peptides.join(" ") )
|
106
|
-
end
|
107
|
-
end
|
108
|
-
end
|
109
|
-
puts "#{Time.now - start_time} sec" if $VERBOSE
|
110
|
-
|
111
|
-
|
112
|
-
start_time = Time.now
|
113
|
-
print "Organizing raw digestion #{digestion_file} ..." if $VERBOSE
|
11
|
+
KEY_VALUE_DELIMITER = ': '
|
114
12
|
|
115
|
-
|
116
|
-
::IO.foreach(digestion_file) do |line|
|
117
|
-
(prot, *peps) = line.chomp!.split(/\s+/)
|
118
|
-
# prot is something like this: "sp|P31946|1433B_HUMAN" in uniprot
|
119
|
-
peps.each do |pep|
|
120
|
-
if pep.size >= min_length
|
121
|
-
hash[pep] << prot
|
122
|
-
end
|
123
|
-
end
|
124
|
-
end
|
125
|
-
puts "#{Time.now - start_time} sec" if $VERBOSE
|
126
|
-
|
127
|
-
base = digestion_file.chomp(File.extname(digestion_file))
|
128
|
-
final_outfile = base + ".min_aaseq#{min_length}" + ".yml"
|
129
|
-
|
130
|
-
start_time = Time.now
|
131
|
-
print "Writing #{hash.size} peptides to #{} ..." if $VERBOSE
|
132
|
-
|
133
|
-
File.open(final_outfile, 'w') do |out|
|
134
|
-
hash.each do |k,v|
|
135
|
-
out.puts( [k, v.join(PROTEIN_DELIMITER)].join(KEY_VALUE_DELIMITER) )
|
136
|
-
end
|
137
|
-
end
|
138
|
-
puts "#{Time.now - start_time} sec" if $VERBOSE
|
139
|
-
|
140
|
-
if remove_digestion_file
|
141
|
-
File.unlink(digestion_file)
|
142
|
-
end
|
143
|
-
File.expand_path(final_outfile)
|
144
|
-
end
|
145
|
-
|
146
|
-
# does combinatorial expansion of all letters requesting it.
|
147
|
-
# expand_aa is hash like: {'X'=>STANDARD_AA}
|
148
|
-
# returns nil if there are more than MAX_NUM_AA_EXPANSION amino acids to
|
149
|
-
# be expanded
|
150
|
-
# returns an empty array if there is no expansion
|
151
|
-
def self.expand_peptides(peptide, expand_aa)
|
152
|
-
letters_in_order = expand_aa.keys.sort
|
153
|
-
index_and_key = []
|
154
|
-
peptide.split('').each_with_index do |char,i|
|
155
|
-
if let_index = letters_in_order.index(char)
|
156
|
-
index_and_key << [i, letters_in_order[let_index]]
|
157
|
-
end
|
158
|
-
end
|
159
|
-
if index_and_key.size > MAX_NUM_AA_EXPANSION
|
160
|
-
return nil
|
161
|
-
end
|
162
|
-
to_expand = [peptide]
|
163
|
-
index_and_key.each do |i,letter|
|
164
|
-
new_peps = []
|
165
|
-
while current_pep = to_expand.shift do
|
166
|
-
new_peps << expand_aa[letter].map {|v| dp = current_pep.dup ; dp[i] = v ; dp }
|
167
|
-
end
|
168
|
-
to_expand = new_peps.flatten
|
169
|
-
end
|
170
|
-
to_expand
|
171
|
-
end
|
13
|
+
attr_accessor :data
|
172
14
|
|
173
15
|
# Loads the whole db file into memory as @data.
#
# db_file - path of the peptide-centric db file, parsed as YAML
#
# NOTE(review): YAML.load_file is used on the file as-is; whether it
# restricts the classes it will instantiate depends on the installed
# YAML library version -- only open db files from a trusted source.
def initialize(db_file)
  @data = YAML.load_file(db_file)
end
|
176
18
|
|
177
|
-
|
178
|
-
|
179
|
-
# returns the protein id's as an array
|
19
|
+
# Looks up +key+ in @data and returns its protein id's as an array (the
# stored string split on PROTEIN_DELIMITER); nil if there is no entry.
def [](key)
  entry = @data[key]
  return nil unless entry
  entry.chomp.split(PROTEIN_DELIMITER)
end
|
183
24
|
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
class IO
|
188
|
-
include Enumerable
|
189
|
-
def self.open(filename, &block)
|
190
|
-
raise ArgumentError unless block
|
191
|
-
File.open(filename) do |io|
|
192
|
-
block.call(self.new(io))
|
193
|
-
end
|
194
|
-
end
|
195
|
-
|
196
|
-
attr_accessor :io
|
197
|
-
attr_accessor :index
|
198
|
-
|
199
|
-
def initialize(io)
|
200
|
-
@io = io
|
201
|
-
@index = {}
|
202
|
-
re = /^(\w+)#{Regexp.escape(KEY_VALUE_DELIMITER)}/
|
203
|
-
prev_io_pos = io.pos
|
204
|
-
triplets = io.each_line.map do |line|
|
205
|
-
key = re.match(line)[1]
|
206
|
-
[key, prev_io_pos + key.bytesize+KEY_VALUE_DELIMITER.bytesize, prev_io_pos=io.pos]
|
207
|
-
end
|
208
|
-
triplets.each do |key, start, end_pos|
|
209
|
-
@index[key] = [start, end_pos-start]
|
210
|
-
end
|
211
|
-
end
|
212
|
-
|
213
|
-
# returns an array of proteins for the given key (peptide aaseq)
|
214
|
-
def [](key)
|
215
|
-
(start, length) = @index[key]
|
216
|
-
return nil unless start
|
217
|
-
@io.seek(start)
|
218
|
-
string = @io.read(length)
|
219
|
-
string.chomp!
|
220
|
-
string.split("\t")
|
221
|
-
end
|
222
|
-
|
223
|
-
# number of entries
|
224
|
-
def size ; @index.size end
|
225
|
-
alias_method :length, :size
|
226
|
-
|
227
|
-
def keys
|
228
|
-
@index.keys
|
229
|
-
end
|
25
|
+
# all the keys of the loaded data
def keys
  @data.keys
end
|
230
28
|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
end
|
29
|
+
# all the values of the loaded data, exactly as stored in @data
# (they are not split into arrays the way #[] splits a single entry)
def values
  @data.values
end
|
235
32
|
|
236
|
-
|
237
|
-
|
238
|
-
@index.each do |key, start_length|
|
239
|
-
block.call([key, self[key]])
|
240
|
-
end
|
241
|
-
end
|
33
|
+
# number of entries in the loaded data
def size
  @data.size
end
|
243
36
|
end
|
@@ -19,12 +19,13 @@ class Mspire::Ident::PeptideHit
|
|
19
19
|
|
20
20
|
# writes the peptide hits to a phq.tsv file. qvalues is a parallel array
|
21
21
|
# to hits that can provide qvalues if not inherent to the hits
|
22
|
-
# returns the filename.
|
22
|
+
# returns the filename. Expects each hit to implement #search_id, #id,
|
23
|
+
# #aaseq and #charge
|
23
24
|
def to_file(filename, hits, qvalues=[])
|
24
25
|
File.open(filename,'w') do |out|
|
25
26
|
out.puts HEADER.join(FILE_DELIMITER)
|
26
27
|
hits.zip(qvalues) do |hit, qvalue|
|
27
|
-
out.puts [hit.
|
28
|
+
out.puts [hit.search_id, hit.id, hit.aaseq, hit.charge, qvalue || hit.qvalue].join(FILE_DELIMITER)
|
28
29
|
end
|
29
30
|
end
|
30
31
|
filename
|