mspire 0.8.4 → 0.8.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +1 -1
- data/VERSION +1 -1
- data/lib/mspire/fasta.rb +5 -27
- data/lib/mspire/ident/peptide/db/creator.rb +248 -0
- data/lib/mspire/ident/peptide/db/io.rb +62 -0
- data/lib/mspire/ident/peptide/db.rb +18 -225
- data/lib/mspire/ident/peptide_hit/qvalue.rb +3 -2
- data/schema/peptide_hit_qvalues.pqh.tsv +5 -0
- data/script/mascot_dat_to_peptide_hit_qvalues.rb +118 -0
- data/spec/mspire/ident/peptide/db/creator_spec.rb +65 -0
- data/spec/mspire/ident/peptide/db/io_spec.rb +21 -0
- data/spec/mspire/ident/peptide/db_spec.rb +7 -97
- data/spec/testfiles/mspire/ident/peptide/db/uni_11_sp_tr.PEPTIDE_CENTRIC.yml +728 -0
- data/spec/testfiles/mspire/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -728
- metadata +9 -2
data/README.md
CHANGED
@@ -19,7 +19,7 @@ Prince JT, Marcotte EM. <b>mspire: mass spectrometry proteomics in Ruby.</b> *Bi
|
|
19
19
|
|
20
20
|
### imzml
|
21
21
|
|
22
|
-
Mspire is the *only* converter from mzml into imzml.
|
22
|
+
Mspire is the *only* commandline converter from mzml into imzml (also see [imzMLConverter](http://www.cs.bham.ac.uk/~ibs/imzMLConverter/))
|
23
23
|
|
24
24
|
* handles both processed and continuous modes
|
25
25
|
* gracefully handles SIM data
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.8.4
|
1
|
+
0.8.5
|
data/lib/mspire/fasta.rb
CHANGED
@@ -58,33 +58,11 @@ module Mspire
|
|
58
58
|
Bio::FlatFile.new(Bio::FastaFormat, io)
|
59
59
|
end
|
60
60
|
|
61
|
-
|
62
|
-
#
|
63
|
-
#
|
64
|
-
def self.
|
65
|
-
|
66
|
-
protid_to_length = {}
|
67
|
-
re = /^>([^\s]+) (.*)/
|
68
|
-
ids = []
|
69
|
-
lengths = []
|
70
|
-
current_length = nil
|
71
|
-
IO.foreach(file) do |line|
|
72
|
-
line.chomp!
|
73
|
-
if md=re.match(line)
|
74
|
-
lengths << current_length
|
75
|
-
current_id = md[1]
|
76
|
-
ids << current_id
|
77
|
-
current_length = 0
|
78
|
-
protid_to_description[current_id] = md[2]
|
79
|
-
else
|
80
|
-
current_length += line.size
|
81
|
-
end
|
82
|
-
end
|
83
|
-
lengths << current_length
|
84
|
-
lengths.shift # remove the first nil entry
|
85
|
-
[Hash[ids.zip(lengths).to_a], protid_to_description]
|
61
|
+
# takes the header string and returns the uniprot id
|
62
|
+
#
|
63
|
+
# 'sp|Q04917|1433F_HUMAN' #=> 'Q04917'
|
64
|
+
def self.uniprot_id(header)
|
65
|
+
header[/^[^\|]+\|([^\|]+)\|/, 1]
|
86
66
|
end
|
87
|
-
=end
|
88
|
-
|
89
67
|
end
|
90
68
|
end
|
@@ -0,0 +1,248 @@
|
|
1
|
+
require 'optparse'
|
2
|
+
require 'mspire/digester'
|
3
|
+
require 'mspire/fasta'
|
4
|
+
require 'mspire/ident/peptide/db'
|
5
|
+
|
6
|
+
class Mspire::Ident::Peptide::Db::Creator
|
7
|
+
MAX_NUM_AA_EXPANSION = 3
|
8
|
+
|
9
|
+
# the twenty standard amino acids
|
10
|
+
STANDARD_AA = %w(A C D E F G H I K L M N P Q R S T V W Y)
|
11
|
+
EXPAND_AA = {'X' => STANDARD_AA}
|
12
|
+
|
13
|
+
DEFAULT_PEPTIDE_CENTRIC_DB = {
|
14
|
+
missed_cleavages: 2,
|
15
|
+
min_length: 4,
|
16
|
+
enzyme: Mspire::Digester[:trypsin],
|
17
|
+
remove_digestion_file: true,
|
18
|
+
cleave_initiator_methionine: true,
|
19
|
+
expand_aa: true,
|
20
|
+
uniprot: true
|
21
|
+
}
|
22
|
+
|
23
|
+
def self.cmdline(argv)
|
24
|
+
|
25
|
+
opt = {
|
26
|
+
:remove_digestion_file => true,
|
27
|
+
:enzyme => Mspire::Digester[:trypsin]
|
28
|
+
}
|
29
|
+
opts = OptionParser.new do |op|
|
30
|
+
op.banner = "usage: #{File.basename($0)} <file>.fasta ..."
|
31
|
+
op.separator "output: "
|
32
|
+
op.separator " <file>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml"
|
33
|
+
op.separator "format:"
|
34
|
+
op.separator " PEPTIDE: ID1<tab>ID2<tab>ID3..."
|
35
|
+
op.separator ""
|
36
|
+
op.separator " Initiator Methionines - by default, will generate two peptides"
|
37
|
+
op.separator " for any peptide found at the N-termini starting with 'M'"
|
38
|
+
op.separator " (i.e., one with and one without the leading methionine)"
|
39
|
+
op.separator ""
|
40
|
+
op.on("--missed-cleavages <#{opt[:missed_cleavages]}>", Integer, "max num of missed cleavages") {|v| opt[:missed_cleavages] = v }
|
41
|
+
op.on("--min-length <#{opt[:min_length]}>", Integer, "the minimum peptide aaseq length") {|v| opt[:min_length] = v }
|
42
|
+
op.on("--no-cleaved-methionine", "does not cleave off initiator methionine") { opt[:cleave_initiator_methionine] = false }
|
43
|
+
op.on("--no-expand-x", "don't enumerate aa possibilities", "(removes these peptides)") { opt[:expand_aa] = false }
|
44
|
+
op.on("--no-uniprot", "use entire protid section of fasta header", "for non-uniprot fasta files") { opt[:uniprot] = false }
|
45
|
+
op.on("--trie", "use a trie (for very large uniprot files)", "must have fast_trie gem installed") {|v| opt[:trie] = v }
|
46
|
+
op.on("-e", "--enzyme <name>", "enzyme for digestion") {|v| opt[:enzyme] = Mspire::Insilico::Digester.const_get(v.upcase) }
|
47
|
+
op.on("--list-enzymes", "lists approved enzymes and exits") do
|
48
|
+
puts Mspire::Digester::ENZYMES.keys.join("\n")
|
49
|
+
exit
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
opts.parse!(argv)
|
54
|
+
|
55
|
+
if argv.size == 0
|
56
|
+
puts opts || exit
|
57
|
+
end
|
58
|
+
|
59
|
+
argv.map do |file|
|
60
|
+
creator = Mspire::Ident::Peptide::Db::Creator.new
|
61
|
+
creator.create(file, opt)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
# returns the name of the digestion file that was written
|
66
|
+
def create_digestion_file(fasta_file, opts={})
|
67
|
+
opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
|
68
|
+
|
69
|
+
(missed_cleavages, enzyme, cleave_initiator_methionine, expand_aa) = opts.values_at(:missed_cleavages, :enzyme, :cleave_initiator_methionine, :expand_aa)
|
70
|
+
start_time = Time.now
|
71
|
+
print "Digesting #{fasta_file} ..." if $VERBOSE
|
72
|
+
|
73
|
+
letters_to_expand_re = Regexp.new("[" << Regexp.escape(EXPAND_AA.keys.join) << "]")
|
74
|
+
|
75
|
+
base = fasta_file.chomp(File.extname(fasta_file))
|
76
|
+
digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides"
|
77
|
+
File.open(digestion_file, "w") do |fh|
|
78
|
+
Mspire::Fasta.open(fasta_file) do |fasta|
|
79
|
+
fasta.each do |prot|
|
80
|
+
peptides = enzyme.digest(prot.sequence, missed_cleavages)
|
81
|
+
if (cleave_initiator_methionine && (prot.sequence[0,1] == "M"))
|
82
|
+
m_peps = []
|
83
|
+
init_methionine_peps = []
|
84
|
+
peptides.each do |pep|
|
85
|
+
# if the peptide is at the beginning of the protein sequence
|
86
|
+
if prot.sequence[0,pep.size] == pep
|
87
|
+
m_peps << pep[1..-1]
|
88
|
+
end
|
89
|
+
end
|
90
|
+
peptides.push(*m_peps)
|
91
|
+
end
|
92
|
+
peptides =
|
93
|
+
if expand_aa
|
94
|
+
peptides.flat_map do |pep|
|
95
|
+
(pep =~ letters_to_expand_re) ? expand_peptides(pep, EXPAND_AA) : pep
|
96
|
+
end
|
97
|
+
else
|
98
|
+
peptides.map {|pep| pep =~ letters_to_expand_re }.compact
|
99
|
+
end
|
100
|
+
header = prot.header
|
101
|
+
id = opts[:uniprot] ? Mspire::Fasta.uniprot_id(header) : header.split(/\s+/).first
|
102
|
+
fh.puts( id + "\t" + peptides.join(" ") )
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
106
|
+
puts "#{Time.now - start_time} sec" if $VERBOSE
|
107
|
+
digestion_file
|
108
|
+
end
|
109
|
+
|
110
|
+
# returns the full path of the created file
|
111
|
+
def db_from_fasta_digestion_file(digestion_file, opts={})
|
112
|
+
opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
|
113
|
+
|
114
|
+
start_time = Time.now
|
115
|
+
puts "Organizing raw digestion #{digestion_file} ..." if $VERBOSE
|
116
|
+
|
117
|
+
puts "#{Time.now - start_time} sec" if $VERBOSE
|
118
|
+
hash_like = hash_like_from_digestion_file(digestion_file, opts[:min_length], opts[:trie])
|
119
|
+
|
120
|
+
base = digestion_file.chomp(File.extname(digestion_file))
|
121
|
+
final_outfile = base + ".min_aaseq#{opts[:min_length]}" + ".yml"
|
122
|
+
|
123
|
+
start_time = Time.now
|
124
|
+
print "Writing #{hash_like.size} peptides to #{} ..." if $VERBOSE
|
125
|
+
|
126
|
+
File.open(final_outfile, 'w') do |out|
|
127
|
+
hash_like.each do |k,v|
|
128
|
+
#out.puts( [k, v.join(Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER)].join(Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER) )
|
129
|
+
out.puts "#{k}#{Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER}#{v}"
|
130
|
+
end
|
131
|
+
end
|
132
|
+
puts "#{Time.now - start_time} sec" if $VERBOSE
|
133
|
+
|
134
|
+
if opts[:remove_digestion_file]
|
135
|
+
File.unlink(digestion_file)
|
136
|
+
end
|
137
|
+
File.expand_path(final_outfile)
|
138
|
+
end
|
139
|
+
|
140
|
+
def hash_like_tree
|
141
|
+
require 'trie'
|
142
|
+
trie = Trie.new
|
143
|
+
def trie.[](key)
|
144
|
+
val = self.get(key)
|
145
|
+
if val.nil?
|
146
|
+
self.add(key,"")
|
147
|
+
self.get(key)
|
148
|
+
else
|
149
|
+
val
|
150
|
+
end
|
151
|
+
end
|
152
|
+
trie
|
153
|
+
end
|
154
|
+
|
155
|
+
def hash_like_from_digestion_file(digestion_file, min_length, use_trie=false)
|
156
|
+
cnt = 0
|
157
|
+
if use_trie
|
158
|
+
raise NotImplementedError
|
159
|
+
#puts "using trie" if $VERBOSE
|
160
|
+
#trie = hash_like_tree
|
161
|
+
#line_cnt = 0
|
162
|
+
#::IO.foreach(digestion_file) do |line|
|
163
|
+
#line_cnt += 1
|
164
|
+
##puts "LINE COUND"
|
165
|
+
##p line_cnt
|
166
|
+
#(prot, *peps) = line.chomp!.split(/\s+/)
|
167
|
+
##p peps
|
168
|
+
##p peps.class
|
169
|
+
## prot is something like this: "P31946"
|
170
|
+
#puts line
|
171
|
+
#peps.each do |pep|
|
172
|
+
#if pep.size >= min_length
|
173
|
+
#to_set =
|
174
|
+
#if val = trie.get(pep)
|
175
|
+
#val + Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER + prot
|
176
|
+
#else
|
177
|
+
#prot
|
178
|
+
#end
|
179
|
+
#p to_set.size
|
180
|
+
#trie.add(pep, to_set)
|
181
|
+
#end
|
182
|
+
#end
|
183
|
+
#cnt += 1
|
184
|
+
#puts cnt if (cnt % 1000) == 0
|
185
|
+
#end
|
186
|
+
#abort "HERE"
|
187
|
+
#trie
|
188
|
+
else
|
189
|
+
hash = {}
|
190
|
+
::IO.foreach(digestion_file) do |line|
|
191
|
+
(prot, *peps) = line.chomp!.split(/\s+/)
|
192
|
+
# prot is something like this: "P31946"
|
193
|
+
peps.each do |pep|
|
194
|
+
if pep.size >= min_length
|
195
|
+
if val = hash[pep]
|
196
|
+
val << Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER << prot
|
197
|
+
else
|
198
|
+
val = prot
|
199
|
+
end
|
200
|
+
hash[pep] = val
|
201
|
+
end
|
202
|
+
end
|
203
|
+
cnt += 1
|
204
|
+
puts cnt if (cnt % 1000) == 0
|
205
|
+
end
|
206
|
+
hash
|
207
|
+
end
|
208
|
+
end
|
209
|
+
|
210
|
+
# writes a new file with the added 'min_aaseq<Integer>'
|
211
|
+
# creates a temporary digestion file that contains all peptides digesting
|
212
|
+
# with certain missed_cleavages (i.e., min_seq_length is not applied to
|
213
|
+
# this file but on the final peptide centric db)
|
214
|
+
# returns the full name of the written file.
|
215
|
+
def create(fasta_file, opts={})
|
216
|
+
opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
|
217
|
+
digestion_file = create_digestion_file(fasta_file, opts)
|
218
|
+
db_from_fasta_digestion_file(digestion_file, opts)
|
219
|
+
end
|
220
|
+
|
221
|
+
# does combinatorial expansion of all letters requesting it.
|
222
|
+
# expand_aa is hash like: {'X'=>STANDARD_AA}
|
223
|
+
# returns nil if there are more than MAX_NUM_AA_EXPANSION amino acids to
|
224
|
+
# be expanded
|
225
|
+
# returns an empty array if there is no expansion
|
226
|
+
def expand_peptides(peptide, expand_aa_hash)
|
227
|
+
letters_in_order = expand_aa_hash.keys.sort
|
228
|
+
index_and_key = []
|
229
|
+
peptide.split('').each_with_index do |char,i|
|
230
|
+
if let_index = letters_in_order.index(char)
|
231
|
+
index_and_key << [i, letters_in_order[let_index]]
|
232
|
+
end
|
233
|
+
end
|
234
|
+
if index_and_key.size > MAX_NUM_AA_EXPANSION
|
235
|
+
return nil
|
236
|
+
end
|
237
|
+
to_expand = [peptide]
|
238
|
+
index_and_key.each do |i,letter|
|
239
|
+
new_peps = []
|
240
|
+
while current_pep = to_expand.shift do
|
241
|
+
new_peps << expand_aa_hash[letter].map {|v| dp = current_pep.dup ; dp[i] = v ; dp }
|
242
|
+
end
|
243
|
+
to_expand = new_peps.flatten
|
244
|
+
end
|
245
|
+
to_expand
|
246
|
+
end
|
247
|
+
end
|
248
|
+
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'mspire/ident/peptide/db'
|
2
|
+
|
3
|
+
class Mspire::Ident::Peptide::Db::IO
|
4
|
+
# an object for on disk retrieval of db entries
|
5
|
+
# proteins are returned as an array.
|
6
|
+
# behaves like a hash once it is opened.
|
7
|
+
include Enumerable
|
8
|
+
def self.open(filename, &block)
|
9
|
+
raise ArgumentError unless block
|
10
|
+
File.open(filename) do |io|
|
11
|
+
block.call(self.new(io))
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
attr_accessor :io
|
16
|
+
attr_accessor :index
|
17
|
+
|
18
|
+
def initialize(io)
|
19
|
+
@io = io
|
20
|
+
@index = {}
|
21
|
+
re = /^(\w+)#{Regexp.escape(Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER)}/
|
22
|
+
prev_io_pos = io.pos
|
23
|
+
triplets = io.each_line.map do |line|
|
24
|
+
key = re.match(line)[1]
|
25
|
+
[key, prev_io_pos + key.bytesize+Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER.bytesize, prev_io_pos=io.pos]
|
26
|
+
end
|
27
|
+
triplets.each do |key, start, end_pos|
|
28
|
+
@index[key] = [start, end_pos-start]
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
# returns an array of proteins for the given key (peptide aaseq)
|
33
|
+
def [](key)
|
34
|
+
(start, length) = @index[key]
|
35
|
+
return nil unless start
|
36
|
+
@io.seek(start)
|
37
|
+
string = @io.read(length)
|
38
|
+
string.chomp!
|
39
|
+
string.split(Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER)
|
40
|
+
end
|
41
|
+
|
42
|
+
# number of entries
|
43
|
+
def size ; @index.size end
|
44
|
+
alias_method :length, :size
|
45
|
+
|
46
|
+
def keys
|
47
|
+
@index.keys
|
48
|
+
end
|
49
|
+
|
50
|
+
# all the protein lists
|
51
|
+
def values
|
52
|
+
keys.map {|key| self[key] }
|
53
|
+
end
|
54
|
+
|
55
|
+
# yields a pair of aaseq and protein array
|
56
|
+
def each(&block)
|
57
|
+
@index.each do |key, start_length|
|
58
|
+
block.call([key, self[key]])
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
@@ -1,243 +1,36 @@
|
|
1
|
-
require '
|
2
|
-
require 'mspire/fasta'
|
3
|
-
require 'optparse'
|
1
|
+
require 'yaml'
|
4
2
|
|
5
3
|
module Mspire ; end
|
6
4
|
module Mspire::Ident ; end
|
7
5
|
module Mspire::Ident::Peptide ; end
|
8
6
|
|
9
|
-
#
|
10
|
-
#
|
11
|
-
|
12
|
-
# methods are untested at this time and should be avoided!
|
13
|
-
class Mspire::Ident::Peptide::Db < Hash
|
14
|
-
MAX_NUM_AA_EXPANSION = 3
|
15
|
-
|
16
|
-
# the twenty standard amino acids
|
17
|
-
STANDARD_AA = %w(A C D E F G H I K L M N P Q R S T V W Y)
|
18
|
-
|
19
|
-
DEFAULT_PEPTIDE_CENTRIC_DB = {:missed_cleavages => 2, :min_length => 4, :enzyme => Mspire::Digester[:trypsin], :id_regexp => nil, :remove_digestion_file => true, :cleave_initiator_methionine => true, :expand_aa => {'X' => STANDARD_AA}}
|
20
|
-
|
7
|
+
# Very simple object for protein retrieval from a peptide-centric database
|
8
|
+
# See Mspire::Ident::Peptide::Db::IO for an on-disc version for larger files.
|
9
|
+
class Mspire::Ident::Peptide::Db
|
21
10
|
PROTEIN_DELIMITER = "\t"
|
22
|
-
KEY_VALUE_DELIMITER =
|
23
|
-
|
24
|
-
def self.cmdline(argv)
|
25
|
-
|
26
|
-
opt = {
|
27
|
-
:remove_digestion_file => true,
|
28
|
-
:enzyme => Mspire::Digester[:trypsin]
|
29
|
-
}
|
30
|
-
opts = OptionParser.new do |op|
|
31
|
-
op.banner = "usage: #{File.basename($0)} <file>.fasta ..."
|
32
|
-
op.separator "output: "
|
33
|
-
op.separator " <file>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml"
|
34
|
-
op.separator "format:"
|
35
|
-
op.separator " PEPTIDE: ID1<tab>ID2<tab>ID3..."
|
36
|
-
op.separator ""
|
37
|
-
op.separator " Initiator Methionines - by default, will generate two peptides"
|
38
|
-
op.separator " for any peptide found at the N-termini starting with 'M'"
|
39
|
-
op.separator " (i.e., one with and one without the leading methionine)"
|
40
|
-
op.separator ""
|
41
|
-
op.on("--missed-cleavages <#{opt[:missed_cleavages]}>", Integer, "max num of missed cleavages") {|v| opt[:missed_cleavages] = v }
|
42
|
-
op.on("--min-length <#{opt[:min_length]}>", Integer, "the minimum peptide aaseq length") {|v| opt[:min_length] = v }
|
43
|
-
op.on("--no-cleaved-methionine", "does not cleave off initiator methionine") { opt[:cleave_initiator_methionine] = false }
|
44
|
-
op.on("--no-expand-x", "don't enumerate aa 'X' possibilities") { opt[:expand_aa] = nil }
|
45
|
-
op.on("-e", "--enzyme <name>", "enzyme for digestion") {|v| opt[:enzyme] = Mspire::Insilico::Digester.const_get(v.upcase) }
|
46
|
-
op.on("--list-enzymes", "lists approved enzymes and exits") do
|
47
|
-
puts Mspire::Digester::ENZYMES.keys.join("\n")
|
48
|
-
exit
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
opts.parse!(argv)
|
53
|
-
|
54
|
-
if argv.size == 0
|
55
|
-
puts opts || exit
|
56
|
-
end
|
57
|
-
|
58
|
-
argv.map do |file|
|
59
|
-
Mspire::Ident::Peptide::Db.peptide_centric_db(file, opt)
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
# writes a new file with the added 'min_aaseq<Integer>'
|
64
|
-
# creates a temporary digestion file that contains all peptides digesting
|
65
|
-
# with certain missed_cleavages (i.e., min_seq_length is not applied to
|
66
|
-
# this file but on the final peptide centric db)
|
67
|
-
# returns the full name of the written file.
|
68
|
-
def self.peptide_centric_db(fasta_file, opts={})
|
69
|
-
opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
|
70
|
-
|
71
|
-
(missed_cleavages, min_length, enzyme, id_regexp, remove_digestion_file, cleave_initiator_methionine, expand_aa) = opts.values_at(:missed_cleavages, :min_length, :enzyme, :id_regexp, :remove_digestion_file, :cleave_initiator_methionine, :expand_aa)
|
72
|
-
start_time = Time.now
|
73
|
-
print "Digesting #{fasta_file} ..." if $VERBOSE
|
74
|
-
|
75
|
-
if expand_aa
|
76
|
-
letters_to_expand_re = Regexp.new("[" << Regexp.escape(expand_aa.keys.join) << "]")
|
77
|
-
end
|
78
|
-
|
79
|
-
base = fasta_file.chomp(File.extname(fasta_file))
|
80
|
-
digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides"
|
81
|
-
File.open(digestion_file, "w") do |fh|
|
82
|
-
Mspire::Fasta.open(fasta_file) do |fasta|
|
83
|
-
fasta.each do |prot|
|
84
|
-
peptides = enzyme.digest(prot.sequence, missed_cleavages)
|
85
|
-
if (cleave_initiator_methionine && (prot.sequence[0,1] == "M"))
|
86
|
-
m_peps = []
|
87
|
-
init_methionine_peps = []
|
88
|
-
peptides.each do |pep|
|
89
|
-
# if the peptide is at the beginning of the protein sequence
|
90
|
-
if prot.sequence[0,pep.size] == pep
|
91
|
-
m_peps << pep[1..-1]
|
92
|
-
end
|
93
|
-
end
|
94
|
-
peptides.push(*m_peps)
|
95
|
-
end
|
96
|
-
if expand_aa
|
97
|
-
peptides = peptides.map do |pep|
|
98
|
-
if pep =~ letters_to_expand_re
|
99
|
-
expand_peptides(pep, expand_aa)
|
100
|
-
else
|
101
|
-
pep
|
102
|
-
end
|
103
|
-
end.flatten
|
104
|
-
end
|
105
|
-
fh.puts( prot.header.split(/\s+/).first + "\t" + peptides.join(" ") )
|
106
|
-
end
|
107
|
-
end
|
108
|
-
end
|
109
|
-
puts "#{Time.now - start_time} sec" if $VERBOSE
|
110
|
-
|
111
|
-
|
112
|
-
start_time = Time.now
|
113
|
-
print "Organizing raw digestion #{digestion_file} ..." if $VERBOSE
|
11
|
+
KEY_VALUE_DELIMITER = ': '
|
114
12
|
|
115
|
-
|
116
|
-
::IO.foreach(digestion_file) do |line|
|
117
|
-
(prot, *peps) = line.chomp!.split(/\s+/)
|
118
|
-
# prot is something like this: "sp|P31946|1433B_HUMAN" in uniprot
|
119
|
-
peps.each do |pep|
|
120
|
-
if pep.size >= min_length
|
121
|
-
hash[pep] << prot
|
122
|
-
end
|
123
|
-
end
|
124
|
-
end
|
125
|
-
puts "#{Time.now - start_time} sec" if $VERBOSE
|
126
|
-
|
127
|
-
base = digestion_file.chomp(File.extname(digestion_file))
|
128
|
-
final_outfile = base + ".min_aaseq#{min_length}" + ".yml"
|
129
|
-
|
130
|
-
start_time = Time.now
|
131
|
-
print "Writing #{hash.size} peptides to #{} ..." if $VERBOSE
|
132
|
-
|
133
|
-
File.open(final_outfile, 'w') do |out|
|
134
|
-
hash.each do |k,v|
|
135
|
-
out.puts( [k, v.join(PROTEIN_DELIMITER)].join(KEY_VALUE_DELIMITER) )
|
136
|
-
end
|
137
|
-
end
|
138
|
-
puts "#{Time.now - start_time} sec" if $VERBOSE
|
139
|
-
|
140
|
-
if remove_digestion_file
|
141
|
-
File.unlink(digestion_file)
|
142
|
-
end
|
143
|
-
File.expand_path(final_outfile)
|
144
|
-
end
|
145
|
-
|
146
|
-
# does combinatorial expansion of all letters requesting it.
|
147
|
-
# expand_aa is hash like: {'X'=>STANDARD_AA}
|
148
|
-
# returns nil if there are more than MAX_NUM_AA_EXPANSION amino acids to
|
149
|
-
# be expanded
|
150
|
-
# returns an empty array if there is no expansion
|
151
|
-
def self.expand_peptides(peptide, expand_aa)
|
152
|
-
letters_in_order = expand_aa.keys.sort
|
153
|
-
index_and_key = []
|
154
|
-
peptide.split('').each_with_index do |char,i|
|
155
|
-
if let_index = letters_in_order.index(char)
|
156
|
-
index_and_key << [i, letters_in_order[let_index]]
|
157
|
-
end
|
158
|
-
end
|
159
|
-
if index_and_key.size > MAX_NUM_AA_EXPANSION
|
160
|
-
return nil
|
161
|
-
end
|
162
|
-
to_expand = [peptide]
|
163
|
-
index_and_key.each do |i,letter|
|
164
|
-
new_peps = []
|
165
|
-
while current_pep = to_expand.shift do
|
166
|
-
new_peps << expand_aa[letter].map {|v| dp = current_pep.dup ; dp[i] = v ; dp }
|
167
|
-
end
|
168
|
-
to_expand = new_peps.flatten
|
169
|
-
end
|
170
|
-
to_expand
|
171
|
-
end
|
13
|
+
attr_accessor :data
|
172
14
|
|
173
15
|
def initialize(db_file)
|
174
|
-
|
16
|
+
@data = YAML.load_file(db_file)
|
175
17
|
end
|
176
18
|
|
177
|
-
|
178
|
-
|
179
|
-
# returns the protein id's as an array
|
19
|
+
# returns protein id's as an array
|
180
20
|
def [](key)
|
181
|
-
|
21
|
+
val=@data[key]
|
22
|
+
val.chomp.split(PROTEIN_DELIMITER) if val
|
182
23
|
end
|
183
24
|
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
class IO
|
188
|
-
include Enumerable
|
189
|
-
def self.open(filename, &block)
|
190
|
-
raise ArgumentError unless block
|
191
|
-
File.open(filename) do |io|
|
192
|
-
block.call(self.new(io))
|
193
|
-
end
|
194
|
-
end
|
195
|
-
|
196
|
-
attr_accessor :io
|
197
|
-
attr_accessor :index
|
198
|
-
|
199
|
-
def initialize(io)
|
200
|
-
@io = io
|
201
|
-
@index = {}
|
202
|
-
re = /^(\w+)#{Regexp.escape(KEY_VALUE_DELIMITER)}/
|
203
|
-
prev_io_pos = io.pos
|
204
|
-
triplets = io.each_line.map do |line|
|
205
|
-
key = re.match(line)[1]
|
206
|
-
[key, prev_io_pos + key.bytesize+KEY_VALUE_DELIMITER.bytesize, prev_io_pos=io.pos]
|
207
|
-
end
|
208
|
-
triplets.each do |key, start, end_pos|
|
209
|
-
@index[key] = [start, end_pos-start]
|
210
|
-
end
|
211
|
-
end
|
212
|
-
|
213
|
-
# returns an array of proteins for the given key (peptide aaseq)
|
214
|
-
def [](key)
|
215
|
-
(start, length) = @index[key]
|
216
|
-
return nil unless start
|
217
|
-
@io.seek(start)
|
218
|
-
string = @io.read(length)
|
219
|
-
string.chomp!
|
220
|
-
string.split("\t")
|
221
|
-
end
|
222
|
-
|
223
|
-
# number of entries
|
224
|
-
def size ; @index.size end
|
225
|
-
alias_method :length, :size
|
226
|
-
|
227
|
-
def keys
|
228
|
-
@index.keys
|
229
|
-
end
|
25
|
+
def keys
|
26
|
+
@data.keys
|
27
|
+
end
|
230
28
|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
end
|
29
|
+
def values
|
30
|
+
@data.values
|
31
|
+
end
|
235
32
|
|
236
|
-
|
237
|
-
|
238
|
-
@index.each do |key, start_length|
|
239
|
-
block.call([key, self[key]])
|
240
|
-
end
|
241
|
-
end
|
33
|
+
def size
|
34
|
+
@data.size
|
242
35
|
end
|
243
36
|
end
|
@@ -19,12 +19,13 @@ class Mspire::Ident::PeptideHit
|
|
19
19
|
|
20
20
|
# writes the peptide hits to a phq.tsv file. qvalues is a parallel array
|
21
21
|
# to hits that can provide qvalues if not inherent to the hits
|
22
|
-
# returns the filename.
|
22
|
+
# returns the filename. Expects each hit to implement #search_id, #id,
|
23
|
+
# #aaseq and #charge
|
23
24
|
def to_file(filename, hits, qvalues=[])
|
24
25
|
File.open(filename,'w') do |out|
|
25
26
|
out.puts HEADER.join(FILE_DELIMITER)
|
26
27
|
hits.zip(qvalues) do |hit, qvalue|
|
27
|
-
out.puts [hit.
|
28
|
+
out.puts [hit.search_id, hit.id, hit.aaseq, hit.charge, qvalue || hit.qvalue].join(FILE_DELIMITER)
|
28
29
|
end
|
29
30
|
end
|
30
31
|
filename
|