protk 1.2.6.pre5 → 1.3.0.pre1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +84 -45
  3. data/bin/add_retention_times.rb +9 -5
  4. data/bin/augustus_to_proteindb.rb +7 -11
  5. data/bin/interprophet.rb +28 -46
  6. data/bin/make_decoy.rb +16 -48
  7. data/bin/mascot_search.rb +57 -71
  8. data/bin/mascot_to_pepxml.rb +13 -26
  9. data/bin/msgfplus_search.rb +70 -107
  10. data/bin/omssa_search.rb +52 -109
  11. data/bin/peptide_prophet.rb +44 -119
  12. data/bin/pepxml_to_table.rb +24 -27
  13. data/bin/protein_prophet.rb +22 -82
  14. data/bin/protxml_to_gff.rb +22 -519
  15. data/bin/protxml_to_table.rb +2 -16
  16. data/bin/sixframe.rb +10 -32
  17. data/bin/tandem_search.rb +30 -403
  18. data/bin/tandem_to_pepxml.rb +43 -0
  19. data/bin/unimod_to_loc.rb +1 -1
  20. data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
  21. data/ext/decoymaker/extconf.rb +3 -0
  22. data/lib/protk/constants.rb +16 -2
  23. data/lib/protk/data/default_config.yml +2 -1
  24. data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
  25. data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
  26. data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
  27. data/lib/protk/data/tandem_params.xml +17 -54
  28. data/lib/protk/fastadb.rb +2 -2
  29. data/lib/protk/prophet_tool.rb +1 -1
  30. data/lib/protk/protxml_to_gff_tool.rb +474 -0
  31. data/lib/protk/search_tool.rb +58 -103
  32. data/lib/protk/setup_rakefile.rake +9 -5
  33. data/lib/protk/tandem_search_tool.rb +256 -0
  34. data/lib/protk/tool.rb +85 -104
  35. data/lib/protk.rb +1 -6
  36. metadata +24 -103
  37. data/bin/annotate_ids.rb +0 -59
  38. data/bin/asapratio.rb +0 -27
  39. data/bin/blastxml_to_table.rb +0 -119
  40. data/bin/correct_omssa_retention_times.rb +0 -27
  41. data/bin/feature_finder.rb +0 -95
  42. data/bin/file_convert.rb +0 -164
  43. data/bin/generate_omssa_loc.rb +0 -42
  44. data/bin/gffmerge.rb +0 -208
  45. data/bin/libra.rb +0 -70
  46. data/bin/toppas_pipeline.rb +0 -84
  47. data/bin/uniprot_annotation.rb +0 -141
  48. data/bin/xls_to_table.rb +0 -52
  49. data/bin/xpress.rb +0 -27
  50. data/ext/protk/decoymaker/extconf.rb +0 -3
  51. data/ext/protk/simplealign/extconf.rb +0 -3
  52. data/lib/protk/biotools_excel_converter.rb +0 -60
  53. data/lib/protk/eupathdb_gene_information_table.rb +0 -158
  54. data/lib/protk/gapped_aligner.rb +0 -264
  55. data/lib/protk/protein_annotator.rb +0 -646
  56. data/lib/protk/spreadsheet_extensions.rb +0 -79
  57. data/lib/protk/xtandem_defaults.rb +0 -11
@@ -1,264 +0,0 @@
1
- require 'bio'
2
- require 'matrix'
3
-
4
# Plain value holder describing one piece of a peptide.
#
# The class only stores the three fields; it neither validates nor interprets
# them, so their exact meaning (for example whether +end+ is inclusive) is
# decided by whoever fills them in.
class PeptideFragment
  attr_accessor :start, :end, :seq

  # All arguments are optional, so the two-step usage
  # (+PeptideFragment.new+ followed by attribute assignment) keeps working.
  #
  # @param start  [Object, nil] value for +start+
  # @param finish [Object, nil] value for +end+ (+end+ is a reserved word and
  #   cannot be used as a parameter name)
  # @param seq    [Object, nil] value for +seq+
  def initialize(start = nil, finish = nil, seq = nil)
    @start = start
    @end = finish
    @seq = seq
  end
end
9
-
10
# Result of aligning a peptide against a nucleotide (gene) sequence.
#
# +trace+ is an array with one move code per step of the alignment path.
# Within this class a code of +0+ marks a matched position; +inspect+
# additionally renders +1+ as a skipped nucleotide, while +fragments+ and
# +gaps+ treat every non-zero code as "not matched".
class PeptideToGeneAlignment
  attr_accessor :gene_seq, :pep_seq, :trace

  # @param gene    [String] nucleotide sequence the peptide was aligned to
  # @param peptide [String] peptide sequence
  # @param trace   [Array<Integer>] move codes describing the alignment path
  def initialize(gene, peptide, trace)
    @gene_seq = gene
    @pep_seq = peptide
    @trace = trace
  end

  # Two-line picture of the alignment: the gene sequence, then a row built
  # from the trace in which each matched step shows the next character of the
  # peptide (every residue repeated three times) and each move code 1 shows
  # "-". Other move codes contribute nothing to the second row.
  #
  # Returns the text instead of printing it, as Object#inspect is expected to.
  #
  # @return [String]
  def inspect
    residues = @pep_seq.each_char.map { |aa| aa * 3 }.join
    cursor = 0
    aligned = @trace.map do |move|
      if move == 1
        '-'
      elsif move == 0
        # to_s keeps the historical behaviour of emitting nothing when the
        # trace contains more matches than there are residue characters.
        residue = residues[cursor].to_s
        cursor += 1
        residue
      else
        ''
      end
    end
    "#{@gene_seq}\n#{aligned.join}\n"
  end

  # Maximal runs of matched steps in the trace.
  #
  # @return [Array<Array(Integer, Integer)>] one [first, last] pair of trace
  #   indices per run, both inclusive; empty when the trace has no matches
  def fragments
    matched = @trace.each_index.select { |i| @trace[i] == 0 }
    # Break the matched indices wherever they stop being consecutive.
    matched.slice_when { |left, right| right != left + 1 }
           .map { |run| [run.first, run.last] }
  end

  # Unmatched stretches lying between two matched runs. Unmatched steps
  # before the first match or after the last one are not reported.
  #
  # @return [Array<Array(Integer, Integer)>] one [start, stop] pair per gap,
  #   where +start+ is the first unmatched index and +stop+ is the index of
  #   the first matched step after the gap (i.e. exclusive)
  def gaps
    fragments.each_cons(2).map do |before, after|
      [before.last + 1, after.first]
    end
  end
end
96
-
97
# Aligns a peptide sequence to a nucleotide sequence with a dynamic
# programming algorithm (Smith-Waterman like).
#
# The aligner is meant for proteogenomics, where the entire peptide is
# assumed to match the DNA sequence, possibly interrupted by gaps.
#
# Every peptide residue occupies three rows of the DP table (one per
# nucleotide of its codon); every nucleotide occupies one column. Paths are
# recorded with the move codes set in +initialize+: 0 = match,
# 1 = nucleotide skipped, -1 = peptide position skipped.
class GappedAligner
  # @param dump_matrices [Boolean] when true, +calculate_dp+ writes
  #   dpmatrix.csv, moves.csv and frames.csv to the current directory for
  #   debugging. Off by default so that aligning has no file-system side
  #   effects.
  def initialize(dump_matrices: false)
    @big_penalty = -1000000000   # mismatch / skipped peptide position
    @gap_open_penalty = -10000   # first skipped nucleotide of a gap
    @gap_extend_penalty = -1     # each further skipped nucleotide
    @end_gap_penalty = 0         # skipped nucleotides before the peptide starts
    @match_bonus = 400

    @match_move = 0
    @aadel_move = -1
    @nadel_move = 1
    # Indexed as [position % 3][frame]; added to a nucleotide position to
    # find where the codon to translate begins.
    @triplet_offsets = [[0, -2, -1], [-1, 0, -2], [-2, -1, 0]]

    @dump_matrices = dump_matrices
  end

  # @return [Integer] score for skipping one peptide position
  def aa_deletion
    @big_penalty
  end

  # Score for skipping a nucleotide, given the move that led to the cell on
  # its left: extending an existing gap is cheap, opening one is expensive.
  #
  # @param move_type [Integer] move code of the preceding cell
  # @return [Integer]
  def score_na_deletion(move_type)
    move_type == @nadel_move ? @gap_extend_penalty : @gap_open_penalty
  end

  # @param aa [String] peptide residue
  # @param na [String] translated codon
  # @return [Integer] the match bonus when equal, otherwise the big penalty
  def score_match(aa, na)
    aa == na ? @match_bonus : @big_penalty
  end

  # Follows the recorded moves from the given cell back to the table origin.
  #
  # Implemented as a loop rather than recursion so that long sequences
  # cannot exhaust the call stack.
  #
  # @param from_row [Integer] row of the end cell
  # @param from_col [Integer] column of the end cell
  # @param dpmoves  [Array<Array<Integer>>] move code stored for every cell
  # @return [Array<Integer>] move codes ordered from the origin to the end cell
  # @raise [ArgumentError] when the path leaves the table before reaching the
  #   origin
  def traceback(from_row, from_col, dpmoves)
    row = from_row
    col = from_col
    moves = []
    loop do
      move = dpmoves[row][col]
      moves << move
      # A match steps diagonally; a skipped nucleotide stays in the same row;
      # a skipped peptide position stays in the same column.
      row -= 1 unless move == @nadel_move
      col -= 1 unless move == @aadel_move
      break if row == 0 && col == 0

      raise ArgumentError, 'Beyond end of array' if col < 0 || row < 0
    end
    moves.reverse
  end

  # @param previous_frame [Integer] frame index (0, 1 or 2)
  # @return [Integer] the following frame index, wrapping after 2
  def next_frame(previous_frame)
    (previous_frame + 1) % 3
  end

  # Translates the codon selected by nucleotide position +j+ and +frame+.
  #
  # @param j [Integer] zero-based nucleotide position
  # @param frame [Integer] frame index into the triplet offset table
  # @param gene_seq [#[]] sequence whose three-character slices respond to
  #   +translate+ (a Bio::Sequence::NA when called from +calculate_dp+)
  # @return [Object] the translation, or '-' when the codon would start
  #   before the beginning of the sequence
  def translate_na_at(j, frame, gene_seq)
    start_pos = j + @triplet_offsets[j % 3][frame]
    return '-' if start_pos < 0

    gene_seq[start_pos, 3].translate
  end

  # Writes a DP table to "<name>.csv", with the nucleotides as the header row
  # and the tripled peptide characters as the first column.
  #
  # @param dpmatrix [Array<Array>] table to write
  # @param pep_triples [String] peptide with every residue repeated 3 times
  # @param gene_seq [#each_char] nucleotide sequence
  # @param name [String] file name without extension
  # @return [nil]
  def save_matrix(dpmatrix, pep_triples, gene_seq, name)
    # Block form closes the file even if a write raises.
    File.open("#{name}.csv", 'w+') do |matfile|
      matfile.write(',,')
      gene_seq.each_char { |na| matfile.write("#{na},") }
      matfile.write("\n")
      dpmatrix.each_with_index do |row, ri|
        matfile.write(ri > 0 ? "#{pep_triples[ri - 1]}," : ',')
        row.each { |col| matfile.write("#{col},") }
        matfile.write("\n")
      end
    end
    nil
  end

  # Fills the DP tables and returns the best alignment path.
  #
  # @param pep_seq [String] peptide sequence
  # @param gene_seq [String] nucleotide sequence (wrapped in
  #   Bio::Sequence::NA internally)
  # @return [Array<Integer>] move codes from the start of the gene up to the
  #   column where the best-scoring alignment ends
  # @raise [ArgumentError] when the peptide needs more nucleotides than the
  #   gene provides
  def calculate_dp(pep_seq, gene_seq)
    gene_seq = Bio::Sequence::NA.new(gene_seq)
    nrow = pep_seq.length * 3 + 1
    ncol = gene_seq.length + 1

    raise ArgumentError, 'Peptide sequence is longer than gene' if nrow > ncol

    pep_triples = pep_seq.each_char.map { |aa| aa * 3 }.join

    dpmoves  = Array.new(nrow) { Array.new(ncol, 0) }
    dpmatrix = Array.new(nrow) { Array.new(ncol, 0) }
    dpframes = Array.new(nrow) { Array.new(ncol, 0) }

    # Boundary conditions: the first column can only skip peptide positions,
    # the first row can only skip nucleotides (at the end-gap price).
    nrow.times do |i|
      dpmatrix[i][0] = aa_deletion * i
      dpmoves[i][0] = @aadel_move
    end
    ncol.times do |j|
      dpmatrix[0][j] = @end_gap_penalty * j
      dpmoves[0][j] = @nadel_move
      dpframes[0][j] = j % 3
    end
    dpmoves[0][0] = 0
    dpframes[0][0] = 0

    (1...nrow).each do |i|
      aa = pep_triples[i - 1]
      (1...ncol).each do |j|
        translated_na = translate_na_at(j - 1, dpframes[i - 1][j - 1], gene_seq)
        match = score_match(aa, translated_na) + dpmatrix[i - 1][j - 1]
        nadel = score_na_deletion(dpmoves[i][j - 1]) + dpmatrix[i][j - 1]

        # Ties go to the match.
        if match >= nadel
          dpmatrix[i][j] = match
          dpmoves[i][j] = @match_move
          dpframes[i][j] = dpframes[i - 1][j - 1]
        else
          dpmatrix[i][j] = nadel
          dpmoves[i][j] = @nadel_move
          # Skipping a nucleotide advances the frame carried by the path.
          dpframes[i][j] = next_frame(dpframes[i][j - 1])
        end
      end
    end

    # Best end point: highest score in the last row, leftmost on ties.
    final_row = dpmatrix[nrow - 1]
    end_j = final_row.index(final_row.max)

    if @dump_matrices
      save_matrix(dpmatrix, pep_triples, gene_seq, 'dpmatrix')
      save_matrix(dpmoves, pep_triples, gene_seq, 'moves')
      save_matrix(dpframes, pep_triples, gene_seq, 'frames')
    end

    traceback(nrow - 1, end_j, dpmoves)
  end

  # Aligns +pep_seq+ to +gene_seq+.
  #
  # @param pep_seq [String] peptide sequence
  # @param gene_seq [String] nucleotide sequence
  # @return [PeptideToGeneAlignment] the two sequences and the alignment trace
  # @raise [ArgumentError] see +calculate_dp+
  def align(pep_seq, gene_seq)
    trace = calculate_dp(pep_seq, gene_seq)
    PeptideToGeneAlignment.new(gene_seq, pep_seq, trace)
  end
end