protk 1.2.6.pre5 → 1.3.0.pre1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +84 -45
  3. data/bin/add_retention_times.rb +9 -5
  4. data/bin/augustus_to_proteindb.rb +7 -11
  5. data/bin/interprophet.rb +28 -46
  6. data/bin/make_decoy.rb +16 -48
  7. data/bin/mascot_search.rb +57 -71
  8. data/bin/mascot_to_pepxml.rb +13 -26
  9. data/bin/msgfplus_search.rb +70 -107
  10. data/bin/omssa_search.rb +52 -109
  11. data/bin/peptide_prophet.rb +44 -119
  12. data/bin/pepxml_to_table.rb +24 -27
  13. data/bin/protein_prophet.rb +22 -82
  14. data/bin/protxml_to_gff.rb +22 -519
  15. data/bin/protxml_to_table.rb +2 -16
  16. data/bin/sixframe.rb +10 -32
  17. data/bin/tandem_search.rb +30 -403
  18. data/bin/tandem_to_pepxml.rb +43 -0
  19. data/bin/unimod_to_loc.rb +1 -1
  20. data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
  21. data/ext/decoymaker/extconf.rb +3 -0
  22. data/lib/protk/constants.rb +16 -2
  23. data/lib/protk/data/default_config.yml +2 -1
  24. data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
  25. data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
  26. data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
  27. data/lib/protk/data/tandem_params.xml +17 -54
  28. data/lib/protk/fastadb.rb +2 -2
  29. data/lib/protk/prophet_tool.rb +1 -1
  30. data/lib/protk/protxml_to_gff_tool.rb +474 -0
  31. data/lib/protk/search_tool.rb +58 -103
  32. data/lib/protk/setup_rakefile.rake +9 -5
  33. data/lib/protk/tandem_search_tool.rb +256 -0
  34. data/lib/protk/tool.rb +85 -104
  35. data/lib/protk.rb +1 -6
  36. metadata +24 -103
  37. data/bin/annotate_ids.rb +0 -59
  38. data/bin/asapratio.rb +0 -27
  39. data/bin/blastxml_to_table.rb +0 -119
  40. data/bin/correct_omssa_retention_times.rb +0 -27
  41. data/bin/feature_finder.rb +0 -95
  42. data/bin/file_convert.rb +0 -164
  43. data/bin/generate_omssa_loc.rb +0 -42
  44. data/bin/gffmerge.rb +0 -208
  45. data/bin/libra.rb +0 -70
  46. data/bin/toppas_pipeline.rb +0 -84
  47. data/bin/uniprot_annotation.rb +0 -141
  48. data/bin/xls_to_table.rb +0 -52
  49. data/bin/xpress.rb +0 -27
  50. data/ext/protk/decoymaker/extconf.rb +0 -3
  51. data/ext/protk/simplealign/extconf.rb +0 -3
  52. data/lib/protk/biotools_excel_converter.rb +0 -60
  53. data/lib/protk/eupathdb_gene_information_table.rb +0 -158
  54. data/lib/protk/gapped_aligner.rb +0 -264
  55. data/lib/protk/protein_annotator.rb +0 -646
  56. data/lib/protk/spreadsheet_extensions.rb +0 -79
  57. data/lib/protk/xtandem_defaults.rb +0 -11
@@ -1,264 +0,0 @@
1
- require 'bio'
2
- require 'matrix'
3
-
4
# Simple mutable record with three readable/writable attributes:
# +start+, +end+ and +seq+. All are nil until assigned.
# NOTE(review): presumably start/end are coordinates of the fragment and seq
# is its sequence -- nothing in this file uses the class, so confirm with callers.
class PeptideFragment
  attr_accessor :start, :end, :seq
end
9
-
10
# Result of aligning a peptide to a gene (nucleotide) sequence.
#
# +trace+ is an array holding one move code per alignment step. This class
# interprets the codes as follows:
#   0        - match step (peptide aligned to the gene at this step)
#   1        - gene position skipped; rendered as "-" by #inspect
#   other    - treated as "not a match" by #fragments and #gaps and ignored
#              by #inspect
class PeptideToGeneAlignment
  attr_accessor :gene_seq, :pep_seq, :trace

  # gene    - the gene sequence that was aligned against
  # peptide - the peptide sequence that was aligned
  # trace   - array of move codes (see class comment)
  def initialize(gene, peptide, trace)
    @gene_seq = gene
    @pep_seq = peptide
    @trace = trace
  end

  # Returns a two-line textual rendering of the alignment: the gene sequence
  # on the first line and, on the second, one character per trace step
  # ("-" for a skipped gene position, the peptide residue for a match step).
  # Every residue is repeated three times so that one residue spans a codon.
  #
  # Unlike earlier versions this returns the String (as Object#inspect is
  # expected to) instead of printing it and returning nil.
  def inspect
    pep_triples = @pep_seq.each_char.map { |c| c * 3 }.join

    descr = "#{@gene_seq}\n"
    pepi = 0
    @trace.each do |move|
      if move == 1
        descr << "-"
      elsif move == 0
        # to_s keeps this safe if the trace has more match steps than residues
        descr << pep_triples[pepi].to_s
        pepi += 1
      end
    end
    descr << "\n"
  end

  # Returns the runs of consecutive match steps as [first_index, last_index]
  # pairs (both inclusive, indices into +trace+). Returns [] when the trace
  # contains no match steps.
  def fragments
    frags = []
    run_start = nil # index where the currently open run began, nil if none
    @trace.each_with_index do |move, i|
      if move == 0
        run_start ||= i
      elsif run_start
        frags << [run_start, i - 1]
        run_start = nil
      end
    end
    # A run still open at the end of the trace extends to the last step.
    frags << [run_start, @trace.length - 1] if run_start
    frags
  end

  # Returns the internal gaps: runs of non-match steps that have a match step
  # on both sides. Each gap is [first_gap_index, index_of_next_match_step], so
  # the second element is one past the last gap step. Leading and trailing
  # runs of non-match steps are not reported.
  def gaps
    gps = []
    gap_start = nil
    seen_match = false
    @trace.each_with_index do |move, i|
      if move == 0
        gps << [gap_start, i] if gap_start
        gap_start = nil
        seen_match = true
      elsif seen_match && gap_start.nil?
        gap_start = i
      end
    end
    # A gap still open here is trailing and is intentionally dropped.
    gps
  end
end
96
-
97
# Uses a dynamic programming algorithm (Smith-Waterman like) to align a
# peptide sequence to a nucleotide sequence.
#
# The aligner is intended for proteogenomics and assumes that the entire
# peptide sequence matches the DNA sequence, possibly interrupted by gaps in
# which nucleotides are skipped.
#
# The DP grid has one row per peptide residue repeated three times (so a row
# corresponds to one nucleotide of a codon) plus a boundary row, and one
# column per nucleotide plus a boundary column.
class GappedAligner
  # When true, calculate_dp dumps the score, move and frame matrices to
  # dpmatrix.csv, moves.csv and frames.csv in the current working directory.
  # This is a debugging aid and is off by default.
  attr_accessor :save_matrices

  def initialize
    @big_penalty = -1_000_000_000 # effectively forbids a move
    @gap_open_penalty = -10_000
    @gap_extend_penalty = -1
    @end_gap_penalty = 0          # skipping leading nucleotides is free
    @match_bonus = 400

    # Move codes stored in the move matrix and returned in traces.
    @match_move = 0
    @aadel_move = -1
    @nadel_move = 1

    # @triplet_offsets[j % 3][frame] is the offset from nucleotide index j to
    # the first nucleotide of the codon containing j, when codons start at
    # indices congruent to +frame+ modulo 3.
    @triplet_offsets = [[0, -2, -1], [-1, 0, -2], [-2, -1, 0]]

    @save_matrices = false
  end

  # Score for skipping a peptide position; always the prohibitive penalty.
  def aa_deletion
    @big_penalty
  end

  # Score for skipping a nucleotide. move_type is the move that led into the
  # cell being extended: continuing an existing nucleotide gap is cheap,
  # opening a new one is expensive.
  def score_na_deletion(move_type)
    move_type == @nadel_move ? @gap_extend_penalty : @gap_open_penalty
  end

  # Score for aligning peptide residue +aa+ with translated residue +na+.
  def score_match(aa, na)
    aa == na ? @match_bonus : @big_penalty
  end

  # Walks the move matrix back from cell (from_row, from_col) to cell (0, 0)
  # and returns the moves in forward order (first alignment step first).
  #
  # Implemented iteratively so that long alignments cannot exhaust the call
  # stack.
  #
  # Raises ArgumentError ("Beyond end of array") if the walk leaves the
  # matrix before reaching (0, 0). ArgumentError is used so that code which
  # rescued the error produced by the former `throw` keeps working.
  def traceback(from_row, from_col, dpmoves)
    reversed = []
    row = from_row
    col = from_col
    loop do
      move = dpmoves[row][col]
      reversed << move

      # A match steps diagonally; the two deletions step along one axis only.
      prev_row = move == @nadel_move ? row : row - 1
      prev_col = move == @aadel_move ? col : col - 1

      break if prev_row == 0 && prev_col == 0
      raise ArgumentError, "Beyond end of array" if prev_col < 0 || prev_row < 0

      row = prev_row
      col = prev_col
    end
    reversed.reverse
  end

  # Reading frame that follows +previous_frame+ when one nucleotide is skipped.
  def next_frame(previous_frame)
    (previous_frame + 1) % 3
  end

  # Translates the codon that contains nucleotide index +j+ of +gene_seq+ in
  # reading frame +frame+. Returns '-' when that codon would start before the
  # beginning of the sequence.
  def translate_na_at(j, frame, gene_seq)
    start_pos = j + @triplet_offsets[j % 3][frame]
    return '-' if start_pos < 0

    codon = gene_seq[start_pos, 3]
    # Since Ruby 3.0 slicing a String subclass yields a plain String, which
    # has no #translate; re-wrap it in that case.
    codon = Bio::Sequence::NA.new(codon) unless codon.respond_to?(:translate)
    codon.translate
  end

  # Writes +dpmatrix+ (array of rows) to "<name>.csv". The header row lists
  # the nucleotides of +gene_seq+; every following row is prefixed with the
  # corresponding character of +pep_triples+ (empty for the boundary row).
  def save_matrix(dpmatrix, pep_triples, gene_seq, name)
    # Block form guarantees the file is closed even if a write fails.
    File.open("#{name}.csv", "w") do |matfile|
      matfile.write(",,")
      gene_seq.each_char { |na| matfile.write("#{na},") }
      matfile.write("\n")
      dpmatrix.each_with_index do |row, ri|
        label = ri > 0 ? pep_triples[ri - 1] : ""
        matfile.write("#{label},")
        row.each { |cell| matfile.write("#{cell},") }
        matfile.write("\n")
      end
    end
    nil
  end

  # Fills the DP matrices for +pep_seq+ against +gene_seq+ and returns the
  # best-scoring trace (array of move codes, see #initialize) ending in the
  # last row, i.e. with the whole peptide consumed.
  #
  # Raises ArgumentError if the peptide needs more nucleotides than the gene
  # has, or (from #traceback) if no path back to the origin exists, which
  # includes the case of an empty peptide.
  def calculate_dp(pep_seq, gene_seq)
    gene_seq = Bio::Sequence::NA.new(gene_seq)
    nrow = pep_seq.length * 3 + 1
    ncol = gene_seq.length + 1

    raise ArgumentError, "Peptide sequence is longer than gene" if nrow > ncol

    # Each residue three times, so that index i-1 gives the residue for row i.
    pep_triples = pep_seq.each_char.map { |c| c * 3 }.join

    dpmatrix = Array.new(nrow) { Array.new(ncol, 0) } # best score per cell
    dpmoves  = Array.new(nrow) { Array.new(ncol, 0) } # move that produced it
    dpframes = Array.new(nrow) { Array.new(ncol, 0) } # reading frame in effect

    # Boundary conditions
    nrow.times do |i|
      dpmatrix[i][0] = aa_deletion * i
      dpmoves[i][0] = @aadel_move
    end
    ncol.times do |j|
      dpmatrix[0][j] = @end_gap_penalty * j
      dpmoves[0][j] = @nadel_move
      dpframes[0][j] = j % 3
    end
    dpmoves[0][0] = 0
    dpframes[0][0] = 0

    (1...nrow).each do |i|
      aa = pep_triples[i - 1]
      (1...ncol).each do |j|
        translated_na = translate_na_at(j - 1, dpframes[i - 1][j - 1], gene_seq)
        match = score_match(aa, translated_na) + dpmatrix[i - 1][j - 1]
        nadel = score_na_deletion(dpmoves[i][j - 1]) + dpmatrix[i][j - 1]

        if match >= nadel
          dpmatrix[i][j] = match
          dpmoves[i][j] = @match_move
          dpframes[i][j] = dpframes[i - 1][j - 1] # a match keeps the frame
        else
          dpmatrix[i][j] = nadel
          dpmoves[i][j] = @nadel_move
          dpframes[i][j] = next_frame(dpframes[i][j - 1]) # a skip shifts it
        end
      end
    end

    # Best end-point: highest score in the last row (first one on ties).
    last_row = dpmatrix[nrow - 1]
    end_j = last_row.index(last_row.max)

    if @save_matrices
      save_matrix(dpmatrix, pep_triples, gene_seq, "dpmatrix")
      save_matrix(dpmoves, pep_triples, gene_seq, "moves")
      save_matrix(dpframes, pep_triples, gene_seq, "frames")
    end

    traceback(nrow - 1, end_j, dpmoves)
  end

  # Aligns +pep_seq+ to +gene_seq+ and returns a PeptideToGeneAlignment
  # carrying both sequences and the computed trace.
  def align(pep_seq, gene_seq)
    trace = calculate_dp(pep_seq, gene_seq)
    PeptideToGeneAlignment.new(gene_seq, pep_seq, trace)
  end
end