protk 1.2.4 → 1.2.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -19,6 +19,11 @@ include LibXML
19
19
  tool=Tool.new([:explicit_output])
20
20
  tool.option_parser.banner = "Convert a protXML file to a tab delimited table.\n\nUsage: protxml_to_table.rb [options] file1.protXML"
21
21
 
22
+ # tool.options.proteinid_regex=".*?\|.*?\|(.*)"
23
+ # tool.option_parser.on( '--regex rexpr', 'Regex' ) do |regex|
24
+ # tool.options.proteinid_regex=regex
25
+ # end
26
+
22
27
  exit unless tool.check_options
23
28
 
24
29
  if ( ARGV[0].nil? )
@@ -48,7 +53,7 @@ end
48
53
 
49
54
 
50
55
  column_headers=[
51
- "group_number","group_probability","protein_name",
56
+ "group_number","group_probability","protein_name","protein_id","indistinguishable_proteins",
52
57
  "protein_probability","coverage","peptides",
53
58
  "num_peptides","confidence"
54
59
  ]
@@ -62,13 +67,25 @@ protein_groups.each do |protein_group|
62
67
 
63
68
  proteins=protein_group.find("./#{protxml_ns_prefix}protein", protxml_ns)
64
69
 
65
- proteins.each do |protein|
70
+ proteins.each do |protein|
71
+
72
+ indis_proteins=protein.find("./#{protxml_ns_prefix}indistinguishable_protein", protxml_ns)
73
+ indis_proteins_summary=""
74
+ indis_proteins.each { |iprot| indis_proteins_summary<<"#{iprot.attributes['protein_name']};" }
75
+
76
+ protein_id=""
77
+ if protein.attributes['protein_name'] =~ /.*?\|.*?\|(.*)/
78
+ protein_id=protein.attributes['protein_name'].match(/.*?\|.*?\|(.*)/)[1]
79
+ end
80
+
66
81
  column_values=[]
67
82
 
68
83
  column_values << protein_group.attributes['group_number']
69
84
  column_values << protein_group.attributes['probability']
70
85
 
71
86
  column_values << protein.attributes['protein_name']
87
+ column_values << protein_id
88
+ column_values << indis_proteins_summary
72
89
  column_values << protein.attributes['probability']
73
90
  column_values << protein.attributes['percent_coverage']
74
91
  column_values << protein.attributes['unique_stripped_peptides']
data/bin/sixframe.rb CHANGED
@@ -41,9 +41,11 @@ end
41
41
 
42
42
  inname=ARGV.shift
43
43
 
44
- outfile=File.open("#{inname}.translated.fasta",'w')
44
+ outfile=nil
45
45
  if ( tool.explicit_output != nil)
46
46
  outfile=File.open(tool.explicit_output,'w')
47
+ else
48
+ outfile=File.open("#{inname}.translated.fasta",'w')
47
49
  end
48
50
 
49
51
 
data/bin/tandem_search.rb CHANGED
@@ -149,6 +149,37 @@ def set_option(std_params, tandem_key, value)
149
149
  notes[0].content=value
150
150
  end
151
151
 
152
# Adds +value+ to the input note labelled +tandem_key+ in the parameter
# document.
#
# * No note with that label: a new <note type="input" label="..."> element
#   holding +value+ is appended to the first /bioml node.
# * Exactly one note: +value+ is appended to its existing content through
#   append_string (comma separated).
# * More than one note: the call is aborted with +throw+, the same error
#   style this script uses for a failed command.
#
# std_params - parameter document; must respond to #find(xpath).
# tandem_key - label of the note to create or extend.
# value      - text to store in, or append to, the note.
def append_option(std_params, tandem_key, value)
  notes = std_params.find("/bioml/note[@type=\"input\" and @label=\"#{tandem_key}\"]")

  if notes.length == 0
    node = XML::Node.new('note')
    node["type"] = "input"
    node["label"] = tandem_key
    node.content = value
    std_params.find('/bioml')[0] << node
  else
    # Zero notes is handled above, so the real constraint is "no duplicates".
    throw "At most one parameter named (#{tandem_key}) is allowed in parameter file" unless notes.length == 1
    notes[0].content = append_string(notes[0].content, value)
  end
end
165
+
166
# Merges every input note labelled +tandem_key+ into the first such note.
#
# The content of each later note is appended to the first note's content
# via append_string (comma separated) and the later note is then removed
# from the document. Nothing happens when fewer than two notes match.
#
# std_params - parameter document; must respond to #find(xpath).
# tandem_key - label shared by the notes that should be collapsed.
def collapse_keys(std_params, tandem_key)
  # Double quotes are required so that the label is interpolated into the
  # XPath expression. The result is copied into an Array so nodes can be
  # removed safely while iterating.
  mods = std_params.find("/bioml/note[@type=\"input\" and @label=\"#{tandem_key}\"]").to_a
  return if mods.length < 2

  first_mod = mods[0]
  mods[1..-1].each do |node|
    first_mod.content = append_string(first_mod.content, node.content)
    node.remove!
  end
  nil
end
174
+
175
# Joins two strings with a comma, skipping the separator when there is
# nothing to append to.
#
# first  - existing text; nil or "" means "nothing yet".
# second - text to add.
#
# Returns +second+ unchanged when +first+ is nil or empty, otherwise
# "first,second".
def append_string(first, second)
  return second if first.nil? || first.empty?

  "#{first},#{second}"
end
182
+
152
183
  def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_db,search_tool,genv)
153
184
  set_option(std_params, "protein, cleavage semi", search_tool.cleavage_semi ? "yes" : "no")
154
185
  set_option(std_params, "scoring, maximum missed cleavage sites", search_tool.missed_cleavages)
@@ -301,7 +332,11 @@ def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_d
301
332
  mods=std_params.find('/bioml/note[@type="input" and @id="methionine-oxidation-variable"]')
302
333
  mods.each{ |node| node.remove!}
303
334
  end
304
-
335
+
336
+ # Merge all remaining id-based modifications into a single modification.
337
+ collapse_keys(std_params, "residue, potential modification mass")
338
+ collapse_keys(std_params, "residue, modification mass")
339
+
305
340
  var_mods = search_tool.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }
306
341
  var_mods=var_mods.collect {|mod| decode_modification_string(mod) }
307
342
  fix_mods = search_tool.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }
@@ -313,31 +348,17 @@ def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_d
313
348
  var_mods.each do |vm|
314
349
 
315
350
  mod_type="potential modification mass"
316
- mod_type = "potential modification motif" if ( vm=~/[\[\]\(\)\{\}\!]/ )
317
- mod_id_label = "custom-variable-mod-#{mod_id.to_s}"
318
- mod_id=mod_id+1
319
- mnode=XML::Node.new('node')
320
- mnode["id"]=mod_id_label
321
- mnode["type"]="input"
322
- mnode["label"]="residue, #{mod_type}"
323
- mnode.content=vm
324
-
325
- root_bioml_node << mnode
351
+ mod_type = "potential modification motif" if motif?(vm)
352
+ label="residue, #{mod_type}"
353
+ append_option(std_params, label, vm)
326
354
  end
327
355
 
328
356
  mod_id=1
329
357
  fix_mods.each do |fm|
330
358
  mod_type="modification mass"
331
- mod_type = "modification motif" if ( fm=~/[\[\]\(\)\{\}\!]/ )
332
- mod_id_label = "custom-fixed-mod-#{mod_id.to_s}"
333
- mod_id=mod_id+1
334
- mnode=XML::Node.new('node')
335
- mnode["id"]=mod_id_label
336
- mnode["type"]="input"
337
- mnode["label"]="residue, #{mod_type}"
338
- mnode.content=fm
339
-
340
- root_bioml_node << mnode
359
+ mod_type = "modification motif" if motif?(fm)
360
+ label="residue, #{mod_type}"
361
+ append_option(std_params, label, fm)
341
362
  end
342
363
 
343
364
  #p root_bioml_node
@@ -345,6 +366,13 @@ def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_d
345
366
 
346
367
  end
347
368
 
369
# Tells whether a modification string describes a motif rather than a plain
# residue modification.
#
# A string counts as a motif when it contains any of ( ) { } ! or a paired
# set of square brackets. A lone "[" (as in "124@[", an N-terminal residue
# modification) is deliberately not treated as a motif.
#
# mod_string - the modification specification to inspect.
#
# Returns true or false.
def motif?(mod_string)
  !!(mod_string =~ /[\(\)\{\}\!]/ || mod_string =~ /\[.*\]/)
end
374
+
375
+
348
376
  def generate_taxonomy_doc(taxo_doc,current_db,search_tool)
349
377
 
350
378
  taxon_label=taxo_doc.find('/bioml/taxon')
@@ -425,8 +453,8 @@ ARGV.each do |filename|
425
453
  # Run the search
426
454
  #
427
455
  job_params= {:jobid => search_tool.jobid_from_filename(filename)}
428
- job_params[:queue]="lowmem"
429
- job_params[:vmem]="900mb"
456
+ job_params[:queue]="sixteen"
457
+ job_params[:vmem]="12gb"
430
458
  code = search_tool.run(cmd,genv,job_params,jobscript_path)
431
459
  throw "Command failed with exit code #{code}" unless code==0
432
460
  else
@@ -15,7 +15,7 @@ require 'libxml'
15
15
 
16
16
  include LibXML
17
17
 
18
- tool=Tool.new([:explicit_output, :background,:over_write])
18
+ tool=Tool.new([:background,:over_write])
19
19
  tool.option_parser.banner = "Execute a toppas pipeline with a single inputs node\n\nUsage: toppas_pipeline.rb [options] input1 input2 ..."
20
20
 
21
21
  tool.options.outdir = ""
@@ -28,6 +28,11 @@ tool.option_parser.on( '--toppas-file f',"the toppas file to run" ) do |file|
28
28
  tool.options.toppas_file = file
29
29
  end
30
30
 
31
+ tool.options.threads = "1"
32
+ tool.option_parser.on( '--threads t',"Number of threads to use" ) do |tr|
33
+ tool.options.threads=tr
34
+ end
35
+
31
36
  exit unless tool.check_options
32
37
 
33
38
  if ( ARGV[0].nil? )
@@ -67,13 +72,13 @@ throw "outdir is a required parameter" if tool.outdir==""
67
72
  throw "toppas-file is a required parameter" if tool.toppas_file==""
68
73
  throw "outdir must exist" unless Dir.exist?(tool.outdir)
69
74
 
70
- trf_path = "#{tool.toppas_file}.trf"
75
+ trf_path = "#{Pathname.new(Tempfile.new(tool.toppas_file).path).basename.to_s}.trf"
71
76
 
72
77
  generate_trf(ARGV,trf_path)
73
78
 
74
79
  cmd=""
75
80
  cmd<<"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:#{genv.openms_root}/lib;
76
- #{genv.executepipeline} -in #{Pathname.new(tool.toppas_file).realpath.to_s} -out_dir #{Pathname.new(tool.outdir).realpath.to_s} -resource_file #{Pathname.new(trf_path).realpath.to_s}"
81
+ #{genv.executepipeline} -in #{Pathname.new(tool.toppas_file).realpath.to_s} -out_dir #{Pathname.new(tool.outdir).realpath.to_s} -resource_file #{Pathname.new(trf_path).realpath.to_s} -threads #{tool.threads}"
77
82
 
78
83
  run_pipeline(genv,tool,cmd,tool.outdir,tool.jobid_from_filename(tool.toppas_file))
79
84
 
@@ -24,6 +24,11 @@ tool.option_parser.on( '--id-column num', 'Specify a column for ids (default is
24
24
  tool.options.id_column=col.to_i
25
25
  end
26
26
 
27
+ tool.options.flatfiledb="swissprot"
28
+ tool.option_parser.on( '--flatfiledb dbname', 'Specify path to a Uniprot flatfile' ) do |dbname|
29
+ tool.options.flatfiledb=dbname
30
+ end
31
+
27
32
  tool.options.fields=nil
28
33
  tool.option_parser.on( '--fields flds', 'A comma separated list of fields to extract' ) do |flds|
29
34
  tool.options.fields=flds
@@ -42,7 +47,7 @@ genv=Constants.new
42
47
 
43
48
  input_file=ARGV[0]
44
49
 
45
- swissprotdb=SwissprotDatabase.new(genv)
50
+ swissprotdb=SwissprotDatabase.new(genv,tool.flatfiledb)
46
51
 
47
52
  output_file=nil
48
53
 
@@ -1,6 +1,4 @@
1
1
  #include <ruby.h>
2
-
3
-
4
2
  /* */
5
3
  /* make_random.c - make random protein sequence database using Markov chain with transitional */
6
4
  /* probabilities from amino acid frequencies in a real database in FASTA format */
@@ -25,7 +23,8 @@
25
23
  #define MAX_SEQUENCE_LENGTH 20000
26
24
  #define MAX_LINE_LENGTH 20000 /* large enough to read in long header lines */
27
25
 
28
- static VALUE protk_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in,VALUE output_file_in,char *prefix_string_in) {
26
+
27
+ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in,VALUE output_file_in,char *prefix_string_in) {
29
28
  char *input_file = RSTRING_PTR(input_file_in);
30
29
  long sequences_to_generate = NUM2INT(db_length_in);
31
30
  char * output_file = RSTRING_PTR(output_file_in);
@@ -148,7 +147,7 @@ static VALUE protk_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in
148
147
  measured_aa_freq[a]++;
149
148
  }
150
149
  }
151
- else {a=floor(20*(float)rand()/RAND_MAX);MP[a][i]++; measured_aa_freq[a]++;} // replace B, X, Z etc. with random amino acid to preserve size distribution
150
+ else {a=floor(20*(float)rand()/RAND_MAX);MP[a][i]++; measured_aa_freq[a]++;} // replace B, X, Z etc. with random amino acid to preserve size distribution
152
151
  }
153
152
  MP[20][pl]++;
154
153
  measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
@@ -178,12 +177,12 @@ static VALUE protk_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in
178
177
  x=(double)row_sum[j]*((double)rand()/RAND_MAX);
179
178
  partial_sum=MP[0][j]; i=1;
180
179
  while (partial_sum<x) {partial_sum+=MP[i][j]; i++;}
181
- if (j>=MAX_SEQUENCE_LENGTH) i=21; /* terminate when sequence has reached MAX_SEQUENCE_LENGTH */
180
+ if (j>=MAX_SEQUENCE_LENGTH) i=21; /* terminate when sequence has reached MAX_SEQUENCE_LENGTH */
182
181
  if (i<21)
183
182
  {
184
183
  random_sequence[j]=AMINO_ACIDS[i-1];j++;generated_aa_freq[i-1]++;
185
184
  }
186
- else /* i==21, i.e. protein sequence terminated */
185
+ else /* i==21, i.e. protein sequence terminated */
187
186
  {
188
187
  k=0; generated_aa_freq[20]++; generated_pl_sum+=j;
189
188
  for(l=0;l<j;l++)
@@ -196,7 +195,7 @@ static VALUE protk_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in
196
195
  }
197
196
 
198
197
  random_sequence_output[k]='\0';
199
- if (!(k%61)) random_sequence_output[k-1]='\0'; /* remove extra newline for sequence length multiple of 60 */
198
+ if (!(k%61)) random_sequence_output[k-1]='\0'; /* remove extra newline for sequence length multiple of 60 */
200
199
  fprintf(outp,">%srp%li\n%s\n",prefix_string,protein,random_sequence_output);
201
200
  break;
202
201
  }
@@ -222,14 +221,13 @@ static VALUE protk_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in
222
221
 
223
222
  }
224
223
 
225
- /* ruby calls this to load the extension */
226
- void Init_protk(void) {
227
- /* assume we haven't yet defined Hola */
228
- VALUE klass = rb_define_class("Protk",
229
- rb_cObject);
230
224
 
231
- /* the hola_bonjour function can be called
232
- * from ruby as "Hola.bonjour" */
225
+ void Init_decoymaker(void)
226
+ {
227
+ VALUE klass = rb_define_class("Decoymaker",rb_cObject);
228
+
233
229
  rb_define_singleton_method(klass,
234
- "make_decoys", protk_make_decoys, 4);
230
+ "make_decoys", decoymaker_make_decoys, 4);
231
+
232
+
235
233
  }
@@ -0,0 +1,3 @@
1
+ require 'mkmf'
2
+
3
+ create_makefile('protk/decoymaker')
@@ -0,0 +1,3 @@
1
+ require 'mkmf'
2
+
3
+ create_makefile('protk/simplealign')
@@ -1,10 +1,10 @@
1
1
  <?xml version="1.0" encoding="ISO-8859-1"?>
2
- <PARAMETERS version="1.3" xsi:noNamespaceSchemaLocation="http://open-ms.sourceforge.net/schemas/Param_1_3.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
2
+ <PARAMETERS version="1.4" xsi:noNamespaceSchemaLocation="http://open-ms.sourceforge.net/schemas/Param_1_4.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
3
3
  <NODE name="FeatureFinderIsotopeWavelet" description="Detects two-dimensional features in LC-MS data.">
4
- <ITEM name="version" value="1.9.0" type="string" description="Version of the tool that generated this parameters file." tags="advanced" />
4
+ <ITEM name="version" value="1.10.0" type="string" description="Version of the tool that generated this parameters file." tags="advanced" />
5
5
  <NODE name="1" description="Instance &apos;1&apos; section for &apos;FeatureFinderIsotopeWavelet&apos;">
6
- <ITEM name="in" value="" type="string" description="input file" tags="input file,required" restrictions="*.mzML" />
7
- <ITEM name="out" value="" type="string" description="output file" tags="output file,required" restrictions="*.featureXML" />
6
+ <ITEM name="in" value="" type="string" description="input file" tags="input file,required" supported_formats="*.mzML" />
7
+ <ITEM name="out" value="" type="string" description="output file" tags="output file,required" supported_formats="*.featureXML" />
8
8
  <ITEM name="log" value="" type="string" description="Name of log file (created only when specified)" tags="advanced" />
9
9
  <ITEM name="debug" value="0" type="int" description="Sets the debug level" tags="advanced" />
10
10
  <ITEM name="threads" value="1" type="int" description="Sets the number of threads allowed to be used by the TOPP tool" />
@@ -12,9 +12,9 @@
12
12
  <ITEM name="test" value="false" type="string" description="Enables the test mode (needed for internal use only)" tags="advanced" restrictions="true,false" />
13
13
  <NODE name="algorithm" description="Algorithm section">
14
14
  <ITEM name="max_charge" value="3" type="int" description="The maximal charge state to be considered." restrictions="1:" />
15
- <ITEM name="intensity_threshold" value="3" type="float" description="The final threshold t&apos; is build upon the formula: t&apos; = av+t*sd, where t is the intensity_threshold, av the average intensity within the wavelet transformed signal and sd the standard deviation of the transform. If you set intensity_threshold=-1, t&apos; will be zero.#br#As the &apos;optimal&apos; value for this parameter is highly data dependent, we would recommend to start with -1, which will also extract features with very low signal-to-noise ratio. Subsequently, one might increase the threshold to find an optimized trade-off between false positives and true positives. Depending on the dynamic range of your spectra, suitable value ranges include: -1, [0:10], and if your data features even very high intensity values, t can also adopt values up to around 30. Please note that this parameter is not of an integer type, s.t. you can also use t:=0.1, e.g." />
15
+ <ITEM name="intensity_threshold" value="-1" type="float" description="The final threshold t&apos; is build upon the formula: t&apos; = av+t*sd, where t is the intensity_threshold, av the average intensity within the wavelet transformed signal and sd the standard deviation of the transform. If you set intensity_threshold=-1, t&apos; will be zero.#br#As the &apos;optimal&apos; value for this parameter is highly data dependent, we would recommend to start with -1, which will also extract features with very low signal-to-noise ratio. Subsequently, one might increase the threshold to find an optimized trade-off between false positives and true positives. Depending on the dynamic range of your spectra, suitable value ranges include: -1, [0:10], and if your data features even very high intensity values, t can also adopt values up to around 30. Please note that this parameter is not of an integer type, s.t. you can also use t:=0.1, e.g." />
16
16
  <ITEM name="intensity_type" value="ref" type="string" description="Determines the intensity type returned for the identified features. &apos;ref&apos; (default) returns the sum of the intensities of each isotopic peak within an isotope pattern. &apos;trans&apos; refers to the intensity of the monoisotopic peak within the wavelet transform. &apos;corrected&apos; refers also to the transformed intensity with an attempt to remove the effects of the convolution. While the latter ones might be preferable for qualitative analyses, &apos;ref&apos; might be the best option to obtain quantitative results. Please note that intensity values might be spoiled (in particular for the option &apos;ref&apos;), as soon as patterns overlap (see also the explanations given in the class documentation of FeatureFinderAlgorihtmIsotopeWavelet)." tags="advanced" restrictions="ref,trans,corrected" />
17
- <ITEM name="check_ppm" value="true" type="string" description="Enables/disables a ppm test vs. the averagine model, i.e. potential peptide masses are checked for plausibility. In addition, a heuristic correcting potential mass shifts induced by the wavelet is applied." tags="advanced" restrictions="true,false" />
17
+ <ITEM name="check_ppm" value="false" type="string" description="Enables/disables a ppm test vs. the averagine model, i.e. potential peptide masses are checked for plausibility. In addition, a heuristic correcting potential mass shifts induced by the wavelet is applied." tags="advanced" restrictions="true,false" />
18
18
  <ITEM name="hr_data" value="false" type="string" description="Must be true in case of high-resolution data, i.e. for spectra featuring large m/z-gaps (present in FTICR and Orbitrap data, e.g.). Please check a single MS scan out of your recording, if you are unsure." restrictions="true,false" />
19
19
  <NODE name="sweep_line" description="">
20
20
  <ITEM name="rt_votes_cutoff" value="5" type="int" description="Defines the minimum number of subsequent scans where a pattern must occur to be considered as a feature." tags="advanced" restrictions="0:" />
@@ -0,0 +1,264 @@
1
+ require 'bio'
2
+ require 'matrix'
3
+
4
# Bare record with three read/write attributes: start, end and seq.
# It has no behaviour of its own and is not referenced by the other
# classes visible in this file.
class PeptideFragment
  attr_accessor :start, :end, :seq
end
9
+
10
# Result of aligning a peptide against a nucleotide sequence.
#
# +trace+ is a list of moves, one per position of the alignment. The code in
# this class interprets 0 as "peptide position matched here" and treats every
# other value as unmatched; +inspect+ draws the value 1 as a gap ("-").
class PeptideToGeneAlignment
  attr_accessor :gene_seq
  attr_accessor :pep_seq
  attr_accessor :trace

  # gene    - nucleotide sequence the peptide was aligned to.
  # peptide - amino acid sequence that was aligned.
  # trace   - Array of moves describing the alignment.
  def initialize(gene, peptide, trace)
    @gene_seq = gene
    @pep_seq = peptide
    @trace = trace
  end

  # Renders the alignment as two lines: the gene sequence, then the peptide
  # with every residue written three times (one character per nucleotide) and
  # "-" for each move equal to 1.
  #
  # Returns the rendering as a String and has no side effects, as callers of
  # #inspect (p, irb, error messages) expect.
  def inspect
    pep_triples = @pep_seq.each_char.map { |c| c * 3 }.join

    pepi = 0
    cells = []
    @trace.each do |move|
      if move == 1
        cells << "-"
      elsif move == 0
        # to_s keeps the rendering going if the trace has more matches than
        # there are peptide positions.
        cells << pep_triples[pepi].to_s
        pepi += 1
      end
    end

    "#{@gene_seq}\n#{cells.join}\n"
  end

  # Lists the matched stretches of the trace.
  #
  # Returns an Array of [first_index, last_index] pairs (both inclusive), one
  # per maximal run of 0 moves. Returns [] when the trace has no match moves.
  def fragments
    frags = []
    in_fragment = false
    @trace.each_with_index do |move, i|
      if move == 0
        frags << [i, i] unless in_fragment # Start a fragment
        in_fragment = true
      else
        frags.last[1] = i - 1 if in_fragment # End a fragment
        in_fragment = false
      end
    end
    # A fragment still open at the end of the trace runs to the last index.
    # Tracking this with the flag (rather than a 0 sentinel) keeps a fragment
    # that genuinely ends at index 0 intact.
    frags.last[1] = @trace.length - 1 if in_fragment
    frags
  end

  # Lists the unmatched stretches that lie between two matched stretches.
  #
  # Leading and trailing unmatched runs are not reported.
  #
  # Returns an Array of [first_index, next_match_index] pairs: the first
  # element is the index where the gap starts, the second is the index of the
  # first match after the gap (i.e. exclusive, unlike #fragments).
  def gaps
    gps = []
    seen_match = false
    open_gap = nil
    @trace.each_with_index do |move, i|
      if move == 0
        seen_match = true
        if open_gap # Ending a gap
          open_gap[1] = i
          gps << open_gap
          open_gap = nil
        end
      elsif seen_match && open_gap.nil? # Starting a gap
        open_gap = [i, 0]
      end
    end
    # A gap that is still open here is trailing and therefore dropped.
    gps
  end

end
96
+
97
# Uses a dynamic programming algorithm (Smith-Waterman like) to align a
# peptide sequence to a nucleotide sequence.
#
# This aligner is meant for proteogenomics and assumes that the entire
# peptide sequence matches the DNA sequence, possibly interrupted by gaps in
# the DNA. Mismatches and skipped amino acids are scored with a prohibitive
# penalty, so only nucleotide gaps are realistically chosen.
#
class GappedAligner

  # save_debug_matrices - when true, calculate_dp dumps its score, move and
  #                       frame matrices to dpmatrix.csv, moves.csv and
  #                       frames.csv in the current directory. Off by default
  #                       so that ordinary alignments do not write files.
  def initialize(save_debug_matrices = false)
    @big_penalty = -1000000000  # score for a mismatch or a skipped amino acid
    @gap_open_penalty = -10000  # first skipped nucleotide of a gap
    @gap_extend_penalty = -1    # each further skipped nucleotide
    @end_gap_penalty = 0        # per-nucleotide score along the first matrix row
    @match_bonus = 400          # translated codon equals the amino acid

    # Values stored in the move matrix and returned in traces.
    @match_move = 0
    @aadel_move = -1
    @nadel_move = 1
    # Indexed as [position % 3][frame]; gives the offset from a nucleotide
    # position to the start of the codon that is translated for it.
    @triplet_offsets = [[0, -2, -1], [-1, 0, -2], [-2, -1, 0]]

    @save_debug_matrices = save_debug_matrices
  end

  # Score for skipping one peptide position.
  def aa_deletion
    @big_penalty
  end

  # Score for skipping one nucleotide.
  #
  # move_type - the move recorded for the cell to the left; extending an
  #             existing nucleotide gap is cheaper than opening a new one.
  def score_na_deletion(move_type)
    move_type == @nadel_move ? @gap_extend_penalty : @gap_open_penalty
  end

  # Score for pairing amino acid +aa+ with translated residue +na+: the match
  # bonus when they are equal, the prohibitive penalty otherwise.
  def score_match(aa, na)
    aa == na ? @match_bonus : @big_penalty
  end

  # Walks the move matrix back from cell (from_row, from_col) to cell (0, 0).
  #
  # Implemented as a loop rather than recursion so that long traces do not
  # exhaust the call stack.
  #
  # Returns the moves along the path, ordered from the start of the alignment
  # to (from_row, from_col).
  # Raises ArgumentError if the path leaves the matrix before reaching (0, 0).
  def traceback(from_row, from_col, dpmoves)
    row = from_row
    col = from_col
    moves = []

    loop do
      move = dpmoves[row][col]
      moves << move
      # A match steps diagonally; an amino acid deletion only changes the
      # row; a nucleotide deletion only changes the column.
      row -= 1 unless move == @nadel_move
      col -= 1 unless move == @aadel_move

      break if row == 0 && col == 0
      raise ArgumentError, "Beyond end of array" if row < 0 || col < 0
    end

    moves.reverse
  end

  # Reading frame that follows +previous_frame+ (cycles 0, 1, 2).
  def next_frame(previous_frame)
    (previous_frame + 1) % 3
  end

  # Translates the codon associated with nucleotide position +j+ in the given
  # reading +frame+.
  #
  # gene_seq - must support gene_seq[start, 3].translate (calculate_dp passes
  #            a Bio::Sequence::NA).
  #
  # Returns '-' when the codon would start before the sequence begins.
  def translate_na_at(j, frame, gene_seq)
    start_pos = j + @triplet_offsets[j % 3][frame]
    return '-' if start_pos < 0

    gene_seq[start_pos, 3].translate
  end

  # Writes a matrix to "<name>.csv" with the gene sequence as the header row
  # and the tripled peptide as the first column. Debugging aid.
  def save_matrix(dpmatrix, pep_triples, gene_seq, name)
    # Block form guarantees the file is closed even if a write fails.
    File.open("#{name}.csv", "w+") do |matfile|
      matfile.write(",,")
      gene_seq.each_char { |na| matfile.write("#{na},") }
      matfile.write("\n")
      dpmatrix.each_with_index do |row, ri|
        row_label = ri > 0 ? pep_triples[ri - 1] : ""
        matfile.write("#{row_label},")
        row.each { |cell| matfile.write("#{cell},") }
        matfile.write("\n")
      end
    end
    nil
  end

  # Fills the dynamic programming matrices for +pep_seq+ against +gene_seq+
  # and returns the trace of the best-scoring alignment.
  #
  # Rows correspond to peptide positions, each amino acid repeated three
  # times so that one row lines up with one nucleotide; columns correspond to
  # nucleotides.
  #
  # Raises ArgumentError when the tripled peptide is longer than the gene.
  # NOTE(review): an empty peptide ends up in traceback at cell (0, 0) and is
  # rejected there with "Beyond end of array".
  def calculate_dp(pep_seq, gene_seq)
    gene_seq = Bio::Sequence::NA.new(gene_seq)
    nrow = pep_seq.length * 3 + 1
    ncol = gene_seq.length + 1

    raise ArgumentError, "Peptide sequence is longer than gene" if nrow > ncol

    pep_triples = pep_seq.each_char.map { |c| c * 3 }.join

    dpmoves  = Array.new(nrow) { Array.new(ncol, 0) }
    dpmatrix = Array.new(nrow) { Array.new(ncol, 0) }
    dpframes = Array.new(nrow) { Array.new(ncol, 0) }

    # Boundary conditions
    nrow.times do |i|
      dpmatrix[i][0] = aa_deletion * i
      dpmoves[i][0] = @aadel_move
    end
    ncol.times do |j|
      dpmatrix[0][j] = @end_gap_penalty * j
      dpmoves[0][j] = @nadel_move
      dpframes[0][j] = j % 3
    end
    dpmoves[0][0] = 0
    dpframes[0][0] = 0

    (1..(nrow - 1)).each do |i|
      aa = pep_triples[i - 1]

      (1..(ncol - 1)).each do |j|
        translated_na = translate_na_at(j - 1, dpframes[i - 1][j - 1], gene_seq)

        match = score_match(aa, translated_na) + dpmatrix[i - 1][j - 1]
        nadel = score_na_deletion(dpmoves[i][j - 1]) + dpmatrix[i][j - 1]

        if match >= nadel
          dpmatrix[i][j] = match
          dpmoves[i][j] = @match_move
          dpframes[i][j] = dpframes[i - 1][j - 1]
        else
          dpmatrix[i][j] = nadel
          dpmoves[i][j] = @nadel_move
          dpframes[i][j] = next_frame(dpframes[i][j - 1])
        end
      end
    end

    # Find best end-point: the first column holding the maximum of the last row
    end_score = dpmatrix[nrow - 1].max
    end_j = dpmatrix[nrow - 1].index(end_score)

    if @save_debug_matrices
      save_matrix(dpmatrix, pep_triples, gene_seq, "dpmatrix")
      save_matrix(dpmoves, pep_triples, gene_seq, "moves")
      save_matrix(dpframes, pep_triples, gene_seq, "frames")
    end

    traceback(nrow - 1, end_j, dpmoves)
  end

  # Aligns +pep_seq+ to +gene_seq+.
  #
  # Returns a PeptideToGeneAlignment holding both sequences and the trace.
  def align(pep_seq, gene_seq)
    trace = calculate_dp(pep_seq, gene_seq)
    PeptideToGeneAlignment.new(gene_seq, pep_seq, trace)
  end

end