protk 1.2.4 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,6 +19,11 @@ include LibXML
19
19
  tool=Tool.new([:explicit_output])
20
20
  tool.option_parser.banner = "Convert a protXML file to a tab delimited table.\n\nUsage: protxml_to_table.rb [options] file1.protXML"
21
21
 
22
+ # tool.options.proteinid_regex=".*?\|.*?\|(.*)"
23
+ # tool.option_parser.on( '--regex rexpr', 'Regex' ) do |regex|
24
+ # tool.options.proteinid_regex=regex
25
+ # end
26
+
22
27
  exit unless tool.check_options
23
28
 
24
29
  if ( ARGV[0].nil? )
@@ -48,7 +53,7 @@ end
48
53
 
49
54
 
50
55
  column_headers=[
51
- "group_number","group_probability","protein_name",
56
+ "group_number","group_probability","protein_name","protein_id","indistinguishable_proteins",
52
57
  "protein_probability","coverage","peptides",
53
58
  "num_peptides","confidence"
54
59
  ]
@@ -62,13 +67,25 @@ protein_groups.each do |protein_group|
62
67
 
63
68
  proteins=protein_group.find("./#{protxml_ns_prefix}protein", protxml_ns)
64
69
 
65
- proteins.each do |protein|
70
+ proteins.each do |protein|
71
+
72
+ indis_proteins=protein.find("./#{protxml_ns_prefix}indistinguishable_protein", protxml_ns)
73
+ indis_proteins_summary=""
74
+ indis_proteins.each { |iprot| indis_proteins_summary<<"#{iprot.attributes['protein_name']};" }
75
+
76
+ protein_id=""
77
+ if protein.attributes['protein_name'] =~ /.*?\|.*?\|(.*)/
78
+ protein_id=protein.attributes['protein_name'].match(/.*?\|.*?\|(.*)/)[1]
79
+ end
80
+
66
81
  column_values=[]
67
82
 
68
83
  column_values << protein_group.attributes['group_number']
69
84
  column_values << protein_group.attributes['probability']
70
85
 
71
86
  column_values << protein.attributes['protein_name']
87
+ column_values << protein_id
88
+ column_values << indis_proteins_summary
72
89
  column_values << protein.attributes['probability']
73
90
  column_values << protein.attributes['percent_coverage']
74
91
  column_values << protein.attributes['unique_stripped_peptides']
data/bin/sixframe.rb CHANGED
@@ -41,9 +41,11 @@ end
41
41
 
42
42
  inname=ARGV.shift
43
43
 
44
- outfile=File.open("#{inname}.translated.fasta",'w')
44
+ outfile=nil
45
45
  if ( tool.explicit_output != nil)
46
46
  outfile=File.open(tool.explicit_output,'w')
47
+ else
48
+ outfile=File.open("#{inname}.translated.fasta",'w')
47
49
  end
48
50
 
49
51
 
data/bin/tandem_search.rb CHANGED
@@ -149,6 +149,37 @@ def set_option(std_params, tandem_key, value)
149
149
  notes[0].content=value
150
150
  end
151
151
 
152
# Adds +value+ to the X!Tandem input note labelled +tandem_key+.
#
# If no such note exists under /bioml, a new <note type="input"> is created
# with +value+ as its content. If exactly one exists, +value+ is appended to
# its content using append_string. More than one matching note is an error.
#
# std_params - parsed parameter document (responds to +find+ with an XPath)
# tandem_key - label of the note to create or extend
# value      - text to store or append
def append_option(std_params, tandem_key, value)
  xpath = "/bioml/note[@type=\"input\" and @label=\"#{tandem_key}\"]"
  existing = std_params.find(xpath)

  if existing.length > 0
    throw "Exactly one parameter named (#{tandem_key}) is required in parameter file" unless existing.length==1
    target = existing[0]
    target.content = append_string(target.content, value)
  else
    # No note with this label yet: build one and attach it to the root.
    fresh = XML::Node.new('note')
    fresh["type"] = "input"
    fresh["label"] = tandem_key
    fresh.content = value
    std_params.find('/bioml')[0] << fresh
  end
end
165
+
166
# Merges every input note labelled +tandem_key+ into the first such note.
#
# The contents of the second and later matching notes are appended to the
# first one (via append_string) and those notes are then removed from the
# document. With zero or one matching note the document is left untouched.
#
# std_params - parsed parameter document (responds to +find+ with an XPath)
# tandem_key - label of the notes to collapse
#
# Returns nil.
def collapse_keys(std_params, tandem_key)
  # Double quotes are required so that tandem_key is interpolated into the
  # XPath; to_a snapshots the result so nodes can be removed while iterating.
  mods = std_params.find("/bioml/note[@type=\"input\" and @label=\"#{tandem_key}\"]").to_a
  return nil if mods.length < 2

  first_mod, *rest_mods = mods
  rest_mods.each do |node|
    first_mod.content = append_string(first_mod.content, node.content)
    node.remove!
  end
  nil
end
174
+
175
# Joins two strings with a comma, unless +first+ is empty, in which case
# +second+ is returned on its own.
def append_string(first, second)
  return second if first.empty?

  "#{first},#{second}"
end
182
+
152
183
  def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_db,search_tool,genv)
153
184
  set_option(std_params, "protein, cleavage semi", search_tool.cleavage_semi ? "yes" : "no")
154
185
  set_option(std_params, "scoring, maximum missed cleavage sites", search_tool.missed_cleavages)
@@ -301,7 +332,11 @@ def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_d
301
332
  mods=std_params.find('/bioml/note[@type="input" and @id="methionine-oxidation-variable"]')
302
333
  mods.each{ |node| node.remove!}
303
334
  end
304
-
335
+
336
+ # Merge all remaining id-based modifications into a single modification.
337
+ collapse_keys(std_params, "residue, potential modification mass")
338
+ collapse_keys(std_params, "residue, modification mass")
339
+
305
340
  var_mods = search_tool.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }
306
341
  var_mods=var_mods.collect {|mod| decode_modification_string(mod) }
307
342
  fix_mods = search_tool.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }
@@ -313,31 +348,17 @@ def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_d
313
348
  var_mods.each do |vm|
314
349
 
315
350
  mod_type="potential modification mass"
316
- mod_type = "potential modification motif" if ( vm=~/[\[\]\(\)\{\}\!]/ )
317
- mod_id_label = "custom-variable-mod-#{mod_id.to_s}"
318
- mod_id=mod_id+1
319
- mnode=XML::Node.new('node')
320
- mnode["id"]=mod_id_label
321
- mnode["type"]="input"
322
- mnode["label"]="residue, #{mod_type}"
323
- mnode.content=vm
324
-
325
- root_bioml_node << mnode
351
+ mod_type = "potential modification motif" if motif?(vm)
352
+ label="residue, #{mod_type}"
353
+ append_option(std_params, label, vm)
326
354
  end
327
355
 
328
356
  mod_id=1
329
357
  fix_mods.each do |fm|
330
358
  mod_type="modification mass"
331
- mod_type = "modification motif" if ( fm=~/[\[\]\(\)\{\}\!]/ )
332
- mod_id_label = "custom-fixed-mod-#{mod_id.to_s}"
333
- mod_id=mod_id+1
334
- mnode=XML::Node.new('node')
335
- mnode["id"]=mod_id_label
336
- mnode["type"]="input"
337
- mnode["label"]="residue, #{mod_type}"
338
- mnode.content=fm
339
-
340
- root_bioml_node << mnode
359
+ mod_type = "modification motif" if motif?(fm)
360
+ label="residue, #{mod_type}"
361
+ append_option(std_params, label, fm)
341
362
  end
342
363
 
343
364
  #p root_bioml_node
@@ -345,6 +366,13 @@ def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_d
345
366
 
346
367
  end
347
368
 
369
# Tells whether a modification string describes a motif.
#
# A string counts as a motif when it contains any of ( ) { } ! or a paired
# set of square brackets. A lone "[" (as in "124@[", an N-term residue
# modification) does not count.
#
# Returns the index of the first match (truthy) or nil.
def motif?(mod_string)
  special_char_at = mod_string =~ /[\(\)\{\}\!]/
  return special_char_at if special_char_at

  mod_string =~ /\[.*\]/
end
374
+
375
+
348
376
  def generate_taxonomy_doc(taxo_doc,current_db,search_tool)
349
377
 
350
378
  taxon_label=taxo_doc.find('/bioml/taxon')
@@ -425,8 +453,8 @@ ARGV.each do |filename|
425
453
  # Run the search
426
454
  #
427
455
  job_params= {:jobid => search_tool.jobid_from_filename(filename)}
428
- job_params[:queue]="lowmem"
429
- job_params[:vmem]="900mb"
456
+ job_params[:queue]="sixteen"
457
+ job_params[:vmem]="12gb"
430
458
  code = search_tool.run(cmd,genv,job_params,jobscript_path)
431
459
  throw "Command failed with exit code #{code}" unless code==0
432
460
  else
@@ -15,7 +15,7 @@ require 'libxml'
15
15
 
16
16
  include LibXML
17
17
 
18
- tool=Tool.new([:explicit_output, :background,:over_write])
18
+ tool=Tool.new([:background,:over_write])
19
19
  tool.option_parser.banner = "Execute a toppas pipeline with a single inputs node\n\nUsage: toppas_pipeline.rb [options] input1 input2 ..."
20
20
 
21
21
  tool.options.outdir = ""
@@ -28,6 +28,11 @@ tool.option_parser.on( '--toppas-file f',"the toppas file to run" ) do |file|
28
28
  tool.options.toppas_file = file
29
29
  end
30
30
 
31
+ tool.options.threads = "1"
32
+ tool.option_parser.on( '--threads t',"Number of threads to use" ) do |tr|
33
+ tool.options.threads=tr
34
+ end
35
+
31
36
  exit unless tool.check_options
32
37
 
33
38
  if ( ARGV[0].nil? )
@@ -67,13 +72,13 @@ throw "outdir is a required parameter" if tool.outdir==""
67
72
  throw "toppas-file is a required parameter" if tool.toppas_file==""
68
73
  throw "outdir must exist" unless Dir.exist?(tool.outdir)
69
74
 
70
- trf_path = "#{tool.toppas_file}.trf"
75
+ trf_path = "#{Pathname.new(Tempfile.new(tool.toppas_file).path).basename.to_s}.trf"
71
76
 
72
77
  generate_trf(ARGV,trf_path)
73
78
 
74
79
  cmd=""
75
80
  cmd<<"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:#{genv.openms_root}/lib;
76
- #{genv.executepipeline} -in #{Pathname.new(tool.toppas_file).realpath.to_s} -out_dir #{Pathname.new(tool.outdir).realpath.to_s} -resource_file #{Pathname.new(trf_path).realpath.to_s}"
81
+ #{genv.executepipeline} -in #{Pathname.new(tool.toppas_file).realpath.to_s} -out_dir #{Pathname.new(tool.outdir).realpath.to_s} -resource_file #{Pathname.new(trf_path).realpath.to_s} -threads #{tool.threads}"
77
82
 
78
83
  run_pipeline(genv,tool,cmd,tool.outdir,tool.jobid_from_filename(tool.toppas_file))
79
84
 
@@ -24,6 +24,11 @@ tool.option_parser.on( '--id-column num', 'Specify a column for ids (default is
24
24
  tool.options.id_column=col.to_i
25
25
  end
26
26
 
27
+ tool.options.flatfiledb="swissprot"
28
+ tool.option_parser.on( '--flatfiledb dbname', 'Specify path to a Uniprot flatfile' ) do |dbname|
29
+ tool.options.flatfiledb=dbname
30
+ end
31
+
27
32
  tool.options.fields=nil
28
33
  tool.option_parser.on( '--fields flds', 'A comma separated list of fields to extract' ) do |flds|
29
34
  tool.options.fields=flds
@@ -42,7 +47,7 @@ genv=Constants.new
42
47
 
43
48
  input_file=ARGV[0]
44
49
 
45
- swissprotdb=SwissprotDatabase.new(genv)
50
+ swissprotdb=SwissprotDatabase.new(genv,tool.flatfiledb)
46
51
 
47
52
  output_file=nil
48
53
 
@@ -1,6 +1,4 @@
1
1
  #include <ruby.h>
2
-
3
-
4
2
  /* */
5
3
  /* make_random.c - make random protein sequence database using Markov chain with transitional */
6
4
  /* probabilities from amino acid frequencies in a real database in FASTA format */
@@ -25,7 +23,8 @@
25
23
  #define MAX_SEQUENCE_LENGTH 20000
26
24
  #define MAX_LINE_LENGTH 20000 /* large enough to read in long header lines */
27
25
 
28
- static VALUE protk_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in,VALUE output_file_in,char *prefix_string_in) {
26
+
27
+ static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in,VALUE output_file_in,char *prefix_string_in) {
29
28
  char *input_file = RSTRING_PTR(input_file_in);
30
29
  long sequences_to_generate = NUM2INT(db_length_in);
31
30
  char * output_file = RSTRING_PTR(output_file_in);
@@ -148,7 +147,7 @@ static VALUE protk_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in
148
147
  measured_aa_freq[a]++;
149
148
  }
150
149
  }
151
- else {a=floor(20*(float)rand()/RAND_MAX);MP[a][i]++; measured_aa_freq[a]++;} // replace B, X, Z etc. with random amino acid to preserve size distribution
150
+ else {a=floor(20*(float)rand()/RAND_MAX);MP[a][i]++; measured_aa_freq[a]++;} // replace B, X, Z etc. with random amino acid to preserve size distribution
152
151
  }
153
152
  MP[20][pl]++;
154
153
  measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
@@ -178,12 +177,12 @@ static VALUE protk_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in
178
177
  x=(double)row_sum[j]*((double)rand()/RAND_MAX);
179
178
  partial_sum=MP[0][j]; i=1;
180
179
  while (partial_sum<x) {partial_sum+=MP[i][j]; i++;}
181
- if (j>=MAX_SEQUENCE_LENGTH) i=21; /* terminate when sequence has reached MAX_SEQUENCE_LENGTH */
180
+ if (j>=MAX_SEQUENCE_LENGTH) i=21; /* terminate when sequence has reached MAX_SEQUENCE_LENGTH */
182
181
  if (i<21)
183
182
  {
184
183
  random_sequence[j]=AMINO_ACIDS[i-1];j++;generated_aa_freq[i-1]++;
185
184
  }
186
- else /* i==21, i.e. protein sequence terminated */
185
+ else /* i==21, i.e. protein sequence terminated */
187
186
  {
188
187
  k=0; generated_aa_freq[20]++; generated_pl_sum+=j;
189
188
  for(l=0;l<j;l++)
@@ -196,7 +195,7 @@ static VALUE protk_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in
196
195
  }
197
196
 
198
197
  random_sequence_output[k]='\0';
199
- if (!(k%61)) random_sequence_output[k-1]='\0'; /* remove extra newline for sequence length multiple of 60 */
198
+ if (!(k%61)) random_sequence_output[k-1]='\0'; /* remove extra newline for sequence length multiple of 60 */
200
199
  fprintf(outp,">%srp%li\n%s\n",prefix_string,protein,random_sequence_output);
201
200
  break;
202
201
  }
@@ -222,14 +221,13 @@ static VALUE protk_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in
222
221
 
223
222
  }
224
223
 
225
- /* ruby calls this to load the extension */
226
- void Init_protk(void) {
227
- /* assume we haven't yet defined Hola */
228
- VALUE klass = rb_define_class("Protk",
229
- rb_cObject);
230
224
 
231
- /* the hola_bonjour function can be called
232
- * from ruby as "Hola.bonjour" */
225
+ void Init_decoymaker(void)
226
+ {
227
+ VALUE klass = rb_define_class("Decoymaker",rb_cObject);
228
+
233
229
  rb_define_singleton_method(klass,
234
- "make_decoys", protk_make_decoys, 4);
230
+ "make_decoys", decoymaker_make_decoys, 4);
231
+
232
+
235
233
  }
@@ -0,0 +1,3 @@
1
+ require 'mkmf'
2
+
3
+ create_makefile('protk/decoymaker')
@@ -0,0 +1,3 @@
1
+ require 'mkmf'
2
+
3
+ create_makefile('protk/simplealign')
@@ -1,10 +1,10 @@
1
1
  <?xml version="1.0" encoding="ISO-8859-1"?>
2
- <PARAMETERS version="1.3" xsi:noNamespaceSchemaLocation="http://open-ms.sourceforge.net/schemas/Param_1_3.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
2
+ <PARAMETERS version="1.4" xsi:noNamespaceSchemaLocation="http://open-ms.sourceforge.net/schemas/Param_1_4.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
3
3
  <NODE name="FeatureFinderIsotopeWavelet" description="Detects two-dimensional features in LC-MS data.">
4
- <ITEM name="version" value="1.9.0" type="string" description="Version of the tool that generated this parameters file." tags="advanced" />
4
+ <ITEM name="version" value="1.10.0" type="string" description="Version of the tool that generated this parameters file." tags="advanced" />
5
5
  <NODE name="1" description="Instance &apos;1&apos; section for &apos;FeatureFinderIsotopeWavelet&apos;">
6
- <ITEM name="in" value="" type="string" description="input file" tags="input file,required" restrictions="*.mzML" />
7
- <ITEM name="out" value="" type="string" description="output file" tags="output file,required" restrictions="*.featureXML" />
6
+ <ITEM name="in" value="" type="string" description="input file" tags="input file,required" supported_formats="*.mzML" />
7
+ <ITEM name="out" value="" type="string" description="output file" tags="output file,required" supported_formats="*.featureXML" />
8
8
  <ITEM name="log" value="" type="string" description="Name of log file (created only when specified)" tags="advanced" />
9
9
  <ITEM name="debug" value="0" type="int" description="Sets the debug level" tags="advanced" />
10
10
  <ITEM name="threads" value="1" type="int" description="Sets the number of threads allowed to be used by the TOPP tool" />
@@ -12,9 +12,9 @@
12
12
  <ITEM name="test" value="false" type="string" description="Enables the test mode (needed for internal use only)" tags="advanced" restrictions="true,false" />
13
13
  <NODE name="algorithm" description="Algorithm section">
14
14
  <ITEM name="max_charge" value="3" type="int" description="The maximal charge state to be considered." restrictions="1:" />
15
- <ITEM name="intensity_threshold" value="3" type="float" description="The final threshold t&apos; is build upon the formula: t&apos; = av+t*sd, where t is the intensity_threshold, av the average intensity within the wavelet transformed signal and sd the standard deviation of the transform. If you set intensity_threshold=-1, t&apos; will be zero.#br#As the &apos;optimal&apos; value for this parameter is highly data dependent, we would recommend to start with -1, which will also extract features with very low signal-to-noise ratio. Subsequently, one might increase the threshold to find an optimized trade-off between false positives and true positives. Depending on the dynamic range of your spectra, suitable value ranges include: -1, [0:10], and if your data features even very high intensity values, t can also adopt values up to around 30. Please note that this parameter is not of an integer type, s.t. you can also use t:=0.1, e.g." />
15
+ <ITEM name="intensity_threshold" value="-1" type="float" description="The final threshold t&apos; is build upon the formula: t&apos; = av+t*sd, where t is the intensity_threshold, av the average intensity within the wavelet transformed signal and sd the standard deviation of the transform. If you set intensity_threshold=-1, t&apos; will be zero.#br#As the &apos;optimal&apos; value for this parameter is highly data dependent, we would recommend to start with -1, which will also extract features with very low signal-to-noise ratio. Subsequently, one might increase the threshold to find an optimized trade-off between false positives and true positives. Depending on the dynamic range of your spectra, suitable value ranges include: -1, [0:10], and if your data features even very high intensity values, t can also adopt values up to around 30. Please note that this parameter is not of an integer type, s.t. you can also use t:=0.1, e.g." />
16
16
  <ITEM name="intensity_type" value="ref" type="string" description="Determines the intensity type returned for the identified features. &apos;ref&apos; (default) returns the sum of the intensities of each isotopic peak within an isotope pattern. &apos;trans&apos; refers to the intensity of the monoisotopic peak within the wavelet transform. &apos;corrected&apos; refers also to the transformed intensity with an attempt to remove the effects of the convolution. While the latter ones might be preferable for qualitative analyses, &apos;ref&apos; might be the best option to obtain quantitative results. Please note that intensity values might be spoiled (in particular for the option &apos;ref&apos;), as soon as patterns overlap (see also the explanations given in the class documentation of FeatureFinderAlgorihtmIsotopeWavelet)." tags="advanced" restrictions="ref,trans,corrected" />
17
- <ITEM name="check_ppm" value="true" type="string" description="Enables/disables a ppm test vs. the averagine model, i.e. potential peptide masses are checked for plausibility. In addition, a heuristic correcting potential mass shifts induced by the wavelet is applied." tags="advanced" restrictions="true,false" />
17
+ <ITEM name="check_ppm" value="false" type="string" description="Enables/disables a ppm test vs. the averagine model, i.e. potential peptide masses are checked for plausibility. In addition, a heuristic correcting potential mass shifts induced by the wavelet is applied." tags="advanced" restrictions="true,false" />
18
18
  <ITEM name="hr_data" value="false" type="string" description="Must be true in case of high-resolution data, i.e. for spectra featuring large m/z-gaps (present in FTICR and Orbitrap data, e.g.). Please check a single MS scan out of your recording, if you are unsure." restrictions="true,false" />
19
19
  <NODE name="sweep_line" description="">
20
20
  <ITEM name="rt_votes_cutoff" value="5" type="int" description="Defines the minimum number of subsequent scans where a pattern must occur to be considered as a feature." tags="advanced" restrictions="0:" />
@@ -0,0 +1,264 @@
1
+ require 'bio'
2
+ require 'matrix'
3
+
4
# Plain value holder for a peptide fragment: a start position, an end
# position and a sequence, all readable and writable.
class PeptideFragment
  attr_accessor :start, :end, :seq
end
9
+
10
# Result of aligning a peptide against a gene (nucleotide) sequence.
#
# +trace+ is a list of moves, one per aligned position: 0 marks a match,
# any other value marks a position where the peptide is not matched
# (1 is rendered as a gap by #inspect).
class PeptideToGeneAlignment
  attr_accessor :gene_seq
  attr_accessor :pep_seq
  attr_accessor :trace

  def initialize(gene,peptide,trace)
    @gene_seq = gene
    @pep_seq = peptide
    @trace = trace
  end

  # Prints the gene sequence followed by a line showing each peptide residue
  # (repeated three times) under the positions it matched, with "-" for
  # moves equal to 1.
  #
  # NOTE(review): this writes to stdout and returns the result of +puts+
  # (nil) rather than a String, which is unusual for #inspect. Left as is
  # because callers may rely on the printing side effect.
  def inspect
    descr = "#{@gene_seq}\n"

    pep_triples = ""
    @pep_seq.each_char { |c| 3.times { pep_triples << c } }

    pepi = 0
    @trace.each do |move|
      if move == 1
        descr << "-"
      elsif move == 0
        descr << "#{pep_triples[pepi]}"
        pepi += 1
      end
    end
    descr << "\n"
    puts descr
  end

  # Returns the matched stretches of the trace as [start, end] index pairs,
  # both inclusive. A stretch still open when the trace ends is closed at
  # the last index. Returns [] when the trace contains no match.
  def fragments
    frags = []
    frag_start = nil
    @trace.each_with_index do |move, i|
      if move == 0
        frag_start ||= i
      elsif frag_start
        # First non-match after a run of matches closes the fragment.
        frags << [frag_start, i - 1]
        frag_start = nil
      end
    end
    # Track the open fragment explicitly instead of using an end index of 0
    # as a sentinel, which is also a legitimate end index.
    frags << [frag_start, @trace.length - 1] if frag_start
    frags
  end

  # Returns the internal gaps of the trace as [start, end] pairs, where
  # +start+ is the index of the first non-match after a match and +end+ is
  # the index of the next match. Non-matching moves before the first match
  # and after the last match are not reported.
  def gaps
    gps = []
    seen_match = false
    in_gap = false
    @trace.each_with_index do |move, i|
      if move == 0
        seen_match = true
        gps.last[1] = i if in_gap # Ending a gap
        in_gap = false
      elsif seen_match && !in_gap # Starting a gap
        in_gap = true
        gps << [i, 0]
      end
    end
    # A gap whose end was never set (still 0) ran to the end of the trace.
    gps.reject { |gp| gp[1] == 0 }
  end

end
96
+
97
+ # Uses a dynamic programming algorithm (Smith-Waterman like) to align a peptide sequence to a nucleotide.
98
+ # This aligner assumes you are doing proteogenomics and just want to assume that
99
+ # (a) The entire peptide sequence matches (with gaps) to the DNA sequence
100
+ #
101
class GappedAligner

  # Sets the scoring constants and move codes used by the DP tables.
  # Move codes: 0 = match, -1 = amino-acid deletion, 1 = nucleotide deletion.
  def initialize
    @big_penalty = -1000000000
    @gap_open_penalty = -10000
    @gap_extend_penalty = -1
    @end_gap_penalty = 0
    @match_bonus = 400

    @match_move=0
    @aadel_move=-1
    @nadel_move=1
    # Indexed as [position % 3][frame]: offset from a position back to the
    # start of the codon that contains it in the given frame.
    @triplet_offsets = [[0,-2,-1],[-1,0,-2],[-2,-1,0]]
  end

  # Score for skipping a peptide position (always the large penalty).
  def aa_deletion()
    return @big_penalty
  end

  # Score for skipping a nucleotide: the extend penalty when the previous
  # move was already a nucleotide deletion, the open penalty otherwise.
  def score_na_deletion(move_type)
    return @gap_extend_penalty if move_type==@nadel_move

    @gap_open_penalty
  end

  # Score for pairing +aa+ with the translated residue +na+: the match
  # bonus when they are equal, the large penalty otherwise.
  def score_match(aa,na)
    return @match_bonus if aa==na

    @big_penalty
  end

  # Walks the move table back from (from_row, from_col) to the origin and
  # returns the moves in forward order (first move of the alignment first).
  #
  # Implemented as a loop rather than recursion so that the call stack does
  # not grow by one frame per alignment move.
  #
  # Throws "Beyond end of array" if the walk leaves the table.
  def traceback(from_row,from_col,dpmoves)
    moves = []
    row = from_row
    col = from_col
    loop do
      move = dpmoves[row][col]
      prev_row = row-1
      prev_col = col-1
      if move==@aadel_move
        prev_col+=1
      elsif move==@nadel_move
        prev_row+=1
      end

      moves << move
      break if prev_col==0 && prev_row==0

      throw "Beyond end of array" if prev_col<0 || prev_row <0

      row = prev_row
      col = prev_col
    end
    # Moves were collected end-to-start.
    moves.reverse
  end

  # Cycles through the frames 0, 1, 2.
  def next_frame(previous_frame)
    (previous_frame+1) % 3
  end

  # Translates the codon containing nucleotide position +j+ in the given
  # +frame+. Returns '-' when that codon would start before the sequence.
  # +gene_seq+ must support slicing and respond to +translate+ on the slice.
  def translate_na_at(j,frame,gene_seq)
    rm = j % 3
    start_pos=j+@triplet_offsets[rm][frame]
    return '-' if start_pos < 0

    gene_seq[start_pos,3].translate
  end

  # Writes a DP table to "<name>.csv" in the current directory, with the
  # nucleotides as the header row and the tripled peptide residues as the
  # first column. The block form of File.open closes the file even if a
  # write raises.
  def save_matrix(dpmatrix,pep_triples,gene_seq,name)
    File.open("#{name}.csv", "w+") do |matfile|
      matfile.write(",,")
      gene_seq.each_char { |na| matfile.write("#{na},") }
      matfile.write("\n")
      dpmatrix.each_with_index do |row,ri|
        if ri>0
          matfile.write("#{pep_triples[ri-1]},")
        else
          matfile.write(",")
        end
        row.each { |col| matfile.write("#{col},") }
        matfile.write("\n")
      end
    end
  end

  # Fills the score, move and frame tables for aligning +pep_seq+ (each
  # residue repeated three times, one row per repeat) against +gene_seq+
  # (one column per nucleotide), then returns the traceback from the best
  # scoring cell of the last row.
  #
  # Throws "Peptide sequence is longer than gene" when the tripled peptide
  # has more positions than the gene.
  def calculate_dp(pep_seq,gene_seq)
    gene_seq = Bio::Sequence::NA.new(gene_seq)
    nrow = pep_seq.length*3+1
    ncol = gene_seq.length+1

    throw "Peptide sequence is longer than gene" if nrow > ncol

    pep_triples=""
    pep_seq.each_char { |c| 3.times { pep_triples<<c } }

    dpmoves  = Array.new(nrow) { Array.new(ncol, 0) }
    dpmatrix = Array.new(nrow) { Array.new(ncol, 0) }
    dpframes = Array.new(nrow) { Array.new(ncol, 0) }

    # Boundary conditions
    nrow.times do |i|
      dpmatrix[i][0] = aa_deletion*i
      dpmoves[i][0] = @aadel_move
    end
    ncol.times do |j|
      dpmatrix[0][j] = @end_gap_penalty*j
      dpmoves[0][j] = @nadel_move
      dpframes[0][j] = j % 3
    end
    dpmoves[0][0]=0
    dpframes[0][0]=0

    (1...nrow).each do |i|
      aa = pep_triples[i-1]
      (1...ncol).each do |j|
        translated_na = translate_na_at(j-1,dpframes[i-1][j-1],gene_seq)

        match = score_match(aa,translated_na) + dpmatrix[i-1][j-1]
        nadel = score_na_deletion(dpmoves[i][j-1]) + dpmatrix[i][j-1]

        if match >= nadel
          dpmatrix[i][j] = match
          dpmoves[i][j] = @match_move
          dpframes[i][j] = dpframes[i-1][j-1]
        else
          dpmatrix[i][j] = nadel
          dpmoves[i][j] = @nadel_move
          dpframes[i][j] = next_frame(dpframes[i][j-1])
        end
      end
    end

    # Find best end-point
    end_score = dpmatrix[nrow-1].max
    end_j = dpmatrix[nrow-1].index(end_score)

    # NOTE(review): these dumps write dpmatrix.csv, moves.csv and frames.csv
    # into the current directory on every call; they look like debugging
    # output. Kept to preserve behaviour - confirm whether they can go.
    save_matrix(dpmatrix,pep_triples,gene_seq,"dpmatrix")
    save_matrix(dpmoves,pep_triples,gene_seq,"moves")
    save_matrix(dpframes,pep_triples,gene_seq,"frames")

    traceback(nrow-1,end_j,dpmoves)
  end

  # Aligns +pep_seq+ to +gene_seq+ and wraps the resulting trace in a
  # PeptideToGeneAlignment.
  def align(pep_seq, gene_seq)
    trace = calculate_dp(pep_seq,gene_seq)
    PeptideToGeneAlignment.new(gene_seq,pep_seq,trace)
  end

end