protk 1.2.4 → 1.2.5
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/add_retention_times.rb +89 -0
- data/bin/augustus_to_proteindb.rb +193 -0
- data/bin/blastxml_to_table.rb +72 -0
- data/bin/feature_finder.rb +7 -1
- data/bin/make_decoy.rb +10 -2
- data/bin/mascot_search.rb +14 -4
- data/bin/msgfplus_search.rb +14 -5
- data/bin/peptide_prophet.rb +14 -7
- data/bin/protxml_to_gff.rb +624 -0
- data/bin/protxml_to_table.rb +19 -2
- data/bin/sixframe.rb +3 -1
- data/bin/tandem_search.rb +51 -23
- data/bin/toppas_pipeline.rb +8 -3
- data/bin/uniprot_annotation.rb +6 -1
- data/ext/protk/{protk.c → decoymaker/decoymaker.c} +13 -15
- data/ext/protk/decoymaker/extconf.rb +3 -0
- data/ext/protk/simplealign/extconf.rb +3 -0
- data/lib/protk/data/FeatureFinderIsotopeWavelet.ini +6 -6
- data/lib/protk/gapped_aligner.rb +264 -0
- data/lib/protk/manage_db_rakefile.rake +2 -1
- data/lib/protk/mascot_util.rb +7 -2
- data/lib/protk/randomize.rb +2 -2
- data/lib/protk/search_tool.rb +1 -1
- data/lib/protk/setup_rakefile.rake +25 -2
- data/lib/protk/spreadsheet_extensions.rb +1 -0
- data/lib/protk/swissprot_database.rb +11 -1
- metadata +30 -8
- data/bin/mascot2xml.rb +0 -87
- data/ext/protk/extconf.rb +0 -3
- data/lib/protk/data/pepxml_mascot_template.xml +0 -29
- data/lib/protk/data/predefined_db.trembl_annotation.yaml +0 -20
data/bin/protxml_to_table.rb
CHANGED
@@ -19,6 +19,11 @@ include LibXML
|
|
19
19
|
tool=Tool.new([:explicit_output])
|
20
20
|
tool.option_parser.banner = "Convert a protXML file to a tab delimited table.\n\nUsage: protxml_to_table.rb [options] file1.protXML"
|
21
21
|
|
22
|
+
# tool.options.proteinid_regex=".*?\|.*?\|(.*)"
|
23
|
+
# tool.option_parser.on( '--regex rexpr', 'Regex' ) do |regex|
|
24
|
+
# tool.options.proteinid_regex=regex
|
25
|
+
# end
|
26
|
+
|
22
27
|
exit unless tool.check_options
|
23
28
|
|
24
29
|
if ( ARGV[0].nil? )
|
@@ -48,7 +53,7 @@ end
|
|
48
53
|
|
49
54
|
|
50
55
|
column_headers=[
|
51
|
-
"group_number","group_probability","protein_name",
|
56
|
+
"group_number","group_probability","protein_name","protein_id","indistinguishable_proteins",
|
52
57
|
"protein_probability","coverage","peptides",
|
53
58
|
"num_peptides","confidence"
|
54
59
|
]
|
@@ -62,13 +67,25 @@ protein_groups.each do |protein_group|
|
|
62
67
|
|
63
68
|
proteins=protein_group.find("./#{protxml_ns_prefix}protein", protxml_ns)
|
64
69
|
|
65
|
-
proteins.each do |protein|
|
70
|
+
proteins.each do |protein|
|
71
|
+
|
72
|
+
indis_proteins=protein.find("./#{protxml_ns_prefix}indistinguishable_protein", protxml_ns)
|
73
|
+
indis_proteins_summary=""
|
74
|
+
indis_proteins.each { |iprot| indis_proteins_summary<<"#{iprot.attributes['protein_name']};" }
|
75
|
+
|
76
|
+
protein_id=""
|
77
|
+
if protein.attributes['protein_name'] =~ /.*?\|.*?\|(.*)/
|
78
|
+
protein_id=protein.attributes['protein_name'].match(/.*?\|.*?\|(.*)/)[1]
|
79
|
+
end
|
80
|
+
|
66
81
|
column_values=[]
|
67
82
|
|
68
83
|
column_values << protein_group.attributes['group_number']
|
69
84
|
column_values << protein_group.attributes['probability']
|
70
85
|
|
71
86
|
column_values << protein.attributes['protein_name']
|
87
|
+
column_values << protein_id
|
88
|
+
column_values << indis_proteins_summary
|
72
89
|
column_values << protein.attributes['probability']
|
73
90
|
column_values << protein.attributes['percent_coverage']
|
74
91
|
column_values << protein.attributes['unique_stripped_peptides']
|
data/bin/sixframe.rb
CHANGED
@@ -41,9 +41,11 @@ end
|
|
41
41
|
|
42
42
|
inname=ARGV.shift
|
43
43
|
|
44
|
-
outfile=
|
44
|
+
outfile=nil
|
45
45
|
if ( tool.explicit_output != nil)
|
46
46
|
outfile=File.open(tool.explicit_output,'w')
|
47
|
+
else
|
48
|
+
outfile=File.open("#{inname}.translated.fasta",'w')
|
47
49
|
end
|
48
50
|
|
49
51
|
|
data/bin/tandem_search.rb
CHANGED
@@ -149,6 +149,37 @@ def set_option(std_params, tandem_key, value)
|
|
149
149
|
notes[0].content=value
|
150
150
|
end
|
151
151
|
|
152
|
+
# Adds +value+ to the input note labelled +tandem_key+ in +std_params+.
#
# If a single matching note already exists, +value+ is joined onto its current
# content via append_string. If none exists, a new <note type="input"> holding
# +value+ is appended to the /bioml element. More than one matching note is
# treated as an error.
def append_option(std_params, tandem_key, value)
  xpath = "/bioml/note[@type=\"input\" and @label=\"#{tandem_key}\"]"
  matches = std_params.find(xpath)

  unless matches.length == 0
    throw "Exactly one parameter named (#{tandem_key}) is required in parameter file" unless matches.length==1
    target = matches[0]
    return target.content = append_string(target.content, value)
  end

  # No note with this label yet: build one and attach it to the document root.
  fresh = XML::Node.new('note')
  fresh["type"] = "input"
  fresh["label"] = tandem_key
  fresh.content = value
  std_params.find('/bioml')[0] << fresh
end
|
165
|
+
|
166
|
+
# Merges every input note labelled +tandem_key+ into the first such note.
#
# The content of each additional note is joined onto the first note's content
# with append_string, and the additional note is then removed from the
# document. Does nothing when fewer than two notes match. Always returns nil.
#
# std_params - parsed parameter document responding to #find(xpath)
# tandem_key - label of the notes to merge
def collapse_keys(std_params, tandem_key)
  # Double quotes are required here so that tandem_key is interpolated into
  # the XPath expression; to_a gives a plain Array that can be destructured.
  mods = std_params.find("/bioml/note[@type=\"input\" and @label=\"#{tandem_key}\"]").to_a
  return nil if mods.length < 2

  first_mod, *rest_mods = mods
  rest_mods.each do |node|
    first_mod.content = append_string(first_mod.content, node.content)
    node.remove!
  end
  nil
end
|
174
|
+
|
175
|
+
# Joins two strings with a comma, unless +first+ is empty, in which case
# +second+ is returned as-is.
def append_string(first, second)
  return second if first.empty?

  "#{first},#{second}"
end
|
182
|
+
|
152
183
|
def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_db,search_tool,genv)
|
153
184
|
set_option(std_params, "protein, cleavage semi", search_tool.cleavage_semi ? "yes" : "no")
|
154
185
|
set_option(std_params, "scoring, maximum missed cleavage sites", search_tool.missed_cleavages)
|
@@ -301,7 +332,11 @@ def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_d
|
|
301
332
|
mods=std_params.find('/bioml/note[@type="input" and @id="methionine-oxidation-variable"]')
|
302
333
|
mods.each{ |node| node.remove!}
|
303
334
|
end
|
304
|
-
|
335
|
+
|
336
|
+
# Merge all remaining id based modification into single modification.
|
337
|
+
collapse_keys(std_params, "residue, potential modification mass")
|
338
|
+
collapse_keys(std_params, "residue, modification mass")
|
339
|
+
|
305
340
|
var_mods = search_tool.var_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject {|e| e.empty? }
|
306
341
|
var_mods=var_mods.collect {|mod| decode_modification_string(mod) }
|
307
342
|
fix_mods = search_tool.fix_mods.split(",").collect { |mod| mod.lstrip.rstrip }.reject { |e| e.empty? }
|
@@ -313,31 +348,17 @@ def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_d
|
|
313
348
|
var_mods.each do |vm|
|
314
349
|
|
315
350
|
mod_type="potential modification mass"
|
316
|
-
mod_type = "potential modification motif" if (
|
317
|
-
|
318
|
-
|
319
|
-
mnode=XML::Node.new('node')
|
320
|
-
mnode["id"]=mod_id_label
|
321
|
-
mnode["type"]="input"
|
322
|
-
mnode["label"]="residue, #{mod_type}"
|
323
|
-
mnode.content=vm
|
324
|
-
|
325
|
-
root_bioml_node << mnode
|
351
|
+
mod_type = "potential modification motif" if motif?(vm)
|
352
|
+
label="residue, #{mod_type}"
|
353
|
+
append_option(std_params, label, vm)
|
326
354
|
end
|
327
355
|
|
328
356
|
mod_id=1
|
329
357
|
fix_mods.each do |fm|
|
330
358
|
mod_type="modification mass"
|
331
|
-
mod_type = "modification motif" if (
|
332
|
-
|
333
|
-
|
334
|
-
mnode=XML::Node.new('node')
|
335
|
-
mnode["id"]=mod_id_label
|
336
|
-
mnode["type"]="input"
|
337
|
-
mnode["label"]="residue, #{mod_type}"
|
338
|
-
mnode.content=fm
|
339
|
-
|
340
|
-
root_bioml_node << mnode
|
359
|
+
mod_type = "modification motif" if motif?(fm)
|
360
|
+
label="residue, #{mod_type}"
|
361
|
+
append_option(std_params, label, fm)
|
341
362
|
end
|
342
363
|
|
343
364
|
#p root_bioml_node
|
@@ -345,6 +366,13 @@ def generate_parameter_doc(std_params,output_path,input_path,taxo_path,current_d
|
|
345
366
|
|
346
367
|
end
|
347
368
|
|
369
|
+
# Tells whether a modification string describes a motif.
#
# 124@[ is not a modification motif, it is a residue (N-term) modification,
# so a square bracket only counts when it is paired with a closing bracket.
# Returns the match position (truthy) or nil.
def motif?(mod_string)
  special_char_at = mod_string =~ /[\(\)\{\}\!]/
  return special_char_at if special_char_at

  mod_string =~ /\[.*\]/
end
|
374
|
+
|
375
|
+
|
348
376
|
def generate_taxonomy_doc(taxo_doc,current_db,search_tool)
|
349
377
|
|
350
378
|
taxon_label=taxo_doc.find('/bioml/taxon')
|
@@ -425,8 +453,8 @@ ARGV.each do |filename|
|
|
425
453
|
# Run the search
|
426
454
|
#
|
427
455
|
job_params= {:jobid => search_tool.jobid_from_filename(filename)}
|
428
|
-
job_params[:queue]="
|
429
|
-
job_params[:vmem]="
|
456
|
+
job_params[:queue]="sixteen"
|
457
|
+
job_params[:vmem]="12gb"
|
430
458
|
code = search_tool.run(cmd,genv,job_params,jobscript_path)
|
431
459
|
throw "Command failed with exit code #{code}" unless code==0
|
432
460
|
else
|
data/bin/toppas_pipeline.rb
CHANGED
@@ -15,7 +15,7 @@ require 'libxml'
|
|
15
15
|
|
16
16
|
include LibXML
|
17
17
|
|
18
|
-
tool=Tool.new([:
|
18
|
+
tool=Tool.new([:background,:over_write])
|
19
19
|
tool.option_parser.banner = "Execute a toppas pipeline with a single inputs node\n\nUsage: toppas_pipeline.rb [options] input1 input2 ..."
|
20
20
|
|
21
21
|
tool.options.outdir = ""
|
@@ -28,6 +28,11 @@ tool.option_parser.on( '--toppas-file f',"the toppas file to run" ) do |file|
|
|
28
28
|
tool.options.toppas_file = file
|
29
29
|
end
|
30
30
|
|
31
|
+
tool.options.threads = "1"
|
32
|
+
tool.option_parser.on( '--threads t',"Number of threads to use" ) do |tr|
|
33
|
+
tool.options.threads=tr
|
34
|
+
end
|
35
|
+
|
31
36
|
exit unless tool.check_options
|
32
37
|
|
33
38
|
if ( ARGV[0].nil? )
|
@@ -67,13 +72,13 @@ throw "outdir is a required parameter" if tool.outdir==""
|
|
67
72
|
throw "toppas-file is a required parameter" if tool.toppas_file==""
|
68
73
|
throw "outdir must exist" unless Dir.exist?(tool.outdir)
|
69
74
|
|
70
|
-
trf_path = "#{tool.toppas_file}.trf"
|
75
|
+
trf_path = "#{Pathname.new(Tempfile.new(tool.toppas_file).path).basename.to_s}.trf"
|
71
76
|
|
72
77
|
generate_trf(ARGV,trf_path)
|
73
78
|
|
74
79
|
cmd=""
|
75
80
|
cmd<<"export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:#{genv.openms_root}/lib;
|
76
|
-
#{genv.executepipeline} -in #{Pathname.new(tool.toppas_file).realpath.to_s} -out_dir #{Pathname.new(tool.outdir).realpath.to_s} -resource_file #{Pathname.new(trf_path).realpath.to_s}"
|
81
|
+
#{genv.executepipeline} -in #{Pathname.new(tool.toppas_file).realpath.to_s} -out_dir #{Pathname.new(tool.outdir).realpath.to_s} -resource_file #{Pathname.new(trf_path).realpath.to_s} -threads #{tool.threads}"
|
77
82
|
|
78
83
|
run_pipeline(genv,tool,cmd,tool.outdir,tool.jobid_from_filename(tool.toppas_file))
|
79
84
|
|
data/bin/uniprot_annotation.rb
CHANGED
@@ -24,6 +24,11 @@ tool.option_parser.on( '--id-column num', 'Specify a column for ids (default is
|
|
24
24
|
tool.options.id_column=col.to_i
|
25
25
|
end
|
26
26
|
|
27
|
+
tool.options.flatfiledb="swissprot"
|
28
|
+
tool.option_parser.on( '--flatfiledb dbname', 'Specify path to a Uniprot flatfile' ) do |dbname|
|
29
|
+
tool.options.flatfiledb=dbname
|
30
|
+
end
|
31
|
+
|
27
32
|
tool.options.fields=nil
|
28
33
|
tool.option_parser.on( '--fields flds', 'A comma separated list of fields to extract' ) do |flds|
|
29
34
|
tool.options.fields=flds
|
@@ -42,7 +47,7 @@ genv=Constants.new
|
|
42
47
|
|
43
48
|
input_file=ARGV[0]
|
44
49
|
|
45
|
-
swissprotdb=SwissprotDatabase.new(genv)
|
50
|
+
swissprotdb=SwissprotDatabase.new(genv,tool.flatfiledb)
|
46
51
|
|
47
52
|
output_file=nil
|
48
53
|
|
@@ -1,6 +1,4 @@
|
|
1
1
|
#include <ruby.h>
|
2
|
-
|
3
|
-
|
4
2
|
/* */
|
5
3
|
/* make_random.c - make random protein sequence database using Markov chain with transitional */
|
6
4
|
/* probabilities from amino acid frequencies in a real database in FASTA format */
|
@@ -25,7 +23,8 @@
|
|
25
23
|
#define MAX_SEQUENCE_LENGTH 20000
|
26
24
|
#define MAX_LINE_LENGTH 20000 /* large enough to read in long header lines */
|
27
25
|
|
28
|
-
|
26
|
+
|
27
|
+
static VALUE decoymaker_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in,VALUE output_file_in,char *prefix_string_in) {
|
29
28
|
char *input_file = RSTRING_PTR(input_file_in);
|
30
29
|
long sequences_to_generate = NUM2INT(db_length_in);
|
31
30
|
char * output_file = RSTRING_PTR(output_file_in);
|
@@ -148,7 +147,7 @@ static VALUE protk_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in
|
|
148
147
|
measured_aa_freq[a]++;
|
149
148
|
}
|
150
149
|
}
|
151
|
-
|
150
|
+
else {a=floor(20*(float)rand()/RAND_MAX);MP[a][i]++; measured_aa_freq[a]++;} // replace B, X, Z etc. with random amino acid to preserve size distribution
|
152
151
|
}
|
153
152
|
MP[20][pl]++;
|
154
153
|
measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
|
@@ -178,12 +177,12 @@ static VALUE protk_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in
|
|
178
177
|
x=(double)row_sum[j]*((double)rand()/RAND_MAX);
|
179
178
|
partial_sum=MP[0][j]; i=1;
|
180
179
|
while (partial_sum<x) {partial_sum+=MP[i][j]; i++;}
|
181
|
-
|
180
|
+
if (j>=MAX_SEQUENCE_LENGTH) i=21; /* terminate when sequence has reached MAX_SEQUENCE_LENGTH */
|
182
181
|
if (i<21)
|
183
182
|
{
|
184
183
|
random_sequence[j]=AMINO_ACIDS[i-1];j++;generated_aa_freq[i-1]++;
|
185
184
|
}
|
186
|
-
|
185
|
+
else /* i==21, i.e. protein sequence terminated */
|
187
186
|
{
|
188
187
|
k=0; generated_aa_freq[20]++; generated_pl_sum+=j;
|
189
188
|
for(l=0;l<j;l++)
|
@@ -196,7 +195,7 @@ static VALUE protk_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in
|
|
196
195
|
}
|
197
196
|
|
198
197
|
random_sequence_output[k]='\0';
|
199
|
-
|
198
|
+
if (!(k%61)) random_sequence_output[k-1]='\0'; /* remove extra newline for sequence length multiple of 60 */
|
200
199
|
fprintf(outp,">%srp%li\n%s\n",prefix_string,protein,random_sequence_output);
|
201
200
|
break;
|
202
201
|
}
|
@@ -222,14 +221,13 @@ static VALUE protk_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in
|
|
222
221
|
|
223
222
|
}
|
224
223
|
|
225
|
-
/* ruby calls this to load the extension */
|
226
|
-
void Init_protk(void) {
|
227
|
-
/* assume we haven't yet defined Hola */
|
228
|
-
VALUE klass = rb_define_class("Protk",
|
229
|
-
rb_cObject);
|
230
224
|
|
231
|
-
|
232
|
-
|
225
|
+
void Init_decoymaker(void)
|
226
|
+
{
|
227
|
+
VALUE klass = rb_define_class("Decoymaker",rb_cObject);
|
228
|
+
|
233
229
|
rb_define_singleton_method(klass,
|
234
|
-
"make_decoys",
|
230
|
+
"make_decoys", decoymaker_make_decoys, 4);
|
231
|
+
|
232
|
+
|
235
233
|
}
|
@@ -1,10 +1,10 @@
|
|
1
1
|
<?xml version="1.0" encoding="ISO-8859-1"?>
|
2
|
-
<PARAMETERS version="1.
|
2
|
+
<PARAMETERS version="1.4" xsi:noNamespaceSchemaLocation="http://open-ms.sourceforge.net/schemas/Param_1_4.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
3
3
|
<NODE name="FeatureFinderIsotopeWavelet" description="Detects two-dimensional features in LC-MS data.">
|
4
|
-
<ITEM name="version" value="1.
|
4
|
+
<ITEM name="version" value="1.10.0" type="string" description="Version of the tool that generated this parameters file." tags="advanced" />
|
5
5
|
<NODE name="1" description="Instance '1' section for 'FeatureFinderIsotopeWavelet'">
|
6
|
-
<ITEM name="in" value="" type="string" description="input file" tags="input file,required"
|
7
|
-
<ITEM name="out" value="" type="string" description="output file" tags="output file,required"
|
6
|
+
<ITEM name="in" value="" type="string" description="input file" tags="input file,required" supported_formats="*.mzML" />
|
7
|
+
<ITEM name="out" value="" type="string" description="output file" tags="output file,required" supported_formats="*.featureXML" />
|
8
8
|
<ITEM name="log" value="" type="string" description="Name of log file (created only when specified)" tags="advanced" />
|
9
9
|
<ITEM name="debug" value="0" type="int" description="Sets the debug level" tags="advanced" />
|
10
10
|
<ITEM name="threads" value="1" type="int" description="Sets the number of threads allowed to be used by the TOPP tool" />
|
@@ -12,9 +12,9 @@
|
|
12
12
|
<ITEM name="test" value="false" type="string" description="Enables the test mode (needed for internal use only)" tags="advanced" restrictions="true,false" />
|
13
13
|
<NODE name="algorithm" description="Algorithm section">
|
14
14
|
<ITEM name="max_charge" value="3" type="int" description="The maximal charge state to be considered." restrictions="1:" />
|
15
|
-
<ITEM name="intensity_threshold" value="
|
15
|
+
<ITEM name="intensity_threshold" value="-1" type="float" description="The final threshold t' is build upon the formula: t' = av+t*sd, where t is the intensity_threshold, av the average intensity within the wavelet transformed signal and sd the standard deviation of the transform. If you set intensity_threshold=-1, t' will be zero.#br#As the 'optimal' value for this parameter is highly data dependent, we would recommend to start with -1, which will also extract features with very low signal-to-noise ratio. Subsequently, one might increase the threshold to find an optimized trade-off between false positives and true positives. Depending on the dynamic range of your spectra, suitable value ranges include: -1, [0:10], and if your data features even very high intensity values, t can also adopt values up to around 30. Please note that this parameter is not of an integer type, s.t. you can also use t:=0.1, e.g." />
|
16
16
|
<ITEM name="intensity_type" value="ref" type="string" description="Determines the intensity type returned for the identified features. 'ref' (default) returns the sum of the intensities of each isotopic peak within an isotope pattern. 'trans' refers to the intensity of the monoisotopic peak within the wavelet transform. 'corrected' refers also to the transformed intensity with an attempt to remove the effects of the convolution. While the latter ones might be preferable for qualitative analyses, 'ref' might be the best option to obtain quantitative results. Please note that intensity values might be spoiled (in particular for the option 'ref'), as soon as patterns overlap (see also the explanations given in the class documentation of FeatureFinderAlgorihtmIsotopeWavelet)." tags="advanced" restrictions="ref,trans,corrected" />
|
17
|
-
<ITEM name="check_ppm" value="
|
17
|
+
<ITEM name="check_ppm" value="false" type="string" description="Enables/disables a ppm test vs. the averagine model, i.e. potential peptide masses are checked for plausibility. In addition, a heuristic correcting potential mass shifts induced by the wavelet is applied." tags="advanced" restrictions="true,false" />
|
18
18
|
<ITEM name="hr_data" value="false" type="string" description="Must be true in case of high-resolution data, i.e. for spectra featuring large m/z-gaps (present in FTICR and Orbitrap data, e.g.). Please check a single MS scan out of your recording, if you are unsure." restrictions="true,false" />
|
19
19
|
<NODE name="sweep_line" description="">
|
20
20
|
<ITEM name="rt_votes_cutoff" value="5" type="int" description="Defines the minimum number of subsequent scans where a pattern must occur to be considered as a feature." tags="advanced" restrictions="0:" />
|
@@ -0,0 +1,264 @@
|
|
1
|
+
require 'bio'
|
2
|
+
require 'matrix'
|
3
|
+
|
4
|
+
# Simple record with readable and writable start, end and seq attributes.
class PeptideFragment
  attr_accessor :start, :end, :seq
end
|
9
|
+
|
10
|
+
# Result of aligning a peptide against a gene sequence.
#
# +trace+ is a list of per-position moves. As read by the methods below, a
# move of 0 marks a position where the peptide is matched; any other value
# marks a position where it is not (a move of 1 is rendered as "-").
class PeptideToGeneAlignment
  attr_accessor :gene_seq
  attr_accessor :pep_seq
  attr_accessor :trace

  def initialize(gene,peptide,trace)
    @gene_seq = gene
    @pep_seq = peptide
    @trace = trace
  end

  # Builds a two-line picture of the alignment: the gene sequence, then one
  # character per trace move ("-" for a move of 1, the next character of the
  # tripled peptide for a move of 0).
  #
  # The picture is printed to stdout (as before) and is now also returned,
  # so that #inspect yields a String rather than nil.
  def inspect
    descr = "#{@gene_seq}\n"

    # Each residue is repeated three times so one character lines up with
    # each trace position.
    pep_triples = ""
    @pep_seq.each_char { |c| pep_triples << c << c << c }

    pepi = 0
    @trace.each do |move|
      if move == 1
        descr << "-"
      elsif move == 0
        descr << "#{pep_triples[pepi]}"
        pepi += 1
      end
    end
    descr << "\n"
    puts descr
    descr
  end

  # Returns the matched stretches of the trace as [first_index, last_index]
  # pairs (both inclusive), in trace order. Returns [] when the trace
  # contains no matched positions.
  def fragments
    frags = []
    in_fragment = false
    @trace.each_with_index do |move, i|
      if move == 0
        frags << [i, 0] unless in_fragment # Start a fragment
        in_fragment = true
      else
        frags.last[1] = i - 1 if in_fragment # End a fragment
        in_fragment = false
      end
    end
    # Close a fragment that is still open when the trace ends. The open/closed
    # state is tracked explicitly rather than inferred from an end index of 0,
    # which is also a legitimate end for a fragment covering only position 0.
    frags.last[1] = @trace.length - 1 if in_fragment
    frags
  end

  # Returns the unmatched stretches that lie between matched positions as
  # [gap_start_index, index_of_next_match] pairs. Unmatched positions before
  # the first match are ignored, and a trailing unmatched stretch (one that is
  # never followed by a match) is dropped.
  def gaps
    gps = []
    in_start_end = true
    in_gap = false
    @trace.each_with_index do |move, i|
      if move == 0
        in_start_end = false
        gps.last[1] = i if in_gap # Ending a gap
        in_gap = false
      elsif !in_start_end && !in_gap # Starting a gap
        in_gap = true
        gps << [i, 0]
      end
    end
    # Remove gaps that were never closed (trailing)
    gps.reject { |gp| gp[1] == 0 }
  end

end
|
96
|
+
|
97
|
+
# Uses a dynamic programming algorithm (Smith-Waterman like) to align a peptide
# sequence to a nucleotide sequence.
# This aligner assumes you are doing proteogenomics and just want to assume that
# (a) The entire peptide sequence matches (with gaps) to the DNA sequence
#
# Moves recorded during alignment are 0 (match), 1 (nucleotide skipped) and
# -1 (amino acid skipped).
class GappedAligner

  def initialize
    @big_penalty = -1000000000
    @gap_open_penalty = -10000
    @gap_extend_penalty = -1
    @end_gap_penalty = 0
    @match_bonus = 400

    @match_move=0
    @aadel_move=-1
    @nadel_move=1
    # Indexed as [position % 3][frame]; gives the offset from a position to
    # the start of the triplet that is translated for it.
    @triplet_offsets = [[0,-2,-1],[-1,0,-2],[-2,-1,0]]
  end

  # Score for skipping an amino acid (always the large penalty).
  def aa_deletion()
    return @big_penalty
  end

  # Score for skipping a nucleotide: the extension penalty when the previous
  # move was also a nucleotide skip, otherwise the gap-open penalty.
  def score_na_deletion(move_type)
    return @gap_extend_penalty if move_type==@nadel_move

    @gap_open_penalty
  end

  # Score for pairing +aa+ with the translated residue +na+: the match bonus
  # when they are equal, otherwise the large penalty.
  def score_match(aa,na)
    return @match_bonus if aa==na

    @big_penalty
  end

  # Walks +dpmoves+ backwards from cell (from_row, from_col) until cell (0,0)
  # is reached and returns the moves in forward order (first move first).
  #
  # A match steps back one row and one column, a nucleotide skip one column,
  # an amino-acid skip one row. Implemented as a loop rather than recursion so
  # that the walk length is not limited by the interpreter's call stack.
  #
  # Throws "Beyond end of array" if the walk leaves the matrix.
  def traceback(from_row,from_col,dpmoves)
    moves = []
    row = from_row
    col = from_col

    loop do
      move = dpmoves[row][col]
      moves << move

      row -= 1
      col -= 1
      if move==@aadel_move
        col += 1
      elsif move==@nadel_move
        row += 1
      end

      break if col==0 && row==0
      throw "Beyond end of array" if col<0 || row<0
    end

    moves.reverse
  end

  # Cycles the reading frame 0 -> 1 -> 2 -> 0.
  def next_frame(previous_frame)
    (previous_frame+1) % 3
  end

  # Translates the triplet associated with position +j+ in +frame+.
  # Returns '-' when that triplet would start before the sequence.
  # NOTE(review): gene_seq[start, 3] must respond to #translate; calculate_dp
  # passes a Bio::Sequence::NA.
  def translate_na_at(j,frame,gene_seq)
    rm = j % 3
    start_pos=j+@triplet_offsets[rm][frame]
    return '-' if start_pos < 0

    gene_seq[start_pos,3].translate
  end

  # Writes +dpmatrix+ to "<name>.csv", with the characters of +gene_seq+ as a
  # header row and the characters of +pep_triples+ as row labels (the first
  # matrix row is unlabelled).
  def save_matrix(dpmatrix,pep_triples,gene_seq,name)
    # Block form guarantees the file is closed even if a write raises.
    File.open("#{name}.csv", "w+") do |matfile|
      matfile.write(",,")
      gene_seq.each_char { |na| matfile.write("#{na},") }
      matfile.write("\n")
      dpmatrix.each_with_index do |row,ri|
        if ri>0
          matfile.write("#{pep_triples[ri-1]},")
        else
          matfile.write(",")
        end
        row.each { |col| matfile.write("#{col},") }
        matfile.write("\n")
      end
    end
  end

  # Fills the score, move and frame matrices for aligning +pep_seq+ (each
  # residue expanded to three rows) against +gene_seq+ (one column per
  # nucleotide) and returns the move trace ending at the best-scoring cell of
  # the last row.
  #
  # Throws "Peptide sequence is longer than gene" when the tripled peptide
  # does not fit in the gene.
  def calculate_dp(pep_seq,gene_seq)
    gene_seq = Bio::Sequence::NA.new(gene_seq)
    nrow = pep_seq.length*3+1
    ncol = gene_seq.length+1

    throw "Peptide sequence is longer than gene" if nrow > ncol

    pep_triples=""
    pep_seq.each_char { |c| pep_triples << c << c << c }

    dpmoves  = Array.new(nrow) { Array.new(ncol, 0) }
    dpmatrix = Array.new(nrow) { Array.new(ncol, 0) }
    dpframes = Array.new(nrow) { Array.new(ncol, 0) }

    # Boundary conditions
    (0..(nrow-1)).each do |i|
      dpmatrix[i][0] = aa_deletion*i
      dpmoves[i][0] = @aadel_move
    end
    (0..(ncol-1)).each do |j|
      dpmatrix[0][j] = @end_gap_penalty*j
      dpmoves[0][j] = @nadel_move
      dpframes[0][j] = j % 3
    end
    dpmoves[0][0]=0
    dpframes[0][0]=0

    (1..(nrow-1)).each do |i|
      (1..(ncol-1)).each do |j|
        aa = pep_triples[i-1]

        translated_na = translate_na_at(j-1,dpframes[i-1][j-1],gene_seq)

        match = score_match(aa,translated_na) + dpmatrix[i-1][j-1]
        nadel = score_na_deletion(dpmoves[i][j-1]) + dpmatrix[i][j-1]

        # Ties go to the match move.
        if match >= nadel
          dpmatrix[i][j] = match
          dpmoves[i][j] = @match_move
          dpframes[i][j] = dpframes[i-1][j-1]
        else
          dpmatrix[i][j] = nadel
          dpmoves[i][j] = @nadel_move
          dpframes[i][j] = next_frame(dpframes[i][j-1])
        end
      end
    end

    # Find best end-point (first column holding the maximum of the last row)
    end_score = dpmatrix[nrow-1].max
    end_j = dpmatrix[nrow-1].index(end_score)

    # NOTE(review): these dumps write dpmatrix.csv, moves.csv and frames.csv
    # into the current directory on every call; they look like debugging
    # output. Kept to preserve existing behaviour - confirm whether any caller
    # relies on them before removing.
    save_matrix(dpmatrix,pep_triples,gene_seq,"dpmatrix")
    save_matrix(dpmoves,pep_triples,gene_seq,"moves")
    save_matrix(dpframes,pep_triples,gene_seq,"frames")

    traceback(nrow-1,end_j,dpmoves)
  end

  # Aligns +pep_seq+ to +gene_seq+ and wraps the resulting trace in a
  # PeptideToGeneAlignment.
  def align pep_seq, gene_seq
    trace = calculate_dp(pep_seq,gene_seq)
    PeptideToGeneAlignment.new(gene_seq,pep_seq,trace)
  end

end
|