protk 1.1.0.pre
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +85 -0
- data/bin/annotate_ids.rb +59 -0
- data/bin/big_search.rb +41 -0
- data/bin/correct_omssa_retention_times.rb +27 -0
- data/bin/feature_finder.rb +76 -0
- data/bin/file_convert.rb +157 -0
- data/bin/generate_omssa_loc.rb +42 -0
- data/bin/interprophet.rb +91 -0
- data/bin/make_decoy.rb +64 -0
- data/bin/manage_db.rb +123 -0
- data/bin/mascot_search.rb +187 -0
- data/bin/mascot_to_pepxml.rb +44 -0
- data/bin/msgfplus_search.rb +191 -0
- data/bin/omssa_search.rb +205 -0
- data/bin/peptide_prophet.rb +245 -0
- data/bin/pepxml_to_table.rb +78 -0
- data/bin/protein_prophet.rb +140 -0
- data/bin/protk_setup.rb +31 -0
- data/bin/repair_run_summary.rb +113 -0
- data/bin/tandem_search.rb +292 -0
- data/bin/template_search.rb +144 -0
- data/bin/unimod_to_loc.rb +118 -0
- data/bin/xls_to_table.rb +46 -0
- data/ext/protk/extconf.rb +3 -0
- data/ext/protk/protk.c +235 -0
- data/lib/protk/big_search_rakefile.rake +16 -0
- data/lib/protk/big_search_tool.rb +23 -0
- data/lib/protk/bio_sptr_extensions.rb +210 -0
- data/lib/protk/biotools_excel_converter.rb +60 -0
- data/lib/protk/command_runner.rb +84 -0
- data/lib/protk/constants.rb +296 -0
- data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
- data/lib/protk/data/apt-get_packages.yaml +47 -0
- data/lib/protk/data/brew_packages.yaml +10 -0
- data/lib/protk/data/default_config.yml +20 -0
- data/lib/protk/data/predefined_db.crap.yaml +19 -0
- data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
- data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
- data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
- data/lib/protk/data/tandem_params.xml +56 -0
- data/lib/protk/data/taxonomy_template.xml +9 -0
- data/lib/protk/data/unimod.xml +16780 -0
- data/lib/protk/eupathdb_gene_information_table.rb +158 -0
- data/lib/protk/galaxy_stager.rb +24 -0
- data/lib/protk/galaxy_util.rb +9 -0
- data/lib/protk/manage_db_rakefile.rake +484 -0
- data/lib/protk/manage_db_tool.rb +181 -0
- data/lib/protk/mascot_util.rb +63 -0
- data/lib/protk/omssa_util.rb +57 -0
- data/lib/protk/plasmodb.rb +50 -0
- data/lib/protk/prophet_tool.rb +85 -0
- data/lib/protk/protein_annotator.rb +646 -0
- data/lib/protk/protxml.rb +137 -0
- data/lib/protk/randomize.rb +7 -0
- data/lib/protk/search_tool.rb +182 -0
- data/lib/protk/setup_rakefile.rake +245 -0
- data/lib/protk/setup_tool.rb +19 -0
- data/lib/protk/spreadsheet_extensions.rb +78 -0
- data/lib/protk/swissprot_database.rb +38 -0
- data/lib/protk/tool.rb +182 -0
- data/lib/protk/xtandem_defaults.rb +11 -0
- data/lib/protk.rb +18 -0
- metadata +256 -0
data/bin/xls_to_table.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 18/1/2011
|
5
|
+
#
|
6
|
+
# Converts an Excel Spreadsheet to a tab delimited table
|
7
|
+
#
|
8
|
+
#
|
9
|
+
|
10
|
+
require 'protk/constants'
|
11
|
+
require 'protk/command_runner'
|
12
|
+
require 'protk/tool'
|
13
|
+
require 'spreadsheet'
|
14
|
+
|
15
|
+
# Setup command-line options for this tool.
#
usage = "Convert an xls file to a tab delimited table.\n\nUsage: xls_to_table.rb [options] file1.xls"
tool = Tool.new({:explicit_output => true})
tool.option_parser.banner = usage

tool.option_parser.parse!

# Refuse to run without an input workbook. Without this check a missing
# argument would create a stray ".csv" file before failing.
input_file = ARGV[0]
if input_file.nil?
  $stderr.puts usage
  exit 1
end

# Default the output path to "<input>.csv" when no explicit output was given.
output_file = tool.explicit_output
output_file = "#{input_file}.csv" if output_file.nil?

# Open the original excel workbook for reading
Spreadsheet.client_encoding = 'UTF-8'
input_book = Spreadsheet.open input_file
input_sheet = input_book.worksheet 0

# Write one tab-separated line per row of the first worksheet. The block form
# of File.open closes the output handle even if a row fails to convert.
File.open(output_file, 'w') do |output_fh|
  input_sheet.each do |row|
    output_fh.write "#{row.map { |cell| cell.to_s }.join("\t")}\n"
  end
end
|
45
|
+
|
46
|
+
|
data/ext/protk/protk.c
ADDED
@@ -0,0 +1,235 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
|
3
|
+
|
4
|
+
/* */
|
5
|
+
/* make_random.c - make random protein sequence database using Markov chain with transitional */
|
6
|
+
/* probabilities from amino acid frequencies in a real database in FASTA format */
|
7
|
+
/* */
|
8
|
+
/* (c) Magnus Palmblad, Division of Ion Physics, Uppsala University, Sweden, 2001- */
|
9
|
+
/* */
|
10
|
+
/* Usage: make_random <sequence database> <number of sequences to generate> <output file> */
|
11
|
+
/* */
|
12
|
+
/* Example: mmpi 562.fasta 1000000 562_random_1000000.fasta */
|
13
|
+
/* */
|
14
|
+
/* Compile with gcc -o make_random make_random.c -lm */
|
15
|
+
/* */
|
16
|
+
|
17
|
+
#include <stdio.h>
|
18
|
+
#include <stdlib.h>
|
19
|
+
#include <ctype.h>
|
20
|
+
#include <string.h>
|
21
|
+
#include <math.h>
|
22
|
+
|
23
|
+
#define AMINO_ACIDS "ARNDCEQGHILKMFPSTWYV"
|
24
|
+
#define NOT_AMINO_ACIDS "BJOUXZ*"
|
25
|
+
#define MAX_SEQUENCE_LENGTH 20000
|
26
|
+
#define MAX_LINE_LENGTH 20000 /* large enough to read in long header lines */
|
27
|
+
|
28
|
+
/*
 * Protk.make_decoys(input_file, db_length, output_file, prefix)
 *
 * Reads the FASTA database at input_file, builds a position-dependent table of
 * amino acid counts (MP[a][i] = occurrences of residue a at position i, and
 * MP[20][n] = number of sequences of length n), then writes db_length random
 * sequences drawn from that table to output_file. Each generated record is
 * named ">{prefix}rp{N}" and wrapped at 60 residues per line.
 *
 * Progress and summary statistics are printed to stdout.
 *
 * Returns false on success and -1 on failure (unreadable input, no sequences,
 * unwritable output, out of memory). These are the values the Ruby side has
 * always observed from this function, so they are kept. Raises TypeError /
 * ArgumentError / RangeError if an argument cannot be converted.
 */
static VALUE protk_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in,VALUE output_file_in,VALUE prefix_string_in) {
  /* Ruby-side conversions come first: they may raise, and nothing has been
     allocated yet that would leak. StringValueCStr guarantees NUL termination. */
  const char *infile = StringValueCStr(input_file_in);
  long sequences_to_generate = NUM2LONG(db_length_in);
  const char *outfile = StringValueCStr(output_file_in);
  const char *prefix_string = StringValueCStr(prefix_string_in);

  VALUE result = INT2FIX(-1);          /* failure unless we reach the end */
  FILE *inp = NULL, *outp = NULL;
  char *line = NULL;                   /* one input line */
  char *temp_sequence = NULL;          /* residues of the record being read */
  char *sequence = NULL;               /* all records, concatenated */
  char *random_sequence = NULL;        /* one generated record */
  char **index = NULL;                 /* index[n] = start of record n in sequence */
  long (*MP)[MAX_SEQUENCE_LENGTH] = NULL;
  long *row_sum = NULL;
  long measured_aa_freq[21], generated_aa_freq[21];
  long measured_pl_sum = 0, generated_pl_sum = 0;
  long i, j, k, l, n, n_sequences, protein, pl, partial_sum;
  size_t total_chars = 0, temp_len = 0, len, chunk;
  const char *seq, *hit;
  int a;
  double x;

  /* The work buffers are several megabytes in total, so they live on the heap
     rather than on the (possibly small) stack of the calling Ruby thread. */
  line = (char*)malloc(MAX_LINE_LENGTH);
  temp_sequence = (char*)malloc(MAX_SEQUENCE_LENGTH);
  random_sequence = (char*)malloc(MAX_SEQUENCE_LENGTH);
  MP = (long (*)[MAX_SEQUENCE_LENGTH])calloc(21, sizeof *MP);
  row_sum = (long*)calloc(MAX_SEQUENCE_LENGTH, sizeof *row_sum);
  if (line==NULL || temp_sequence==NULL || random_sequence==NULL || MP==NULL || row_sum==NULL) {
    printf("out of memory\n");
    goto cleanup;
  }

  /* scanning sequence database */

  if ((inp = fopen(infile, "r"))==NULL) {printf("error opening sequence database %s\n",infile); goto cleanup;}
  printf("scanning sequence database %s",infile);fflush(stdout);
  n=0;
  while (fgets(line, MAX_LINE_LENGTH, inp) != NULL) {
    total_chars+=strlen(line);
    if (line[0]=='>') {if (!(n%1000)) printf(".");fflush(stdout); n++;}
  }
  n_sequences=n;

  if (n_sequences==0) {
    printf("\nno sequences found in sequence database %s\n",infile);
    goto cleanup;
  }

  /* reading sequence database */

  /* The concatenated residues can never exceed the number of characters in
     the file, so this is large enough whatever the line length is. */
  sequence=(char*)malloc(total_chars+1);
  index=(char**)malloc(sizeof(char*)*n_sequences);
  if (sequence==NULL || index==NULL) {
    printf("out of memory\n");
    goto cleanup;
  }
  index[0]=sequence; /* set first index pointer to beginning of first database sequence */

  rewind(inp);

  printf("done\nreading sequence database %s",infile);fflush(stdout);
  n=-1;
  while (fgets(line, MAX_LINE_LENGTH, inp) != NULL)
  {
    if (line[0]=='>') {
      if (n>=0) {
        if (!(n%1000)&&n>0) {
          printf(".");fflush(stdout);
        }
        /* flush the finished record and mark where the next one starts */
        memcpy(index[n],temp_sequence,temp_len);
        index[n+1]=index[n]+temp_len;
      }
      n++;
      temp_len=0;
      continue;
    }

    /* strip the line ending (LF or CRLF); a final line without one is kept whole */
    len=strcspn(line,"\r\n");
    if (len==0) continue;
    if (temp_len+len>=MAX_SEQUENCE_LENGTH) continue; /* cap overly long records */
    memcpy(temp_sequence+temp_len,line,len);
    temp_len+=len;
  }

  memcpy(index[n],temp_sequence,temp_len);
  fclose(inp); inp=NULL;

  n_sequences=n+1;
  measured_pl_sum=(long)(index[n_sequences-1]-index[0])+(long)temp_len;

  printf("done [read %li sequences (%li amino acids)]\n",n_sequences,measured_pl_sum);fflush(stdout);

  /* generating Markov probabilities */

  printf("generating Markov probability matrix...");fflush(stdout);

  srand(time(0)); /* replace with constant to re-generate identical random databases */

  /* MP and row_sum were zeroed by calloc */
  for(j=0;j<=20;j++) {
    measured_aa_freq[j]=0;generated_aa_freq[j]=0;
  }

  for(protein=0;protein<n_sequences;protein++)
  {
    if (!(protein%1000)) {printf(".");fflush(stdout);}
    seq=index[protein];
    if (protein<(n_sequences-1)) pl=(long)(index[protein+1]-index[protein]);
    else pl=(long)temp_len; /* the last record is still described by temp_len */

    for(i=0;i<pl;i++)
    {
      if (strchr(NOT_AMINO_ACIDS,seq[i])!=NULL)
      {
        /* replace B, X, Z etc. with random amino acid to preserve size distribution;
           dividing by RAND_MAX+1 keeps the result in 0..19 */
        a=(int)(20.0*rand()/(RAND_MAX+1.0));
      }
      else if ((hit=strchr(AMINO_ACIDS,seq[i]))!=NULL)
      {
        a=(int)(hit-AMINO_ACIDS); /* current amino acid */
      }
      else
      {
        printf("Unknown amino acid %c",seq[i]);
        continue;
      }
      MP[a][i]++;
      measured_aa_freq[a]++;
    }
    MP[20][pl]++;
    measured_aa_freq[20]++; /* MP[20][n] is the number of sequences of length n in the database */
  }
  printf("done\n"); fflush(stdout);

  for(i=0;i<MAX_SEQUENCE_LENGTH;i++) for(j=0;j<=20;j++) row_sum[i]+=MP[j][i];

  /* generate random protein sequences through Markov chain */

  if ((outp = fopen(outfile, "w"))==NULL) {printf("error opening output file %s\n",outfile); goto cleanup;}
  printf("generating %li random protein sequences",sequences_to_generate);fflush(stdout);

  for(protein=0;protein<sequences_to_generate;protein++)
  {
    if (!(protein%1000)) {printf(".");fflush(stdout);}
    j=0;
    /* Stop at MAX_SEQUENCE_LENGTH, or at a position for which the database
       gave no counts at all (nothing to sample from). */
    while (j<MAX_SEQUENCE_LENGTH && row_sum[j]>0)
    {
      /* x is uniform in [0,row_sum[j]); pick the first row whose cumulative
         count exceeds it, so rows with a zero count are never selected */
      x=(double)row_sum[j]*(rand()/(RAND_MAX+1.0));
      i=0; partial_sum=MP[0][j];
      while (partial_sum<=x && i<20) {i++; partial_sum+=MP[i][j];}
      if (i==20) break; /* row 20 is "sequence ends here" */
      random_sequence[j]=AMINO_ACIDS[i];j++;generated_aa_freq[i]++;
    }

    generated_aa_freq[20]++; generated_pl_sum+=j;

    /* header, then the residues wrapped at 60 per line; an empty sequence
       still gets its (blank) sequence line */
    fprintf(outp,">%srp%li\n",prefix_string,protein);
    if (j==0) fputc('\n',outp);
    for(l=0;l<j;l+=60)
    {
      chunk=(size_t)((j-l)<60 ? (j-l) : 60);
      fwrite(random_sequence+l,sizeof(char),chunk,outp);
      fputc('\n',outp);
    }
  }

  fclose(outp); outp=NULL;

  printf("done (wrote %li random protein sequences to %s)\n",sequences_to_generate,outfile);

  k=0;l=0;
  for(i=0;i<=20;i++) {k+=measured_aa_freq[i];l+=generated_aa_freq[i];}
  printf("<f(aa) in %s> <f(aa) in %s>\n",infile,outfile);
  for(i=0;i<=20;i++) printf("%f %f\n",(float)measured_aa_freq[i]/k,(float)generated_aa_freq[i]/l);
  printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);

  result = Qfalse; /* historical success value (the original returned C 0) */

cleanup:
  /* single exit point: release every file handle and buffer on all paths */
  if (inp!=NULL) fclose(inp);
  if (outp!=NULL) fclose(outp);
  free(index);
  free(sequence);
  free(row_sum);
  free(MP);
  free(random_sequence);
  free(temp_sequence);
  free(line);

  return result;

}
|
224
|
+
|
225
|
+
/* ruby calls this to load the extension */
|
226
|
+
void Init_protk(void) {
|
227
|
+
/* define the top-level Protk class, with Object as its superclass */
|
228
|
+
VALUE klass = rb_define_class("Protk",
|
229
|
+
rb_cObject);
|
230
|
+
|
231
|
+
/* expose protk_make_decoys to ruby as the singleton method
|
232
|
+
* "Protk.make_decoys"; the 4 is the number of ruby arguments it takes */
|
233
|
+
rb_define_singleton_method(klass,
|
234
|
+
"make_decoys", protk_make_decoys, 4);
|
235
|
+
}
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# create rake dependencies
|
2
|
+
# run rakefile
|
3
|
+
#
|
4
|
+
require 'optparse'
|
5
|
+
require 'pathname'
|
6
|
+
require 'protk/tool'
|
7
|
+
require 'protk/command_runner'
|
8
|
+
require 'pp'
|
9
|
+
require 'rake'
|
10
|
+
|
11
|
+
require 'shellwords'

# Tool that runs rake against the big_search rakefile that sits next to this
# source file.
class BigSearchTool < Tool

  # Builds a "rake -f <rakefile> <input files...>" command line and runs it in
  # a local shell via CommandRunner#run_local.
  #
  # Every path is shell-escaped, so file names containing spaces or shell
  # metacharacters reach rake as single, literal arguments.
  #
  # input_files - Array of arguments appended after the rakefile.
  #
  # Returns whatever CommandRunner#run_local returns.
  def run input_files
    rake_args = input_files.map { |path| Shellwords.escape(path.to_s) }
    command = "rake -f #{Shellwords.escape(rakefile_path)} #{rake_args.join(" ")}"
    runner = CommandRunner.new(Constants.new)
    runner.run_local(command)
  end

  # Path of big_search_rakefile.rake in the same directory as this file.
  def rakefile_path
    "#{File.dirname(__FILE__)}/big_search_rakefile.rake"
  end

end
|
@@ -0,0 +1,210 @@
|
|
1
|
+
# Add methods to the Bio::SPTR class to retrieve objects using the keys defined in protein_annotator.rb
|
2
|
+
#
|
3
|
+
# newColumnKeys=['recname','cd','altnames','accessions','location','function','ipi','intact','pride','ensembl','refsMASS SPEC','refsNUCLEOTIDE SEQUENCE','refsX-RAY CRYSTALLOGRAPHY','refs3D-STRUCTURE MODELLING','refsPROTEIN SEQUENCE','refsGLYCOSYLATION','glycosites']
|
4
|
+
#
|
5
|
+
#
|
6
|
+
|
7
|
+
## We start the columns off with the header name
|
8
|
+
#newColumns={'recname'=>["Primary Name"],'cd'=>["CD Antigen Name"],'altnames'=>["Alternate Names"],
|
9
|
+
# 'accessions' =>["Swissprot Accessions"],
|
10
|
+
# 'location' => ["Subcellular Location"],
|
11
|
+
# 'function' => ["Known Function"],
|
12
|
+
# 'ipi' => ["IPI"],
|
13
|
+
# 'intact' => ["Interactions"],
|
14
|
+
# 'pride' => ['Pride'],
|
15
|
+
# 'ensembl'=> ['Ensembl'],
|
16
|
+
# 'refsMASS SPEC'=>["MS Refs"],
|
17
|
+
# 'refsGLYCOSYLATION'=>["Glyco Refs"],
|
18
|
+
# 'refsNUCLEOTIDE SEQUENCE'=>["Nucleotide Refs"],
|
19
|
+
# 'refsX-RAY CRYSTALLOGRAPHY'=>["Crystallography Refs"],
|
20
|
+
# 'refs3D-STRUCTURE MODELLING'=>["3D-Modelling Refs"],
|
21
|
+
# 'refsPROTEIN SEQUENCE'=>["Protein sequence Refs"],
|
22
|
+
# 'glycosites'=>["Glycosylation Sites"]
|
23
|
+
#}
|
24
|
+
require 'rubygems'
|
25
|
+
require 'bio'
|
26
|
+
|
27
|
+
class Bio::SPTR < Bio::EMBLDB

  #
  # Functions corresponding to retrieving data for specific keys
  #

  # The recommended name for the Protein
  #
  # Returns the value of the first "RecName" entry of the DE field, or "" when
  # there is none.
  def recname
    protk_de_entries.each do |category, _subtype, value|
      return value if category == "RecName"
    end
    ""
  end

  # The CD Antigen name
  #
  # Returns the value of the first "AltName: CD_antigen=..." entry of the DE
  # field, or "" when there is none.
  def cd
    protk_de_entries.each do |category, subtype, value|
      return value if category == "AltName" && subtype == "CD_antigen"
    end
    ""
  end

  # All alternate names
  #
  # Returns the values of every "AltName" entry other than the CD antigen,
  # joined with "; " ("" when there are none).
  def altnames
    others = protk_de_entries.select do |category, subtype, _value|
      category == "AltName" && subtype != "CD_antigen"
    end
    others.map { |_category, _subtype, value| value }.join("; ")
  end

  # SwissProt Accessions
  #
  # Not implemented: always returns "".
  def accessions
    ""
  end

  # Subcellular Location
  #
  def location
    self.cc["SUBCELLULAR LOCATION"].to_s
  end

  # Function
  #
  def function
    self.cc["FUNCTION"].to_s
  end

  # Similarity
  #
  def similarity
    self.cc["SIMILARITY"].to_s
  end

  # Tissue Specificity
  #
  def tissues
    self.cc["TISSUE SPECIFICITY"].to_s
  end

  # Disease
  #
  def disease
    self.cc["DISEASE"].to_s
  end

  # Subunit
  def subunit
    self.cc["SUBUNIT"].to_s
  end

  # Domain
  def domain
    self.cc["DOMAIN"].to_s
  end

  #
  # Getting dr entry
  #

  # Helper Function to create links
  #
  # Returns the first field of the first DR (database cross-reference) entry
  # stored under key, or "" when the entry has no cross-reference for key.
  def safely_get_drentry_for_key(key)
    entries = self.dr[key]
    return "" if entries.nil? || entries.empty?

    entries[0][0]
  end

  # IPI Accession number
  #
  def ipi
    self.safely_get_drentry_for_key("IPI")
  end

  # Intact accession number
  #
  # Looks up the "IntAct" cross-reference. This previously queried "PRIDE",
  # which made it return the same value as #pride.
  def intact
    self.safely_get_drentry_for_key("IntAct")
  end

  # Pride accession number
  #
  def pride
    self.safely_get_drentry_for_key("PRIDE")
  end

  # Ensembl accession number
  #
  def ensembl
    self.safely_get_drentry_for_key("Ensembl")
  end

  # NextBIO accession number
  #
  def nextbio
    self.safely_get_drentry_for_key("NextBio")
  end

  # Number of transmembrane regions
  #
  # Returns the count as a String ("0" when there are none), or "" when the
  # feature table cannot be parsed.
  def num_transmem
    protk_feature_count("TRANSMEM")
  end

  # Number of signal peptide features
  #
  # Returns the count as a String ("0" when there are none), or "" when the
  # feature table cannot be parsed.
  def signalp
    protk_feature_count("SIGNAL")
  end

  private

  # Splits the DE field on ";" and returns one [category, subtype, value]
  # triple for every piece shaped like "Category: Subtype=Value"
  # (e.g. "RecName: Full=Serum albumin"). Pieces without that shape are skipped.
  def protk_de_entries
    matches = self.de.split(";").map { |entry| entry.match(/\s*(.*?):\s*(.*?)=(.*)/) }
    matches.compact.map { |m| [m[1], m[2], m[3]] }
  end

  # Counts the features stored under feature_key in the feature table.
  #
  # On a parse failure a warning is written to stderr and "" is returned, so
  # the warning text never ends up in the returned value.
  def protk_feature_count(feature_key)
    features = self.ft[feature_key]
    (features.nil? ? 0 : features.length).to_s
  rescue
    warn "Warning: Unable to parse feature table for entry #{self.accession}"
    ""
  end

end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'spreadsheet'
|
3
|
+
|
4
|
+
|
5
|
+
# Reads protein accessions and scores out of a BioTools Excel export.
class BioToolsExcelConverter

  # Matches a "Digest Matches ... Score: <score>)" cell; capture 1 is the score.
  DIGEST_MATCHES = /Digest Matches.*?Score:\s(.*)\)/

  # Opens the workbook at filename. The file handle is kept open on purpose,
  # since get_rows reads from the workbook afterwards.
  def initialize(filename)
    @inputBook = Spreadsheet.open File.new("#{filename}")
  end

  # Returns true when any row of the first worksheet of filename has a String
  # first cell matching DIGEST_MATCHES, false otherwise. The file is closed
  # before returning.
  def self.isBiotools(filename)
    File.open("#{filename}") do |file|
      test_sheet = Spreadsheet.open(file).worksheet 0
      test_sheet.each do |row|
        return true if row[0].is_a?(String) && row[0].match(DIGEST_MATCHES)
      end
      false
    end
  end

  # Extracts one [accession, score] pair per "Digest Matches" row of the first
  # worksheet. The accession is the last whitespace-delimited token of the
  # first cell of the row directly above the "Digest Matches" row.
  #
  # Returns an Array of rows, the first being the header
  # ["Accession", "Ion Scores"].
  # Raises RuntimeError when a "Digest Matches" row has no parseable protein
  # line above it.
  def get_rows
    sheet = @inputBook.worksheet 0
    n_rows = sheet.dimensions[1]

    protein_rows = []
    (0...n_rows).each do |row_i|
      cell = sheet.row(row_i)[0]
      # numeric, date and empty cells cannot be "Digest Matches" rows
      next unless cell.is_a?(String)

      digmatch = cell.match(DIGEST_MATCHES)
      next if digmatch.nil?

      # there is no preceding row for row 0; never wrap around to the last row
      text = row_i > 0 ? sheet.row(row_i - 1)[0] : nil
      m = text.is_a?(String) ? text.match(/\s(\S*)\s*$/) : nil
      raise "Badly formed protein line in biotools file ... could not parse protein name from #{text}" if m.nil?

      protein_rows << [m[1], digmatch[1]]
    end

    protein_rows.insert(0, ["Accession", "Ion Scores"])
    protein_rows
  end

end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
#
|
2
|
+
# This file is part of protk
|
3
|
+
# Created by Ira Cooke 15/12/2010
|
4
|
+
#
|
5
|
+
# Runs system commands and provides methods for monitoring output
|
6
|
+
#
|
7
|
+
|
8
|
+
|
9
|
+
require 'open4'
|
10
|
+
require 'protk/constants'
|
11
|
+
|
12
|
+
# Runs shell commands, either directly or through a PBS batch queue, and
# reports their output through the environment's log method.
class CommandRunner

  # The protk environment in which to run commands
  #
  attr_reader :env

  # environment - object responding to log(message, level); it receives every
  #               message this runner produces.
  def initialize(environment)
    @env = environment
  end

  # Runs the given command in a local shell
  #
  # Each stdout line is logged at :info, then each stderr line at :warn. The
  # final message is logged at :info on success and :error otherwise.
  #
  # Returns the status object handed back by Open4::popen4.
  def run_local(command_string)
    @env.log("Command: #{command_string} started", :info)
    status = Open4::popen4("#{command_string} ") do |pid, stdin, stdout, stderr|
      puts "PID #{pid}"

      # Collect stderr on its own thread while stdout is consumed. Reading the
      # two pipes one after the other can deadlock when the child fills the
      # stderr pipe before it closes stdout.
      stderr_reader = Thread.new { stderr.readlines }

      stdout.each { |line| @env.log(line.chomp, :info) }

      # Logged after stdout, which keeps the original message order.
      stderr_reader.value.each { |line| @env.log(line.chomp, :warn) }
    end

    # We terminated with some error code so log as an error
    level = status.success? ? :info : :error
    @env.log("Command: #{command_string} exited with status #{status.to_s}", level)

    status
  end

  # Runs the given command as a background job
  # At present this sends the job to a PBS system, but in future we might support other types of background jobs
  #
  # command_string - the command the job script will run.
  # job_params     - Hash read for :jobid, :vmem (default "900mb") and
  #                  :queue (default "lowmem"); it is not modified.
  # jobscript_path - where the generated job script is written.
  # autodelete     - accepted for compatibility; currently has no effect.
  #
  # Returns the result of run_local for the qsub submission.
  def run_batch(command_string, job_params, jobscript_path, autodelete)
    @env.log("Creating batch file for command: #{command_string}", :info)

    jobid = job_params[:jobid]
    vmem = job_params[:vmem].nil? ? "900mb" : job_params[:vmem]
    queue = job_params[:queue].nil? ? "lowmem" : job_params[:queue]

    job_script = [
      "#!/bin/bash",
      "#PBS -N #{jobid}",
      "#PBS -e pbs.#{jobid}.err",
      "#PBS -o pbs.#{jobid}.log",
      "#PBS -l nodes=1:ppn=1,vmem=#{vmem}",
      "#PBS -q #{queue}",
      "#{command_string}"
    ].join("\n")

    File.open(jobscript_path, 'w') { |f| f.write(job_script) }

    self.run_local("qsub #{jobscript_path}")
  end

end
|