protk 1.1.0.pre
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +85 -0
- data/bin/annotate_ids.rb +59 -0
- data/bin/big_search.rb +41 -0
- data/bin/correct_omssa_retention_times.rb +27 -0
- data/bin/feature_finder.rb +76 -0
- data/bin/file_convert.rb +157 -0
- data/bin/generate_omssa_loc.rb +42 -0
- data/bin/interprophet.rb +91 -0
- data/bin/make_decoy.rb +64 -0
- data/bin/manage_db.rb +123 -0
- data/bin/mascot_search.rb +187 -0
- data/bin/mascot_to_pepxml.rb +44 -0
- data/bin/msgfplus_search.rb +191 -0
- data/bin/omssa_search.rb +205 -0
- data/bin/peptide_prophet.rb +245 -0
- data/bin/pepxml_to_table.rb +78 -0
- data/bin/protein_prophet.rb +140 -0
- data/bin/protk_setup.rb +31 -0
- data/bin/repair_run_summary.rb +113 -0
- data/bin/tandem_search.rb +292 -0
- data/bin/template_search.rb +144 -0
- data/bin/unimod_to_loc.rb +118 -0
- data/bin/xls_to_table.rb +46 -0
- data/ext/protk/extconf.rb +3 -0
- data/ext/protk/protk.c +235 -0
- data/lib/protk/big_search_rakefile.rake +16 -0
- data/lib/protk/big_search_tool.rb +23 -0
- data/lib/protk/bio_sptr_extensions.rb +210 -0
- data/lib/protk/biotools_excel_converter.rb +60 -0
- data/lib/protk/command_runner.rb +84 -0
- data/lib/protk/constants.rb +296 -0
- data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
- data/lib/protk/data/apt-get_packages.yaml +47 -0
- data/lib/protk/data/brew_packages.yaml +10 -0
- data/lib/protk/data/default_config.yml +20 -0
- data/lib/protk/data/predefined_db.crap.yaml +19 -0
- data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
- data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
- data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
- data/lib/protk/data/tandem_params.xml +56 -0
- data/lib/protk/data/taxonomy_template.xml +9 -0
- data/lib/protk/data/unimod.xml +16780 -0
- data/lib/protk/eupathdb_gene_information_table.rb +158 -0
- data/lib/protk/galaxy_stager.rb +24 -0
- data/lib/protk/galaxy_util.rb +9 -0
- data/lib/protk/manage_db_rakefile.rake +484 -0
- data/lib/protk/manage_db_tool.rb +181 -0
- data/lib/protk/mascot_util.rb +63 -0
- data/lib/protk/omssa_util.rb +57 -0
- data/lib/protk/plasmodb.rb +50 -0
- data/lib/protk/prophet_tool.rb +85 -0
- data/lib/protk/protein_annotator.rb +646 -0
- data/lib/protk/protxml.rb +137 -0
- data/lib/protk/randomize.rb +7 -0
- data/lib/protk/search_tool.rb +182 -0
- data/lib/protk/setup_rakefile.rake +245 -0
- data/lib/protk/setup_tool.rb +19 -0
- data/lib/protk/spreadsheet_extensions.rb +78 -0
- data/lib/protk/swissprot_database.rb +38 -0
- data/lib/protk/tool.rb +182 -0
- data/lib/protk/xtandem_defaults.rb +11 -0
- data/lib/protk.rb +18 -0
- metadata +256 -0
data/bin/xls_to_table.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Created by Ira Cooke 18/1/2011
|
5
|
+
#
|
6
|
+
# Converts an Excel Spreadsheet to a tab delimited table
|
7
|
+
#
|
8
|
+
#
|
9
|
+
|
10
|
+
require 'protk/constants'
|
11
|
+
require 'protk/command_runner'
|
12
|
+
require 'protk/tool'
|
13
|
+
require 'spreadsheet'
|
14
|
+
|
15
|
+
# Setup command-line options for this tool.
|
16
|
+
#
|
17
|
+
tool = Tool.new({:explicit_output=>true})
tool.option_parser.banner = "Convert an xls file to a tab delimited table.\n\nUsage: xls_to_table.rb [options] file1.xls"

tool.option_parser.parse!

# The single positional argument is the spreadsheet to convert. Without it
# there is nothing to do, so print the usage banner and stop instead of
# creating a stray ".csv" file and failing later inside the spreadsheet reader.
input_file = ARGV[0]
abort(tool.option_parser.banner) if input_file.nil?

# Default output name is the input name with ".csv" appended (the content is
# tab delimited; the extension is kept for compatibility with existing usage).
output_file = tool.explicit_output
output_file = "#{input_file}.csv" if output_file.nil?

# Open the original excel workbook for reading. This happens before the output
# file is created so that an unreadable input does not leave an empty output
# file behind.
Spreadsheet.client_encoding = 'UTF-8'
input_book = Spreadsheet.open "#{input_file}"
input_sheet = input_book.worksheet 0

# Write one tab separated line per spreadsheet row. The block form of
# File.open closes the handle even if a row fails to convert.
File.open(output_file, 'w') do |output_fh|
  input_sheet.each do |row|
    cells = row.map { |colv| "#{colv}" }
    output_fh.write "#{cells.join("\t")}\n"
  end
end
|
45
|
+
|
46
|
+
|
data/ext/protk/protk.c
ADDED
@@ -0,0 +1,235 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
|
3
|
+
|
4
|
+
/* */
|
5
|
+
/* make_random.c - make random protein sequence database using Markov chain with transitional */
|
6
|
+
/* probabilities from amino acid frequencies in a real database in FASTA format */
|
7
|
+
/* */
|
8
|
+
/* (c) Magnus Palmblad, Division of Ion Physics, Uppsala University, Sweden, 2001- */
|
9
|
+
/* */
|
10
|
+
/* Usage: make_random <sequence database> <number of sequences to generate> <output file> */
|
11
|
+
/* */
|
12
|
+
/* Example: mmpi 562.fasta 1000000 562_random_1000000.fasta */
|
13
|
+
/* */
|
14
|
+
/* Compile with gcc -o make_random make_random.c -lm */
|
15
|
+
/* */
|
16
|
+
|
17
|
+
#include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
|
22
|
+
|
23
|
+
#define AMINO_ACIDS "ARNDCEQGHILKMFPSTWYV"
|
24
|
+
#define NOT_AMINO_ACIDS "BJOUXZ*"
|
25
|
+
#define MAX_SEQUENCE_LENGTH 20000
|
26
|
+
#define MAX_LINE_LENGTH 20000 /* large enough to read in long header lines */
|
27
|
+
|
28
|
+
/*
 * Protk.make_decoys(input_file, db_length, output_file, prefix_string)
 *
 * Reads the FASTA database at input_file, builds a position-dependent table of
 * amino-acid counts (MP[a][i] = how often amino acid a was seen at position i,
 * MP[20][n] = how many sequences had length n), and then writes db_length
 * random sequences drawn from that table to output_file. Every generated entry
 * is named ">" + prefix_string + "rp" + running number.
 *
 * Arguments arrive as Ruby VALUEs (the method is registered with arity 4).
 *
 * Returns Ruby false on success and Fixnum -1 when a file cannot be opened,
 * the database contains no sequences, or memory cannot be allocated. These are
 * the values the previous "return 0" / "return -1" statements produced when
 * reinterpreted as VALUEs, so existing callers see no difference.
 */
static VALUE protk_make_decoys(VALUE self, VALUE input_file_in, VALUE db_length_in, VALUE output_file_in, VALUE prefix_string_in) {
  /* Convert the Ruby arguments first: these macros may raise (wrong type,
     embedded NUL), and at this point no C resource is held yet. */
  const char *infile = StringValueCStr(input_file_in);
  const char *outfile = StringValueCStr(output_file_in);
  const char *prefix_string = StringValueCStr(prefix_string_in);
  long sequences_to_generate = NUM2LONG(db_length_in);

  /* One array object for the alphabet so pointer differences are well defined. */
  static const char amino_acids[] = AMINO_ACIDS;

  VALUE result = INT2FIX(-1);               /* failure value until proven otherwise */
  char line[MAX_LINE_LENGTH];
  FILE *inp = NULL, *outp = NULL;
  char *sequence = NULL;                    /* all residues of the database, back to back */
  char *cursor = NULL;                      /* next free byte in sequence */
  char **index = NULL;                      /* index[n] = start of sequence n inside sequence */
  char *random_sequence = NULL;             /* scratch buffer for one generated sequence */
  const char *hit;
  long (*MP)[MAX_SEQUENCE_LENGTH] = NULL;   /* heap allocated: several MB, too large for the stack */
  long *row_sum = NULL;
  long measured_aa_freq[21], generated_aa_freq[21];
  long measured_pl_sum = 0, generated_pl_sum = 0;
  long i, j, k, l, n, n_sequences, protein, pl, partial_sum, chunk;
  size_t total_chars = 0, raw_len, kept_len;
  char residue;
  int a;
  double x;

  /* scanning sequence database */

  if ((inp = fopen(infile, "r"))==NULL) {printf("error opening sequence database %s\n",infile); goto cleanup;}
  printf("scanning sequence database %s",infile);fflush(stdout);
  n=0;
  while (fgets(line, MAX_LINE_LENGTH, inp) != NULL) {
    if (line[0]=='>') {
      if (!(n%1000)) printf(".");
      fflush(stdout);
      n++;
    } else {
      /* Upper bound for the residue buffer: measured, not guessed from an
         assumed 80 characters per line. */
      total_chars+=strlen(line);
    }
  }
  fclose(inp); inp=NULL;                    /* the first pass handle used to be leaked */

  n_sequences=n;
  if (n_sequences==0) {printf("\nno sequences found in %s\n",infile); goto cleanup;}

  /* reading sequence database */

  sequence=(char*)malloc(total_chars+1);
  index=(char**)malloc(sizeof(char*)*(size_t)n_sequences);
  MP=calloc(21,sizeof *MP);                 /* calloc zeroes the count tables */
  row_sum=(long*)calloc(MAX_SEQUENCE_LENGTH,sizeof(long));
  random_sequence=(char*)malloc(MAX_SEQUENCE_LENGTH);
  if (sequence==NULL || index==NULL || MP==NULL || row_sum==NULL || random_sequence==NULL) {
    printf("\nout of memory while reading %s\n",infile); goto cleanup;
  }

  if ((inp = fopen(infile, "r"))==NULL) {printf("error opening sequence database %s\n",infile); goto cleanup;}

  printf("done\nreading sequence database %s",infile);fflush(stdout);
  n=-1;
  cursor=sequence;
  while (fgets(line, MAX_LINE_LENGTH, inp) != NULL)
  {
    if (strcmp(line,"\n")==0) {
      continue;
    }
    if (line[0]=='>') {
      if (n>0 && !(n%1000)) {
        printf(".");fflush(stdout);
      }
      if (n+1>=n_sequences) break;          /* defensive: never index past the allocation */
      n++;
      index[n]=cursor;                      /* a header starts the next sequence */
    }
    else
    {
      if (n<0) continue;                    /* residues before the first header are discarded */
      raw_len=strlen(line);
      /* Same length rule as before: a line that would push the sequence to
         MAX_SEQUENCE_LENGTH or beyond is skipped. */
      if ( ((size_t)(cursor-index[n])+raw_len)>=MAX_SEQUENCE_LENGTH ) continue;
      /* Strip only a real line ending (LF or CRLF); previously the last
         character was dropped unconditionally, losing a residue when the
         final line had no newline. */
      kept_len=strcspn(line,"\r\n");
      memcpy(cursor,line,kept_len);
      cursor+=kept_len;
    }
  }
  *cursor='\0';
  fclose(inp); inp=NULL;

  n_sequences=n+1;
  measured_pl_sum=(long)(cursor-sequence);

  printf("done [read %li sequences (%li amino acids)]\n",n_sequences,measured_pl_sum);fflush(stdout);

  /* generating Markov probabilities */

  printf("generating Markov probability matrix...");fflush(stdout);

  srand((unsigned int)time(NULL)); /* replace with constant to re-generate identical random databases */

  for(j=0;j<=20;j++) {
    measured_aa_freq[j]=0;generated_aa_freq[j]=0;
  }

  for(protein=0;protein<n_sequences;protein++)
  {
    if (!(protein%1000)) {printf(".");fflush(stdout);}
    /* A sequence ends where the next one starts; the last one ends at cursor. */
    pl=(long)(((protein<(n_sequences-1)) ? index[protein+1] : cursor)-index[protein]);

    for(i=0;i<pl;i++)
    {
      residue=index[protein][i];
      if (strchr(NOT_AMINO_ACIDS,residue)!=NULL)
      {
        /* replace B, X, Z etc. with random amino acid to preserve size distribution.
           The test is now made per residue (it used to look at the whole
           sequence), and the draw can no longer land on slot 20, which is
           reserved for sequence termination. */
        a=rand()%20;
      }
      else if ((hit=strchr(amino_acids,residue))!=NULL)
      {
        a=(int)(hit-amino_acids); /* current amino acid */
      }
      else
      {
        printf("Unknown amino acid %c",residue);
        continue;
      }
      MP[a][i]++;
      measured_aa_freq[a]++;
    }
    MP[20][pl]++;
    measured_aa_freq[20]++; /* MP[20][n] is the number of sequences of length n in the database */
  }
  printf("done\n"); fflush(stdout);

  for(i=0;i<MAX_SEQUENCE_LENGTH;i++) for(j=0;j<=20;j++) row_sum[i]+=MP[j][i];

  /* generate random protein sequences through Markov chain */

  if ((outp = fopen(outfile, "w"))==NULL) {printf("error opening output file %s\n",outfile); goto cleanup;}
  printf("generating %li random protein sequences",sequences_to_generate);fflush(stdout);

  for(protein=0;protein<sequences_to_generate;protein++)
  {
    if (!(protein%1000)) {printf(".");fflush(stdout);}
    j=0;
    while (1)
    {
      /* i==21 means "terminate". That is forced when the sequence has reached
         MAX_SEQUENCE_LENGTH (checked before row_sum[j] is read, so the table
         is never indexed out of bounds) or when nothing was observed at this
         position. */
      i=21;
      if (j<MAX_SEQUENCE_LENGTH && row_sum[j]>0)
      {
        /* x is drawn from (0, row_sum[j]] so that a slot with a zero count
           can never be selected. */
        x=(double)row_sum[j]*(((double)rand()+1.0)/((double)RAND_MAX+1.0));
        partial_sum=MP[0][j]; i=1;
        while (partial_sum<x && i<=20) {partial_sum+=MP[i][j]; i++;}
      }
      if (i<21)
      {
        random_sequence[j]=amino_acids[i-1];j++;generated_aa_freq[i-1]++;
      }
      else /* i==21, i.e. protein sequence terminated */
      {
        generated_aa_freq[20]++; generated_pl_sum+=j;
        fprintf(outp,">%srp%li\n",prefix_string,protein);
        /* Emit the residues 60 per line, straight from the scratch buffer. */
        for(l=0;l<j;l+=60)
        {
          chunk=((j-l)<60) ? (j-l) : 60;
          fwrite(random_sequence+l,1,(size_t)chunk,outp);
          fputc('\n',outp);
        }
        if (j==0) fputc('\n',outp); /* an empty sequence still gets its (empty) line */
        break;
      }
    }
  }

  fclose(outp); outp=NULL;

  printf("done (wrote %li random protein sequences to %s)\n",sequences_to_generate,outfile);

  k=0;l=0;
  for(i=0;i<=20;i++) {k+=measured_aa_freq[i];l+=generated_aa_freq[i];}
  printf("<f(aa) in %s> <f(aa) in %s>\n",infile,outfile);
  for(i=0;i<=20;i++) printf("%f %f\n",(float)measured_aa_freq[i]/k,(float)generated_aa_freq[i]/l);
  printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);

  result=Qfalse; /* success: the same VALUE the old "return 0" produced */

cleanup:
  /* Single exit path so that every handle and buffer is released on both
     success and failure. */
  if (inp!=NULL) fclose(inp);
  if (outp!=NULL) fclose(outp);
  free(random_sequence);
  free(row_sum);
  free(MP);
  free(index);
  free(sequence);

  return result;

}
|
224
|
+
|
225
|
+
/* ruby calls this to load the extension */
|
226
|
+
void Init_protk(void) {
|
227
|
+
/* assume we haven't yet defined Hola */
|
228
|
+
VALUE klass = rb_define_class("Protk",
|
229
|
+
rb_cObject);
|
230
|
+
|
231
|
+
/* the hola_bonjour function can be called
|
232
|
+
* from ruby as "Hola.bonjour" */
|
233
|
+
rb_define_singleton_method(klass,
|
234
|
+
"make_decoys", protk_make_decoys, 4);
|
235
|
+
}
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# create rake dependencies
|
2
|
+
# run rakefile
|
3
|
+
#
|
4
|
+
require 'optparse'
|
5
|
+
require 'pathname'
|
6
|
+
require 'protk/tool'
|
7
|
+
require 'protk/command_runner'
|
8
|
+
require 'pp'
|
9
|
+
require 'rake'
|
10
|
+
|
11
|
+
# Tool that hands its input files to rake, using the big_search rakefile that
# sits in the same directory as this source file.
class BigSearchTool < Tool

  # Location of the rakefile, resolved relative to this source file.
  def rakefile_path
    File.dirname(__FILE__) + "/big_search_rakefile.rake"
  end

  # Builds a "rake -f <rakefile> <file> <file> ..." command line from
  # input_files (joined with single spaces) and runs it in a local shell.
  # Returns whatever CommandRunner#run_local returns.
  def run input_files
    targets = input_files.join(" ")
    CommandRunner.new(Constants.new).run_local("rake -f #{rakefile_path} #{targets}")
  end

end
|
@@ -0,0 +1,210 @@
|
|
1
|
+
# Add methods to the Bio::SPTR class to retrieve objects using the keys defined in protein_annotator.rb
|
2
|
+
#
|
3
|
+
# newColumnKeys=['recname','cd','altnames','accessions','location','function','ipi','intact','pride','ensembl','refsMASS SPEC','refsNUCLEOTIDE SEQUENCE','refsX-RAY CRYSTALLOGRAPHY','refs3D-STRUCTURE MODELLING','refsPROTEIN SEQUENCE','refsGLYCOSYLATION','glycosites']
|
4
|
+
#
|
5
|
+
#
|
6
|
+
|
7
|
+
## We start the columns off with the header name
|
8
|
+
#newColumns={'recname'=>["Primary Name"],'cd'=>["CD Antigen Name"],'altnames'=>["Alternate Names"],
|
9
|
+
# 'accessions' =>["Swissprot Accessions"],
|
10
|
+
# 'location' => ["Subcellular Location"],
|
11
|
+
# 'function' => ["Known Function"],
|
12
|
+
# 'ipi' => ["IPI"],
|
13
|
+
# 'intact' => ["Interactions"],
|
14
|
+
# 'pride' => ['Pride'],
|
15
|
+
# 'ensembl'=> ['Ensembl'],
|
16
|
+
# 'refsMASS SPEC'=>["MS Refs"],
|
17
|
+
# 'refsGLYCOSYLATION'=>["Glyco Refs"],
|
18
|
+
# 'refsNUCLEOTIDE SEQUENCE'=>["Nucleotide Refs"],
|
19
|
+
# 'refsX-RAY CRYSTALLOGRAPHY'=>["Crystallography Refs"],
|
20
|
+
# 'refs3D-STRUCTURE MODELLING'=>["3D-Modelling Refs"],
|
21
|
+
# 'refsPROTEIN SEQUENCE'=>["Protein sequence Refs"],
|
22
|
+
# 'glycosites'=>["Glycosylation Sites"]
|
23
|
+
#}
|
24
|
+
require 'rubygems'
|
25
|
+
require 'bio'
|
26
|
+
|
27
|
+
class Bio::SPTR < Bio::EMBLDB

  #
  # Functions corresponding to retrieving data for specific keys
  #

  # The recommended name for the Protein.
  # Returns the value of the first "RecName" entry of the DE field, or "" if
  # there is none.
  #
  def recname
    protk_de_entries.each do |category, _qualifier, value|
      return value if category == "RecName"
    end
    ""
  end

  # The CD Antigen name.
  # Returns the value of the first "AltName: CD_antigen=" entry of the DE
  # field, or "" if there is none.
  #
  def cd
    protk_de_entries.each do |category, qualifier, value|
      return value if category == "AltName" && qualifier == "CD_antigen"
    end
    ""
  end

  # All alternate names.
  # Returns the values of every "AltName" entry except CD antigen names,
  # joined with "; ". Returns "" when there are none.
  #
  def altnames
    names = []
    protk_de_entries.each do |category, qualifier, value|
      names << value if category == "AltName" && qualifier != "CD_antigen"
    end
    names.join("; ")
  end

  # SwissProt Accessions
  # Currently a stub that always returns "".
  # NOTE(review): this may shadow an accessions reader provided by the parent
  # class - confirm before relying on it.
  #
  def accessions
    return ""
  end

  # Subcellular Location
  #
  def location
    return self.cc["SUBCELLULAR LOCATION"].to_s
  end

  # Function
  #
  def function
    return self.cc["FUNCTION"].to_s
  end

  # Similarity
  #
  def similarity
    return self.cc["SIMILARITY"].to_s
  end

  # Tissue Specificity
  #
  def tissues
    return self.cc["TISSUE SPECIFICITY"].to_s
  end

  # Disease
  #
  def disease
    return self.cc["DISEASE"].to_s
  end

  # Subunit
  def subunit
    return self.cc["SUBUNIT"].to_s
  end

  # Domain
  def domain
    return self.cc["DOMAIN"].to_s
  end

  #
  # Getting dr entry
  #

  # Helper Function to create links.
  # Returns the first field of the first DR entry stored under key, or ""
  # when the key is absent or has no entries.
  #
  def safely_get_drentry_for_key(key)
    entries = self.dr[key]
    return "" if entries.nil? || entries.empty?

    first_entry = entries[0]
    return "" if first_entry.nil?

    first_entry[0]
  end

  # IPI Accession number
  #
  def ipi
    return self.safely_get_drentry_for_key("IPI")
  end

  # Intact accession number
  # (previously looked up the "PRIDE" key, duplicating #pride)
  #
  def intact
    return self.safely_get_drentry_for_key("IntAct")
  end

  # Pride accession number
  #
  def pride
    return self.safely_get_drentry_for_key("PRIDE")
  end

  # Ensembl accession number
  #
  def ensembl
    return self.safely_get_drentry_for_key("Ensembl")
  end

  # NextBIO accession number
  #
  def nextbio
    return self.safely_get_drentry_for_key("NextBio")
  end


  # Number of transmembrane regions, as a String.
  # Returns "" (after printing a warning) if the feature table cannot be parsed.
  #
  def num_transmem
    protk_feature_count("TRANSMEM")
  end


  # Number of signal peptide features, as a String.
  # Returns "" (after printing a warning) if the feature table cannot be parsed.
  #
  def signalp
    protk_feature_count("SIGNAL")
  end

  # Splits the DE field on ";" and returns one [category, qualifier, value]
  # triple for every piece of the form "category: qualifier=value". Pieces
  # that do not have that form are dropped.
  #
  def protk_de_entries
    triples = []
    self.de.split(";").each do |entry|
      m = entry.match(/\s*(.*?):\s*(.*?)=(.*)/)
      triples << [m[1], m[2], m[3]] unless m.nil?
    end
    triples
  end
  private :protk_de_entries

  # Counts the features stored under feature_key in the feature table and
  # returns the count as a String ("0" when the key is absent).
  #
  # When reading the feature table raises, a warning is written to stderr and
  # "" is returned. Previously the warning text itself became the return
  # value and ended up wherever the count was expected.
  #
  def protk_feature_count(feature_key)
    features = self.ft[feature_key]
    (features.nil? ? 0 : features.length).to_s
  rescue StandardError
    warn "Warning: Unable to parse feature table for entry #{self.accession}"
    ""
  end
  private :protk_feature_count

end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'spreadsheet'
|
3
|
+
|
4
|
+
|
5
|
+
# Reads a BioTools Excel export and extracts one (accession, score) pair for
# every "Digest Matches ... Score: <score>)" row found in the first worksheet.
class BioToolsExcelConverter

  # Marks a digest summary row; capture group 1 is the score text.
  DIGEST_MATCH_PATTERN = /Digest Matches.*?Score:\s(.*)\)/

  # Opens the workbook stored at filename.
  #
  # NOTE(review): the file handle is deliberately left open here because the
  # workbook is read again later in get_rows - presumably the reader needs
  # the handle at that point; confirm before closing it early.
  def initialize(filename)
    @inputBook = Spreadsheet.open File.new("#{filename}")
  end

  # Returns true when the first worksheet of filename contains at least one
  # digest summary row in its first column, false otherwise.
  #
  # The file is opened with the block form of File.open so the handle is
  # closed once the scan is finished.
  def self.isBiotools(filename)
    File.open("#{filename}") do |fh|
      testSheet = Spreadsheet.open(fh).worksheet 0

      isbiotools = false
      testSheet.each do |row|
        if row[0].is_a?(String) && row[0].match(DIGEST_MATCH_PATTERN)
          isbiotools = true
          break
        end
      end

      isbiotools
    end
  end

  # Returns an Array of rows. The first row is the header
  # ["Accession","Ion Scores"]; every following row is
  # [protein name, score] for one digest summary row of the first worksheet.
  # The protein name is the last whitespace separated word of the first cell
  # in the row directly above the digest summary row.
  #
  # Raises ArgumentError when a digest summary row has no usable protein line
  # above it.
  def get_rows

    sheet=@inputBook.worksheet 0

    n_rows=sheet.dimensions[1]

    protein_rows=[]

    (0...n_rows).each do |row_i|
      cell=sheet.row(row_i)[0]
      # Only text cells can hold a digest summary; numeric or empty cells are
      # skipped instead of failing on a missing #match.
      next unless cell.is_a?(String)

      digmatch=cell.match(DIGEST_MATCH_PATTERN)
      next if digmatch.nil?

      # The protein line is the row above; the very first row has none (and
      # index -1 must not be used to look one up).
      text=(row_i>0) ? sheet.row(row_i-1)[0] : nil
      m=text.is_a?(String) ? text.match(/\s(\S*)\s*$/) : nil
      raise ArgumentError, "Badly formed protein line in biotools file ... could not parse protein name from #{text}" if m.nil?

      protein_rows << [m[1],digmatch[1]]
    end

    protein_rows.insert(0,["Accession","Ion Scores"])

    protein_rows

  end

end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
#
|
2
|
+
# This file is part of protk
|
3
|
+
# Created by Ira Cooke 15/12/2010
|
4
|
+
#
|
5
|
+
# Runs system commands and provides methods for monitoring output
|
6
|
+
#
|
7
|
+
|
8
|
+
|
9
|
+
require 'open4'
|
10
|
+
require 'protk/constants'
|
11
|
+
|
12
|
+
# Runs system commands, either directly in a local shell or by submitting a
# generated job script with qsub, and forwards their output to the logger of
# the supplied environment.
class CommandRunner

  # The protk environment in which to run commands
  #
  attr_reader :env

  # environment must respond to log(message, level); it receives every line
  # of output as well as the start and exit messages.
  def initialize(environment)
    @env=environment
  end

  # Runs the given command in a local shell
  #
  # Each stdout line is logged at :info and each stderr line at :warn. The
  # exit is logged at :info when the command succeeded and at :error
  # otherwise. Returns the status object produced by Open4::popen4.
  #
  def run_local(command_string)
    @env.log("Command: #{command_string} started",:info)
    status = Open4::popen4("#{command_string} ") do |pid, stdin, stdout, stderr|
      puts "PID #{pid}"

      # Drain stderr on its own thread. Reading stdout to the end first and
      # stderr afterwards blocks forever once the child has written enough to
      # stderr to fill the pipe: the child waits for stderr to be read while
      # this process waits for stdout to be closed.
      stderr_reader = Thread.new do
        stderr.each { |line| @env.log(line.chomp,:warn) }
      end

      begin
        stdout.each { |line| @env.log(line.chomp,:info) }
      ensure
        stderr_reader.join
      end
    end

    if status.success?
      @env.log( "Command: #{command_string} exited with status #{status.to_s}",:info)
    else
      # We terminated with some error code so log as an error
      @env.log( "Command: #{command_string} exited with status #{status.to_s}",:error)
    end
    status
  end

  # Runs the given command as a background job
  # At present this sends the job to a PBS system, but in future we might support other types of background jobs
  #
  # job_params is a Hash read for :jobid, :vmem and :queue; missing :vmem and
  # :queue entries are filled in with "900mb" and "lowmem" (the Hash passed in
  # is updated). The job script is written to jobscript_path and submitted
  # with qsub. autodelete is accepted but currently has no effect. Returns the
  # status of the qsub invocation.
  #
  def run_batch(command_string,job_params,jobscript_path,autodelete)
    @env.log("Creating batch file for command: #{command_string}",:info)

    jobid=job_params[:jobid]
    job_params[:vmem]="900mb" if job_params[:vmem].nil?
    job_params[:queue]="lowmem" if job_params[:queue].nil?

    # The directives must start in the first column, so the literal is kept
    # flush left.
    job_script="#!/bin/bash
#PBS -N #{jobid}
#PBS -e pbs.#{jobid}.err
#PBS -o pbs.#{jobid}.log
#PBS -l nodes=1:ppn=1,vmem=#{job_params[:vmem]}
#PBS -q #{job_params[:queue]}
#{command_string}"

    File.open(jobscript_path, 'w') {|f| f.write(job_script) }

    self.run_local("qsub #{jobscript_path}")

  end

end
|