protk 1.1.0.pre

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. data/README.md +85 -0
  2. data/bin/annotate_ids.rb +59 -0
  3. data/bin/big_search.rb +41 -0
  4. data/bin/correct_omssa_retention_times.rb +27 -0
  5. data/bin/feature_finder.rb +76 -0
  6. data/bin/file_convert.rb +157 -0
  7. data/bin/generate_omssa_loc.rb +42 -0
  8. data/bin/interprophet.rb +91 -0
  9. data/bin/make_decoy.rb +64 -0
  10. data/bin/manage_db.rb +123 -0
  11. data/bin/mascot_search.rb +187 -0
  12. data/bin/mascot_to_pepxml.rb +44 -0
  13. data/bin/msgfplus_search.rb +191 -0
  14. data/bin/omssa_search.rb +205 -0
  15. data/bin/peptide_prophet.rb +245 -0
  16. data/bin/pepxml_to_table.rb +78 -0
  17. data/bin/protein_prophet.rb +140 -0
  18. data/bin/protk_setup.rb +31 -0
  19. data/bin/repair_run_summary.rb +113 -0
  20. data/bin/tandem_search.rb +292 -0
  21. data/bin/template_search.rb +144 -0
  22. data/bin/unimod_to_loc.rb +118 -0
  23. data/bin/xls_to_table.rb +46 -0
  24. data/ext/protk/extconf.rb +3 -0
  25. data/ext/protk/protk.c +235 -0
  26. data/lib/protk/big_search_rakefile.rake +16 -0
  27. data/lib/protk/big_search_tool.rb +23 -0
  28. data/lib/protk/bio_sptr_extensions.rb +210 -0
  29. data/lib/protk/biotools_excel_converter.rb +60 -0
  30. data/lib/protk/command_runner.rb +84 -0
  31. data/lib/protk/constants.rb +296 -0
  32. data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
  33. data/lib/protk/data/apt-get_packages.yaml +47 -0
  34. data/lib/protk/data/brew_packages.yaml +10 -0
  35. data/lib/protk/data/default_config.yml +20 -0
  36. data/lib/protk/data/predefined_db.crap.yaml +19 -0
  37. data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
  38. data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
  39. data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
  40. data/lib/protk/data/tandem_params.xml +56 -0
  41. data/lib/protk/data/taxonomy_template.xml +9 -0
  42. data/lib/protk/data/unimod.xml +16780 -0
  43. data/lib/protk/eupathdb_gene_information_table.rb +158 -0
  44. data/lib/protk/galaxy_stager.rb +24 -0
  45. data/lib/protk/galaxy_util.rb +9 -0
  46. data/lib/protk/manage_db_rakefile.rake +484 -0
  47. data/lib/protk/manage_db_tool.rb +181 -0
  48. data/lib/protk/mascot_util.rb +63 -0
  49. data/lib/protk/omssa_util.rb +57 -0
  50. data/lib/protk/plasmodb.rb +50 -0
  51. data/lib/protk/prophet_tool.rb +85 -0
  52. data/lib/protk/protein_annotator.rb +646 -0
  53. data/lib/protk/protxml.rb +137 -0
  54. data/lib/protk/randomize.rb +7 -0
  55. data/lib/protk/search_tool.rb +182 -0
  56. data/lib/protk/setup_rakefile.rake +245 -0
  57. data/lib/protk/setup_tool.rb +19 -0
  58. data/lib/protk/spreadsheet_extensions.rb +78 -0
  59. data/lib/protk/swissprot_database.rb +38 -0
  60. data/lib/protk/tool.rb +182 -0
  61. data/lib/protk/xtandem_defaults.rb +11 -0
  62. data/lib/protk.rb +18 -0
  63. metadata +256 -0
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 18/1/2011
5
+ #
6
+ # Converts an Excel Spreadsheet to a tab delimited table
7
+ #
8
+ #
9
+
10
+ require 'protk/constants'
11
+ require 'protk/command_runner'
12
+ require 'protk/tool'
13
+ require 'spreadsheet'
14
+
15
+ # Setup command-line options for this tool.
16
+ #
17
# Converts the first worksheet of an Excel workbook into a tab delimited text file.
#
# The input workbook is the first positional argument. The output path is taken from
# the tool's explicit output option, falling back to "<input>.csv" when none is given.
tool = Tool.new({:explicit_output => true})
tool.option_parser.banner = "Convert an xls file to a tab delimited table.\n\nUsage: xls_to_table.rb [options] file1.xls"

tool.option_parser.parse!

input_file = ARGV[0]

# Without an input file there is nothing to convert; stop before creating a stray ".csv" file
abort(tool.option_parser.to_s) if input_file.nil?

output_file = tool.explicit_output
output_file = "#{input_file}.csv" if output_file.nil?

# Open the original excel workbook for reading.
# This is done before the output file is created so an unreadable workbook
# does not leave an empty output file behind.
Spreadsheet.client_encoding = 'UTF-8'
input_book = Spreadsheet.open "#{input_file}"
input_sheet = input_book.worksheet 0

# The block form guarantees the output handle is closed even if a row fails to convert
File.open(output_file, 'w') do |output_fh|
  input_sheet.each do |row|
    cells = []
    row.each { |colv| cells << colv.to_s }
    output_fh.write "#{cells.join("\t")}\n"
  end
end
45
+
46
+
@@ -0,0 +1,3 @@
1
+ require 'mkmf'
2
+
3
# Generate the Makefile that builds the native extension as protk/protk
create_makefile 'protk/protk'
data/ext/protk/protk.c ADDED
@@ -0,0 +1,235 @@
1
+ #include <ruby.h>
2
+
3
+
4
+ /* */
5
+ /* make_random.c - make random protein sequence database using Markov chain with transitional */
6
+ /* probabilities from amino acid frequencies in a real database in FASTA format */
7
+ /* */
8
+ /* (c) Magnus Palmblad, Division of Ion Physics, Uppsala University, Sweden, 2001- */
9
+ /* */
10
+ /* Usage: make_random <sequence database> <number of sequences to generate> <output file> */
11
+ /* */
12
+ /* Example: mmpi 562.fasta 1000000 562_random_1000000.fasta */
13
+ /* */
14
+ /* Compile with gcc -o make_random make_random.c -lm */
15
+ /* */
16
+
17
+ #include <stdio.h>
18
+ #include <stdlib.h>
19
+ #include <ctype.h>
20
+ #include <string.h>
21
+ #include <math.h>
22
+
23
+ #define AMINO_ACIDS "ARNDCEQGHILKMFPSTWYV"
24
+ #define NOT_AMINO_ACIDS "BJOUXZ*"
25
+ #define MAX_SEQUENCE_LENGTH 20000
26
+ #define MAX_LINE_LENGTH 20000 /* large enough to read in long header lines */
27
+
28
+ static VALUE protk_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in,VALUE output_file_in,char *prefix_string_in) {
29
+ char *input_file = RSTRING_PTR(input_file_in);
30
+ long sequences_to_generate = NUM2INT(db_length_in);
31
+ char * output_file = RSTRING_PTR(output_file_in);
32
+
33
+ char line[MAX_LINE_LENGTH];
34
+ char settings_line[60][70];
35
+ char infile[255], outfile[255]; /* for reading input and writing output */
36
+ char prefix_string[255];
37
+ char *p,**index;
38
+ char *sequence;
39
+ char one_sequence[MAX_SEQUENCE_LENGTH],random_sequence[(int)(MAX_SEQUENCE_LENGTH*1.5)],random_sequence_output[(int)(MAX_SEQUENCE_LENGTH*1.5)];
40
+ char *temp_sequence;
41
+ int a;
42
+ FILE *inp, *outp;
43
+
44
+ long i, j, k, l, n, n_sequences, protein;
45
+ long MP[21][MAX_SEQUENCE_LENGTH];
46
+ long measured_aa_freq[21], generated_aa_freq[21], measured_pl_sum=0, generated_pl_sum=0;
47
+ long row_sum[MAX_SEQUENCE_LENGTH],partial_sum;
48
+ long one_index,pl;
49
+ double x;
50
+
51
+ /* scanning sequence database */
52
+
53
+ strcpy(infile,input_file);
54
+ if ((inp = fopen(infile, "r"))==NULL) {printf("error opening sequence database %s\n",infile);return -1;}
55
+ printf("scanning sequence database %s",infile);fflush(stdout);
56
+ i=0;n=0;k=0;
57
+ while (fgets(line, MAX_LINE_LENGTH, inp) != NULL) {i++; if(line[0]=='>') {if (!(n%1000)) printf(".");fflush(stdout); n++;} }
58
+
59
+ n_sequences=n;
60
+
61
+
62
+ /* reading sequence database */
63
+
64
+ temp_sequence=(char*)calloc(sizeof(char),MAX_SEQUENCE_LENGTH);
65
+ sequence=(char*)malloc(sizeof(char)*(i*80)); /* allocate enough memory for 80 characters per line in FASTA database */
66
+ index=(char**)malloc(sizeof(char*)*n_sequences);
67
+ index[0]=sequence; /* set first index pointer to beginning of first database sequence */
68
+
69
+ if ((inp = fopen(infile, "r"))==NULL) {printf("error opening sequence database %s\n",infile);return -1;}
70
+
71
+ printf("done\nreading sequence database %s",infile);fflush(stdout);
72
+ n=-1;
73
+ strcpy(temp_sequence,"\0");
74
+ while (fgets(line, MAX_LINE_LENGTH, inp) != NULL)
75
+ {
76
+ if (strcmp(line,"\n")==0) {
77
+ continue;
78
+ }
79
+ if (line[0]=='>') {
80
+ if (n>=0) {
81
+ if (!(n%1000)&&n>0) {
82
+ printf(".");fflush(stdout);
83
+ }
84
+ strcpy(index[n],temp_sequence);
85
+ n++; index[n]=index[n-1]+sizeof(char)*strlen(temp_sequence);
86
+ strcpy(temp_sequence,"\0");
87
+ }
88
+ else
89
+ {
90
+ n++;
91
+ strcpy(temp_sequence,"\0");
92
+ }
93
+ }
94
+ else
95
+ {
96
+ if ( (strlen(temp_sequence)+strlen(line))>=MAX_SEQUENCE_LENGTH ) continue;
97
+ strncat(temp_sequence,line,strlen(line)-1);
98
+ }
99
+ }
100
+
101
+ strcpy(index[n],temp_sequence);
102
+ fclose(inp);
103
+
104
+ n_sequences=n+1;
105
+
106
+ printf("done [read %li sequences (%li amino acids)]\n",n_sequences,(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence));fflush(stdout);
107
+ measured_pl_sum=(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence);
108
+
109
+
110
+
111
+ /* generating Markov probabilities */
112
+
113
+ printf("generating Markov probability matrix...");fflush(stdout);
114
+
115
+ srand(time(0)); /* replace with constant to re-generate identical random databases */
116
+
117
+ for(i=0;i<MAX_SEQUENCE_LENGTH;i++) {
118
+ for(j=0;j<=20;j++) {
119
+ MP[j][i]=0;
120
+ }
121
+ }
122
+ for(j=0;j<=20;j++) {
123
+ measured_aa_freq[j]=0;generated_aa_freq[j]=0;
124
+ }
125
+
126
+ for(protein=0;protein<n_sequences;protein++)
127
+ {
128
+ if (!(protein%1000)) {printf(".");fflush(stdout);}
129
+ if (protein<(n_sequences-1))
130
+ {
131
+ strncpy(one_sequence,index[protein],(index[protein+1]-index[protein])/sizeof(char));
132
+ one_sequence[(index[protein+1]-index[protein])/sizeof(char)]='\0';
133
+ }
134
+ else strcpy(one_sequence,index[protein]);
135
+ pl=strlen(one_sequence);
136
+ n=1;one_index=0;
137
+
138
+ for(i=0;i<pl;i++)
139
+ {
140
+ if(strpbrk(NOT_AMINO_ACIDS,(const char *)&one_sequence)==NULL)
141
+ {
142
+ if ( strchr(AMINO_ACIDS,one_sequence[i])==NULL)
143
+ {
144
+ printf("Unknown amino acid %c",one_sequence[i]);
145
+ } else {
146
+ a=20-strlen(strchr(AMINO_ACIDS,one_sequence[i])); // current amino acid
147
+ MP[a][i]++;
148
+ measured_aa_freq[a]++;
149
+ }
150
+ }
151
+ else {a=floor(20*(float)rand()/RAND_MAX);MP[a][i]++; measured_aa_freq[a]++;} // replace B, X, Z etc. with random amino acid to preserve size distribution
152
+ }
153
+ MP[20][pl]++;
154
+ measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
155
+ }
156
+ printf("done\n"); fflush(stdout);
157
+
158
+
159
+
160
+ for(i=0;i<MAX_SEQUENCE_LENGTH;i++) row_sum[i]=0;
161
+ for(i=0;i<MAX_SEQUENCE_LENGTH;i++) for(j=0;j<=20;j++) row_sum[i]+=MP[j][i];
162
+
163
+
164
+ /* generate random protein sequences through Markov chain */
165
+
166
+ strcpy(outfile,output_file);
167
+ if ((outp = fopen(outfile, "w"))==NULL) {printf("error opening output file %s\n",outfile); return -1;}
168
+ printf("generating %li random protein sequences",sequences_to_generate);fflush(stdout);
169
+
170
+ strcpy(prefix_string,RSTRING_PTR(prefix_string_in));
171
+
172
+ for(protein=0;protein<sequences_to_generate;protein++)
173
+ {
174
+ if (!(protein%1000)) {printf(".");fflush(stdout);}
175
+ i=0; j=0;
176
+ while (1)
177
+ {
178
+ x=(double)row_sum[j]*((double)rand()/RAND_MAX);
179
+ partial_sum=MP[0][j]; i=1;
180
+ while (partial_sum<x) {partial_sum+=MP[i][j]; i++;}
181
+ if (j>=MAX_SEQUENCE_LENGTH) i=21; /* terminate when sequence has reached MAX_SEQUENCE_LENGTH */
182
+ if (i<21)
183
+ {
184
+ random_sequence[j]=AMINO_ACIDS[i-1];j++;generated_aa_freq[i-1]++;
185
+ }
186
+ else /* i==21, i.e. protein sequence terminated */
187
+ {
188
+ k=0; generated_aa_freq[20]++; generated_pl_sum+=j;
189
+ for(l=0;l<j;l++)
190
+ {
191
+ random_sequence_output[k]=random_sequence[l]; k++;
192
+ if (!((k+1)%61))
193
+ {
194
+ random_sequence_output[k]='\n'; k++;
195
+ }
196
+ }
197
+
198
+ random_sequence_output[k]='\0';
199
+ if (!(k%61)) random_sequence_output[k-1]='\0'; /* remove extra newline for sequence length multiple of 60 */
200
+ fprintf(outp,">%srp%li\n%s\n",prefix_string,protein,random_sequence_output);
201
+ break;
202
+ }
203
+ }
204
+ }
205
+
206
+ fclose(outp);
207
+
208
+
209
+ /* freeing some memory... */
210
+
211
+ free(index);
212
+
213
+ printf("done (wrote %li random protein sequences to %s)\n",sequences_to_generate,outfile);
214
+
215
+ k=0;l=0;
216
+ for(i=0;i<=20;i++) {k+=measured_aa_freq[i];l+=generated_aa_freq[i];}
217
+ printf("<f(aa) in %s> <f(aa) in %s>\n",infile,outfile);
218
+ for(i=0;i<=20;i++) printf("%f %f\n",(float)measured_aa_freq[i]/k,(float)generated_aa_freq[i]/l);
219
+ printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
220
+
221
+ return 0;
222
+
223
+ }
224
+
225
+ /* ruby calls this to load the extension */
226
+ void Init_protk(void) {
227
+ /* assume we haven't yet defined Hola */
228
+ VALUE klass = rb_define_class("Protk",
229
+ rb_cObject);
230
+
231
+ /* the hola_bonjour function can be called
232
+ * from ruby as "Hola.bonjour" */
233
+ rb_define_singleton_method(klass,
234
+ "make_decoys", protk_make_decoys, 4);
235
+ }
@@ -0,0 +1,16 @@
1
+ require 'protk/constants'
2
+
3
# Shared protk environment for the tasks defined in this rakefile
$genv = Constants.new

# Every command line argument becomes a file task that prints its own name.
# Each one is attached to :run as a multitask prerequisite.
ARGV.each do |fname|
  file(fname) { puts fname }

  multitask :run => fname
end

# Running rake without naming a task runs :run
task :default => :run
@@ -0,0 +1,23 @@
1
+ # create rake dependencies
2
+ # run rakefile
3
+ #
4
+ require 'optparse'
5
+ require 'pathname'
6
+ require 'protk/tool'
7
+ require 'protk/command_runner'
8
+ require 'pp'
9
+ require 'rake'
10
+
11
# Tool that hands a list of input files to the big search rakefile
class BigSearchTool < Tool

  # Path of the rakefile that sits next to this source file
  def rakefile_path
    "#{File.dirname(__FILE__)}/big_search_rakefile.rake"
  end

  # Runs rake in a local shell with the big search rakefile, passing each of the
  # given input_files as a command line argument
  def run input_files
    file_args = input_files.join(" ")
    CommandRunner.new(Constants.new).run_local("rake -f #{rakefile_path} #{file_args}")
  end

end
@@ -0,0 +1,210 @@
1
+ # Add methods to the Bio::SPTR class to retrieve objects using the keys defined in proteinannotator.rb
2
+ #
3
+ # newColumnKeys=['recname','cd','altnames','accessions','location','function','ipi','intact','pride','ensembl','refsMASS SPEC','refsNUCLEOTIDE SEQUENCE','refsX-RAY CRYSTALLOGRAPHY','refs3D-STRUCTURE MODELLING','refsPROTEIN SEQUENCE','refsGLYCOSYLATION','glycosites']
4
+ #
5
+ #
6
+
7
+ ## We start the columns off with the header name
8
+ #newColumns={'recname'=>["Primary Name"],'cd'=>["CD Antigen Name"],'altnames'=>["Alternate Names"],
9
+ # 'accessions' =>["Swissprot Accessions"],
10
+ # 'location' => ["Subcellular Location"],
11
+ # 'function' => ["Known Function"],
12
+ # 'ipi' => ["IPI"],
13
+ # 'intact' => ["Interactions"],
14
+ # 'pride' => ['Pride'],
15
+ # 'ensembl'=> ['Ensembl'],
16
+ # 'refsMASS SPEC'=>["MS Refs"],
17
+ # 'refsGLYCOSYLATION'=>["Glyco Refs"],
18
+ # 'refsNUCLEOTIDE SEQUENCE'=>["Nucleotide Refs"],
19
+ # 'refsX-RAY CRYSTALLOGRAPHY'=>["Crystallography Refs"],
20
+ # 'refs3D-STRUCTURE MODELLING'=>["3D-Modelling Refs"],
21
+ # 'refsPROTEIN SEQUENCE'=>["Protein sequence Refs"],
22
+ # 'glycosites'=>["Glycosylation Sites"]
23
+ #}
24
+ require 'rubygems'
25
+ require 'bio'
26
+
27
class Bio::SPTR < Bio::EMBLDB

  #
  # Functions corresponding to retrieving data for specific keys
  #

  # Helper that parses the DE (description) field.
  #
  # The field is split on ";" and every piece shaped like "Category: Type=Value"
  # is returned as a [category, type, value] triple. Pieces that do not match are skipped.
  #
  def protk_de_entries
    triples = []
    self.de.split(";").each do |entry|
      m = entry.match(/\s*(.*?):\s*(.*?)=(.*)/)
      triples << [m[1], m[2], m[3]] unless m.nil?
    end
    triples
  end

  # The recommended name for the Protein
  # Returns "" when the description has no RecName entry
  #
  def recname
    protk_de_entries.each do |category, _type, value|
      return value if category == "RecName"
    end
    ""
  end

  # The CD Antigen name
  # Returns "" when the description has no CD_antigen AltName entry
  #
  def cd
    protk_de_entries.each do |category, type, value|
      return value if category == "AltName" && type == "CD_antigen"
    end
    ""
  end

  # All alternate names (AltName entries other than CD_antigen) joined with "; "
  #
  def altnames
    names = []
    protk_de_entries.each do |category, type, value|
      names << value if category == "AltName" && type != "CD_antigen"
    end
    names.join("; ")
  end

  # SwissProt Accessions
  # Not implemented: always returns ""
  #
  def accessions
    return ""
  end

  # Subcellular Location
  #
  def location
    return self.cc["SUBCELLULAR LOCATION"].to_s
  end

  # Function
  #
  def function
    return self.cc["FUNCTION"].to_s
  end

  # Similarity
  #
  def similarity
    return self.cc["SIMILARITY"].to_s
  end

  # Tissue Specificity
  #
  def tissues
    return self.cc["TISSUE SPECIFICITY"].to_s
  end

  # Disease
  #
  def disease
    return self.cc["DISEASE"].to_s
  end

  # Subunit
  #
  def subunit
    return self.cc["SUBUNIT"].to_s
  end

  # Domain
  #
  def domain
    return self.cc["DOMAIN"].to_s
  end

  #
  # Getting dr entry
  #

  # Helper Function to create links
  # Returns the first field of the first DR (database cross reference) entry stored
  # under key, or "" when there is no entry for that key
  #
  def safely_get_drentry_for_key(key)
    entries = self.dr[key]
    return "" if entries.nil? || entries.empty? || entries[0].nil?

    return entries[0][0]
  end

  # IPI Accession number
  #
  def ipi
    return self.safely_get_drentry_for_key("IPI")
  end

  # IntAct accession number
  # (previously looked up "PRIDE" by mistake, so it returned the same value as #pride)
  #
  def intact
    return self.safely_get_drentry_for_key("IntAct")
  end

  # Pride accession number
  #
  def pride
    return self.safely_get_drentry_for_key("PRIDE")
  end

  # Ensembl accession number
  #
  def ensembl
    return self.safely_get_drentry_for_key("Ensembl")
  end

  # NextBIO accession number
  #
  def nextbio
    return self.safely_get_drentry_for_key("NextBio")
  end

  # Helper that counts the features stored in the feature table under feature_key.
  #
  # Returns the count as a String ("0" when the key is absent). If reading the
  # feature table raises, a warning is printed with p and the result of that p call
  # is returned.
  #
  def protk_feature_count(feature_key)
    begin
      features = self.ft[feature_key]
      return 0.to_s if features.nil?
      return features.length.to_s
    rescue
      p "Warning: Unable to parse feature table for entry #{self.accession}"
    end
  end

  # Number of transmembrane regions
  #
  def num_transmem
    protk_feature_count("TRANSMEM")
  end

  # Number of signal peptide features
  #
  def signalp
    protk_feature_count("SIGNAL")
  end

end
@@ -0,0 +1,60 @@
1
+ require 'rubygems'
2
+ require 'spreadsheet'
3
+
4
+
5
# Reads protein identifications out of the first worksheet of a BioTools Excel export.
class BioToolsExcelConverter

  # Matches the "Digest Matches ... Score: <score>)" cell that marks a protein hit
  # and captures the score text
  DIGEST_MATCHES_PATTERN = /Digest Matches.*?Score:\s(.*)\)/

  # Opens the workbook stored at filename.
  # The file handle is handed to Spreadsheet and stays open for the life of the workbook.
  def initialize(filename)
    @inputBook = Spreadsheet.open File.new("#{filename}")
  end

  # Returns true when any row of the first worksheet of filename starts with a
  # "Digest Matches" cell, false otherwise.
  def self.isBiotools(filename)
    isbiotools = false

    # The block form closes the file once the sheet has been scanned
    File.open("#{filename}") do |fh|
      testSheet = Spreadsheet.open(fh).worksheet 0
      testSheet.each do |row|
        first_cell = row[0]
        isbiotools = true if first_cell.is_a?(String) && first_cell.match(DIGEST_MATCHES_PATTERN)
      end
    end

    isbiotools
  end

  # Returns a table (array of rows) with the header row ["Accession","Ion Scores"]
  # followed by one [accession, score] row per "Digest Matches" row in the first worksheet.
  #
  # The accession is the last whitespace separated token of the first cell in the row
  # directly above the "Digest Matches" row.
  #
  # Raises RuntimeError when that cell is missing or holds no such token.
  def get_rows
    sheet = @inputBook.worksheet 0

    n_rows = sheet.dimensions[1]

    protein_rows = []
    (0...n_rows).each do |row_i|
      first_cell = sheet.row(row_i)[0]
      # Only text cells can hold a "Digest Matches" marker (numeric cells have no match method)
      next unless first_cell.is_a?(String)

      digmatch = first_cell.match(DIGEST_MATCHES_PATTERN)
      next if digmatch.nil?

      # A marker on the very first row has no protein line above it
      text = row_i > 0 ? sheet.row(row_i-1)[0] : nil
      m = text.is_a?(String) ? text.match(/\s(\S*)\s*$/) : nil
      raise "Badly formed protein line in biotools file ... could not parse protein name from #{text}" if m.nil?

      protein_rows << [m[1], digmatch[1]]
    end

    protein_rows.insert(0,["Accession","Ion Scores"])

    protein_rows
  end

end
@@ -0,0 +1,84 @@
1
+ #
2
+ # This file is part of protk
3
+ # Created by Ira Cooke 15/12/2010
4
+ #
5
+ # Runs system commands and provides methods for monitoring output
6
+ #
7
+
8
+
9
+ require 'open4'
10
+ require 'protk/constants'
11
+
12
# Runs shell commands and reports their output through the protk environment's log
class CommandRunner

  # The protk environment in which to run commands
  #
  attr_reader :env

  # environment must respond to log(message, level)
  #
  def initialize(environment)
    @env=environment
  end

  # Runs the given command in a local shell
  #
  # Every line the command writes to stdout is logged at :info and every line
  # written to stderr at :warn. The exit status is logged at :info when it equals 0
  # and at :error otherwise.
  #
  # Returns the status object produced by Open4::popen4
  #
  def run_local(command_string)
    @env.log("Command: #{command_string} started",:info)
    status = Open4::popen4("#{command_string} ") do |pid, stdin, stdout, stderr|
      puts "PID #{pid}"

      # Drain stderr in the background while stdout is being read. Reading the two
      # pipes one after the other can deadlock when the command fills the stderr
      # pipe before it closes stdout.
      stderr_reader = Thread.new { stderr.readlines }

      stdout.each { |line| @env.log(line.chomp,:info) }

      # Logged after stdout, from this thread, so the log order is unchanged
      stderr_reader.value.each { |line| @env.log(line.chomp,:warn) }
    end

    # A non zero status means we terminated with some error code, so log as an error
    level = ( status!=0 ) ? :error : :info
    @env.log( "Command: #{command_string} exited with status #{status.to_s}",level)

    status
  end

  # Runs the given command as a background job
  # At present this sends the job to a PBS system, but in future we might support other types of background jobs
  #
  # job_params is a Hash read for :jobid, :vmem and :queue. Missing :vmem and :queue
  # values are filled in (in the given Hash) with "900mb" and "lowmem".
  # The job script is written to jobscript_path and submitted with qsub.
  # autodelete is accepted for compatibility but currently has no effect.
  #
  # Returns the result of run_local for the qsub command
  #
  def run_batch(command_string,job_params,jobscript_path,autodelete)
    @env.log("Creating batch file for command: #{command_string}",:info)

    jobid=job_params[:jobid]
    job_params[:vmem]="900mb" if job_params[:vmem].nil?
    job_params[:queue]="lowmem" if job_params[:queue].nil?

    job_script=[
      "#!/bin/bash",
      "#PBS -N #{jobid}",
      "#PBS -e pbs.#{jobid}.err",
      "#PBS -o pbs.#{jobid}.log",
      "#PBS -l nodes=1:ppn=1,vmem=#{job_params[:vmem]}",
      "#PBS -q #{job_params[:queue]}",
      "#{command_string}"
    ].join("\n")

    File.open(jobscript_path, 'w') {|f| f.write(job_script) }

    self.run_local("qsub #{jobscript_path}")
  end

end