protk 1.1.0.pre

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. data/README.md +85 -0
  2. data/bin/annotate_ids.rb +59 -0
  3. data/bin/big_search.rb +41 -0
  4. data/bin/correct_omssa_retention_times.rb +27 -0
  5. data/bin/feature_finder.rb +76 -0
  6. data/bin/file_convert.rb +157 -0
  7. data/bin/generate_omssa_loc.rb +42 -0
  8. data/bin/interprophet.rb +91 -0
  9. data/bin/make_decoy.rb +64 -0
  10. data/bin/manage_db.rb +123 -0
  11. data/bin/mascot_search.rb +187 -0
  12. data/bin/mascot_to_pepxml.rb +44 -0
  13. data/bin/msgfplus_search.rb +191 -0
  14. data/bin/omssa_search.rb +205 -0
  15. data/bin/peptide_prophet.rb +245 -0
  16. data/bin/pepxml_to_table.rb +78 -0
  17. data/bin/protein_prophet.rb +140 -0
  18. data/bin/protk_setup.rb +31 -0
  19. data/bin/repair_run_summary.rb +113 -0
  20. data/bin/tandem_search.rb +292 -0
  21. data/bin/template_search.rb +144 -0
  22. data/bin/unimod_to_loc.rb +118 -0
  23. data/bin/xls_to_table.rb +46 -0
  24. data/ext/protk/extconf.rb +3 -0
  25. data/ext/protk/protk.c +235 -0
  26. data/lib/protk/big_search_rakefile.rake +16 -0
  27. data/lib/protk/big_search_tool.rb +23 -0
  28. data/lib/protk/bio_sptr_extensions.rb +210 -0
  29. data/lib/protk/biotools_excel_converter.rb +60 -0
  30. data/lib/protk/command_runner.rb +84 -0
  31. data/lib/protk/constants.rb +296 -0
  32. data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
  33. data/lib/protk/data/apt-get_packages.yaml +47 -0
  34. data/lib/protk/data/brew_packages.yaml +10 -0
  35. data/lib/protk/data/default_config.yml +20 -0
  36. data/lib/protk/data/predefined_db.crap.yaml +19 -0
  37. data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
  38. data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
  39. data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
  40. data/lib/protk/data/tandem_params.xml +56 -0
  41. data/lib/protk/data/taxonomy_template.xml +9 -0
  42. data/lib/protk/data/unimod.xml +16780 -0
  43. data/lib/protk/eupathdb_gene_information_table.rb +158 -0
  44. data/lib/protk/galaxy_stager.rb +24 -0
  45. data/lib/protk/galaxy_util.rb +9 -0
  46. data/lib/protk/manage_db_rakefile.rake +484 -0
  47. data/lib/protk/manage_db_tool.rb +181 -0
  48. data/lib/protk/mascot_util.rb +63 -0
  49. data/lib/protk/omssa_util.rb +57 -0
  50. data/lib/protk/plasmodb.rb +50 -0
  51. data/lib/protk/prophet_tool.rb +85 -0
  52. data/lib/protk/protein_annotator.rb +646 -0
  53. data/lib/protk/protxml.rb +137 -0
  54. data/lib/protk/randomize.rb +7 -0
  55. data/lib/protk/search_tool.rb +182 -0
  56. data/lib/protk/setup_rakefile.rake +245 -0
  57. data/lib/protk/setup_tool.rb +19 -0
  58. data/lib/protk/spreadsheet_extensions.rb +78 -0
  59. data/lib/protk/swissprot_database.rb +38 -0
  60. data/lib/protk/tool.rb +182 -0
  61. data/lib/protk/xtandem_defaults.rb +11 -0
  62. data/lib/protk.rb +18 -0
  63. metadata +256 -0
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 18/1/2011
5
+ #
6
+ # Converts an Excel Spreadsheet to a tab delimited table
7
+ #
8
+ #
9
+
10
+ require 'protk/constants'
11
+ require 'protk/command_runner'
12
+ require 'protk/tool'
13
+ require 'spreadsheet'
14
+
15
# Setup command-line options for this tool.
#
tool = Tool.new({:explicit_output=>true})
tool.option_parser.banner = "Convert an xls file to a tab delimited table.\n\nUsage: xls_to_table.rb [options] file1.xls"

tool.option_parser.parse!

input_file = ARGV[0]

# Fail early with the usage banner rather than creating a stray ".csv" file
# and then crashing while trying to open a workbook with an empty name.
abort(tool.option_parser.banner) if input_file.nil?

# When no explicit output path was given, write next to the input.
# The historical ".csv" suffix is kept even though the content is tab delimited.
output_file = tool.explicit_output || "#{input_file}.csv"

# Open the original excel workbook for reading. This happens before the output
# file is created so that an unreadable input does not leave an empty output behind.
Spreadsheet.client_encoding = 'UTF-8'
input_book = Spreadsheet.open "#{input_file}"
input_sheet = input_book.worksheet 0

# The block form guarantees the output handle is closed even if a row fails to convert.
File.open(output_file, 'w') do |output_fh|
  input_sheet.each do |row|
    # One line per row; cells are stringified (nil becomes "") and tab separated.
    line = row.map { |colv| colv.to_s }.join("\t")
    output_fh.write "#{line}\n"
  end
end
45
+
46
+
@@ -0,0 +1,3 @@
1
+ require 'mkmf'
2
+
3
# Generate the Makefile that builds the C extension as protk/protk
create_makefile 'protk/protk'
data/ext/protk/protk.c ADDED
@@ -0,0 +1,235 @@
1
+ #include <ruby.h>
2
+
3
+
4
+ /* */
5
+ /* make_random.c - make random protein sequence database using Markov chain with transitional */
6
+ /* probabilities from amino acid frequencies in a real database in FASTA format */
7
+ /* */
8
+ /* (c) Magnus Palmblad, Division of Ion Physics, Uppsala University, Sweden, 2001- */
9
+ /* */
10
+ /* Usage: make_random <sequence database> <number of sequences to generate> <output file> */
11
+ /* */
12
+ /* Example: mmpi 562.fasta 1000000 562_random_1000000.fasta */
13
+ /* */
14
+ /* Compile with gcc -o make_random make_random.c -lm */
15
+ /* */
16
+
17
#include <ctype.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
22
+
23
+ #define AMINO_ACIDS "ARNDCEQGHILKMFPSTWYV"
24
+ #define NOT_AMINO_ACIDS "BJOUXZ*"
25
+ #define MAX_SEQUENCE_LENGTH 20000
26
+ #define MAX_LINE_LENGTH 20000 /* large enough to read in long header lines */
27
+
28
+ static VALUE protk_make_decoys(VALUE self,VALUE input_file_in,VALUE db_length_in,VALUE output_file_in,char *prefix_string_in) {
29
+ char *input_file = RSTRING_PTR(input_file_in);
30
+ long sequences_to_generate = NUM2INT(db_length_in);
31
+ char * output_file = RSTRING_PTR(output_file_in);
32
+
33
+ char line[MAX_LINE_LENGTH];
34
+ char settings_line[60][70];
35
+ char infile[255], outfile[255]; /* for reading input and writing output */
36
+ char prefix_string[255];
37
+ char *p,**index;
38
+ char *sequence;
39
+ char one_sequence[MAX_SEQUENCE_LENGTH],random_sequence[(int)(MAX_SEQUENCE_LENGTH*1.5)],random_sequence_output[(int)(MAX_SEQUENCE_LENGTH*1.5)];
40
+ char *temp_sequence;
41
+ int a;
42
+ FILE *inp, *outp;
43
+
44
+ long i, j, k, l, n, n_sequences, protein;
45
+ long MP[21][MAX_SEQUENCE_LENGTH];
46
+ long measured_aa_freq[21], generated_aa_freq[21], measured_pl_sum=0, generated_pl_sum=0;
47
+ long row_sum[MAX_SEQUENCE_LENGTH],partial_sum;
48
+ long one_index,pl;
49
+ double x;
50
+
51
+ /* scanning sequence database */
52
+
53
+ strcpy(infile,input_file);
54
+ if ((inp = fopen(infile, "r"))==NULL) {printf("error opening sequence database %s\n",infile);return -1;}
55
+ printf("scanning sequence database %s",infile);fflush(stdout);
56
+ i=0;n=0;k=0;
57
+ while (fgets(line, MAX_LINE_LENGTH, inp) != NULL) {i++; if(line[0]=='>') {if (!(n%1000)) printf(".");fflush(stdout); n++;} }
58
+
59
+ n_sequences=n;
60
+
61
+
62
+ /* reading sequence database */
63
+
64
+ temp_sequence=(char*)calloc(sizeof(char),MAX_SEQUENCE_LENGTH);
65
+ sequence=(char*)malloc(sizeof(char)*(i*80)); /* allocate enough memory for 80 characters per line in FASTA database */
66
+ index=(char**)malloc(sizeof(char*)*n_sequences);
67
+ index[0]=sequence; /* set first index pointer to beginning of first database sequence */
68
+
69
+ if ((inp = fopen(infile, "r"))==NULL) {printf("error opening sequence database %s\n",infile);return -1;}
70
+
71
+ printf("done\nreading sequence database %s",infile);fflush(stdout);
72
+ n=-1;
73
+ strcpy(temp_sequence,"\0");
74
+ while (fgets(line, MAX_LINE_LENGTH, inp) != NULL)
75
+ {
76
+ if (strcmp(line,"\n")==0) {
77
+ continue;
78
+ }
79
+ if (line[0]=='>') {
80
+ if (n>=0) {
81
+ if (!(n%1000)&&n>0) {
82
+ printf(".");fflush(stdout);
83
+ }
84
+ strcpy(index[n],temp_sequence);
85
+ n++; index[n]=index[n-1]+sizeof(char)*strlen(temp_sequence);
86
+ strcpy(temp_sequence,"\0");
87
+ }
88
+ else
89
+ {
90
+ n++;
91
+ strcpy(temp_sequence,"\0");
92
+ }
93
+ }
94
+ else
95
+ {
96
+ if ( (strlen(temp_sequence)+strlen(line))>=MAX_SEQUENCE_LENGTH ) continue;
97
+ strncat(temp_sequence,line,strlen(line)-1);
98
+ }
99
+ }
100
+
101
+ strcpy(index[n],temp_sequence);
102
+ fclose(inp);
103
+
104
+ n_sequences=n+1;
105
+
106
+ printf("done [read %li sequences (%li amino acids)]\n",n_sequences,(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence));fflush(stdout);
107
+ measured_pl_sum=(int)(index[n_sequences-1]-index[0])/sizeof(char)+strlen(temp_sequence);
108
+
109
+
110
+
111
+ /* generating Markov probabilities */
112
+
113
+ printf("generating Markov probability matrix...");fflush(stdout);
114
+
115
+ srand(time(0)); /* replace with constant to re-generate identical random databases */
116
+
117
+ for(i=0;i<MAX_SEQUENCE_LENGTH;i++) {
118
+ for(j=0;j<=20;j++) {
119
+ MP[j][i]=0;
120
+ }
121
+ }
122
+ for(j=0;j<=20;j++) {
123
+ measured_aa_freq[j]=0;generated_aa_freq[j]=0;
124
+ }
125
+
126
+ for(protein=0;protein<n_sequences;protein++)
127
+ {
128
+ if (!(protein%1000)) {printf(".");fflush(stdout);}
129
+ if (protein<(n_sequences-1))
130
+ {
131
+ strncpy(one_sequence,index[protein],(index[protein+1]-index[protein])/sizeof(char));
132
+ one_sequence[(index[protein+1]-index[protein])/sizeof(char)]='\0';
133
+ }
134
+ else strcpy(one_sequence,index[protein]);
135
+ pl=strlen(one_sequence);
136
+ n=1;one_index=0;
137
+
138
+ for(i=0;i<pl;i++)
139
+ {
140
+ if(strpbrk(NOT_AMINO_ACIDS,(const char *)&one_sequence)==NULL)
141
+ {
142
+ if ( strchr(AMINO_ACIDS,one_sequence[i])==NULL)
143
+ {
144
+ printf("Unknown amino acid %c",one_sequence[i]);
145
+ } else {
146
+ a=20-strlen(strchr(AMINO_ACIDS,one_sequence[i])); // current amino acid
147
+ MP[a][i]++;
148
+ measured_aa_freq[a]++;
149
+ }
150
+ }
151
+ else {a=floor(20*(float)rand()/RAND_MAX);MP[a][i]++; measured_aa_freq[a]++;} // replace B, X, Z etc. with random amino acid to preserve size distribution
152
+ }
153
+ MP[20][pl]++;
154
+ measured_aa_freq[20]++; // MP[20][n] is the number of sequences of length n in the database
155
+ }
156
+ printf("done\n"); fflush(stdout);
157
+
158
+
159
+
160
+ for(i=0;i<MAX_SEQUENCE_LENGTH;i++) row_sum[i]=0;
161
+ for(i=0;i<MAX_SEQUENCE_LENGTH;i++) for(j=0;j<=20;j++) row_sum[i]+=MP[j][i];
162
+
163
+
164
+ /* generate random protein sequences through Markov chain */
165
+
166
+ strcpy(outfile,output_file);
167
+ if ((outp = fopen(outfile, "w"))==NULL) {printf("error opening output file %s\n",outfile); return -1;}
168
+ printf("generating %li random protein sequences",sequences_to_generate);fflush(stdout);
169
+
170
+ strcpy(prefix_string,RSTRING_PTR(prefix_string_in));
171
+
172
+ for(protein=0;protein<sequences_to_generate;protein++)
173
+ {
174
+ if (!(protein%1000)) {printf(".");fflush(stdout);}
175
+ i=0; j=0;
176
+ while (1)
177
+ {
178
+ x=(double)row_sum[j]*((double)rand()/RAND_MAX);
179
+ partial_sum=MP[0][j]; i=1;
180
+ while (partial_sum<x) {partial_sum+=MP[i][j]; i++;}
181
+ if (j>=MAX_SEQUENCE_LENGTH) i=21; /* terminate when sequence has reached MAX_SEQUENCE_LENGTH */
182
+ if (i<21)
183
+ {
184
+ random_sequence[j]=AMINO_ACIDS[i-1];j++;generated_aa_freq[i-1]++;
185
+ }
186
+ else /* i==21, i.e. protein sequence terminated */
187
+ {
188
+ k=0; generated_aa_freq[20]++; generated_pl_sum+=j;
189
+ for(l=0;l<j;l++)
190
+ {
191
+ random_sequence_output[k]=random_sequence[l]; k++;
192
+ if (!((k+1)%61))
193
+ {
194
+ random_sequence_output[k]='\n'; k++;
195
+ }
196
+ }
197
+
198
+ random_sequence_output[k]='\0';
199
+ if (!(k%61)) random_sequence_output[k-1]='\0'; /* remove extra newline for sequence length multiple of 60 */
200
+ fprintf(outp,">%srp%li\n%s\n",prefix_string,protein,random_sequence_output);
201
+ break;
202
+ }
203
+ }
204
+ }
205
+
206
+ fclose(outp);
207
+
208
+
209
+ /* freeing some memory... */
210
+
211
+ free(index);
212
+
213
+ printf("done (wrote %li random protein sequences to %s)\n",sequences_to_generate,outfile);
214
+
215
+ k=0;l=0;
216
+ for(i=0;i<=20;i++) {k+=measured_aa_freq[i];l+=generated_aa_freq[i];}
217
+ printf("<f(aa) in %s> <f(aa) in %s>\n",infile,outfile);
218
+ for(i=0;i<=20;i++) printf("%f %f\n",(float)measured_aa_freq[i]/k,(float)generated_aa_freq[i]/l);
219
+ printf("<average sequence length in %s> = %f\n<average sequence length in %s> = %f\n",infile,measured_pl_sum/(float)n_sequences,outfile,generated_pl_sum/(float)sequences_to_generate);
220
+
221
+ return 0;
222
+
223
+ }
224
+
225
+ /* ruby calls this to load the extension */
226
+ void Init_protk(void) {
227
+ /* assume we haven't yet defined Hola */
228
+ VALUE klass = rb_define_class("Protk",
229
+ rb_cObject);
230
+
231
+ /* the hola_bonjour function can be called
232
+ * from ruby as "Hola.bonjour" */
233
+ rb_define_singleton_method(klass,
234
+ "make_decoys", protk_make_decoys, 4);
235
+ }
@@ -0,0 +1,16 @@
1
+ require 'protk/constants'
2
+
3
# Global protk environment for this rakefile
$genv = Constants.new

# Every command-line argument becomes a file task that prints its own name,
# and is registered as a prerequisite of the :run multitask.
ARGV.each do |target|
  file(target) { puts target }

  multitask :run => target
end

# Running rake without naming a task executes :run
task :default => :run
@@ -0,0 +1,23 @@
1
+ # create rake dependencies
2
+ # run rakefile
3
+ #
4
+ require 'optparse'
5
+ require 'pathname'
6
+ require 'protk/tool'
7
+ require 'protk/command_runner'
8
+ require 'pp'
9
+ require 'rake'
10
+
11
require 'shellwords'

# Tool that hands a list of input files to the bundled big_search rakefile.
class BigSearchTool < Tool

  # Runs rake on the big_search rakefile, passing each input file as an argument.
  #
  # input_files:: list of file names; each one is shell-escaped so that names
  #               containing spaces or shell metacharacters reach rake as a
  #               single, literal argument.
  #
  # Returns whatever CommandRunner#run_local returns for the rake command.
  def run input_files
    escaped_files = input_files.map { |name| Shellwords.escape(name.to_s) }
    command = "rake -f #{Shellwords.escape(rakefile_path)} #{escaped_files.join(" ")}"
    runner = CommandRunner.new(Constants.new)
    runner.run_local(command)
  end

  # Path of the rakefile that sits next to this source file.
  def rakefile_path
    "#{File.dirname(__FILE__)}/big_search_rakefile.rake"
  end

end
@@ -0,0 +1,210 @@
1
+ # Add methods to the Bio::SPTR class to retrieve objects using the keys defined in protein_annotator.rb
2
+ #
3
+ # newColumnKeys=['recname','cd','altnames','accessions','location','function','ipi','intact','pride','ensembl','refsMASS SPEC','refsNUCLEOTIDE SEQUENCE','refsX-RAY CRYSTALLOGRAPHY','refs3D-STRUCTURE MODELLING','refsPROTEIN SEQUENCE','refsGLYCOSYLATION','glycosites']
4
+ #
5
+ #
6
+
7
+ ## We start the columns off with the header name
8
+ #newColumns={'recname'=>["Primary Name"],'cd'=>["CD Antigen Name"],'altnames'=>["Alternate Names"],
9
+ # 'accessions' =>["Swissprot Accessions"],
10
+ # 'location' => ["Subcellular Location"],
11
+ # 'function' => ["Known Function"],
12
+ # 'ipi' => ["IPI"],
13
+ # 'intact' => ["Interactions"],
14
+ # 'pride' => ['Pride'],
15
+ # 'ensembl'=> ['Ensembl'],
16
+ # 'refsMASS SPEC'=>["MS Refs"],
17
+ # 'refsGLYCOSYLATION'=>["Glyco Refs"],
18
+ # 'refsNUCLEOTIDE SEQUENCE'=>["Nucleotide Refs"],
19
+ # 'refsX-RAY CRYSTALLOGRAPHY'=>["Crystallography Refs"],
20
+ # 'refs3D-STRUCTURE MODELLING'=>["3D-Modelling Refs"],
21
+ # 'refsPROTEIN SEQUENCE'=>["Protein sequence Refs"],
22
+ # 'glycosites'=>["Glycosylation Sites"]
23
+ #}
24
+ require 'rubygems'
25
+ require 'bio'
26
+
27
class Bio::SPTR < Bio::EMBLDB

  #
  # Functions corresponding to retrieving data for specific keys
  #

  # The recommended name for the Protein: the value of the first "RecName"
  # entry in the DE field, or "" when there is none.
  #
  def recname
    entry = parsed_de_entries.find { |category, _type, _value| category == "RecName" }
    entry ? entry[2] : ""
  end

  # The CD Antigen name: the value of the first "AltName: CD_antigen=..."
  # entry in the DE field, or "" when there is none.
  #
  def cd
    entry = parsed_de_entries.find { |category, type, _value| category == "AltName" && type == "CD_antigen" }
    entry ? entry[2] : ""
  end

  # All alternate names (every "AltName" entry except CD antigens),
  # joined with "; ". Returns "" when there are none.
  #
  def altnames
    names = parsed_de_entries.select { |category, type, _value| category == "AltName" && type != "CD_antigen" }
    names.map { |_category, _type, value| value }.join("; ")
  end

  # SwissProt Accessions (not implemented; always returns "")
  #
  def accessions
    ""
  end

  # Subcellular Location
  #
  def location
    self.cc["SUBCELLULAR LOCATION"].to_s
  end

  # Function
  #
  def function
    self.cc["FUNCTION"].to_s
  end

  # Similarity
  #
  def similarity
    self.cc["SIMILARITY"].to_s
  end

  # Tissue Specificity
  #
  def tissues
    self.cc["TISSUE SPECIFICITY"].to_s
  end

  # Disease
  #
  def disease
    self.cc["DISEASE"].to_s
  end

  # Subunit
  def subunit
    self.cc["SUBUNIT"].to_s
  end

  # Domain
  def domain
    self.cc["DOMAIN"].to_s
  end

  #
  # Getting dr entry
  #

  # Helper Function to create links.
  # Returns the first field of the first DR (database cross-reference) entry
  # stored under +key+, or "" when the entry has no cross-reference for it.
  #
  def safely_get_drentry_for_key(key)
    return "" if self.dr[key].nil?

    dr[key][0][0]
  end

  # IPI Accession number
  #
  def ipi
    self.safely_get_drentry_for_key("IPI")
  end

  # Intact accession number.
  # Looks up the "IntAct" cross-reference; this previously queried "PRIDE"
  # and therefore returned the same value as #pride.
  #
  def intact
    self.safely_get_drentry_for_key("IntAct")
  end

  # Pride accession number
  #
  def pride
    self.safely_get_drentry_for_key("PRIDE")
  end

  # Ensembl accession number
  #
  def ensembl
    self.safely_get_drentry_for_key("Ensembl")
  end

  # NextBIO accession number
  #
  def nextbio
    self.safely_get_drentry_for_key("NextBio")
  end

  # Number of transmembrane regions, as a String
  #
  def num_transmem
    feature_count_string("TRANSMEM")
  end

  # Number of signal peptide features, as a String
  #
  def signalp
    feature_count_string("SIGNAL")
  end

  private

  # Splits the DE field on ";" and returns one [category, type, value] triple
  # for every piece shaped like "<category>: <type>=<value>"; pieces that do
  # not match are skipped.
  def parsed_de_entries
    matches = self.de.split(";").map { |entry| entry.match(/\s*(.*?):\s*(.*?)=(.*)/) }
    matches.compact.map { |m| m.captures }
  end

  # Counts the feature-table entries stored under +feature_key+ and returns the
  # count as a String ("0" when the key is absent).
  # NOTE(review): when the feature table cannot be parsed, the warning text is
  # printed and, as before, is also what gets returned - confirm callers expect that.
  def feature_count_string(feature_key)
    features = self.ft[feature_key]
    (features.nil? ? 0 : features.length).to_s
  rescue
    p "Warning: Unable to parse feature table for entry #{self.accession}"
  end

end
@@ -0,0 +1,60 @@
1
+ require 'rubygems'
2
+ require 'spreadsheet'
3
+
4
+
5
# Extracts protein accessions and ion scores from the first worksheet of an
# Excel file exported by BioTools.
class BioToolsExcelConverter

  # Marks a "Digest Matches ... Score: <score>)" cell; capture 1 is the score text.
  DIGEST_MATCH_PATTERN = /Digest Matches.*?Score:\s(.*)\)/

  # filename:: path of the Excel workbook to convert.
  #
  # The file handle is deliberately left open for the lifetime of the workbook,
  # presumably because the spreadsheet library reads worksheet data on demand.
  def initialize(filename)
    @inputBook = Spreadsheet.open File.new("#{filename}")
  end

  # Returns true when any row of the first worksheet of +filename+ starts with
  # a String cell matching DIGEST_MATCH_PATTERN, false otherwise.
  def self.isBiotools(filename)
    # Block form so the handle opened for this probe is always closed again.
    File.open("#{filename}") do |io|
      test_sheet = Spreadsheet.open(io).worksheet 0

      isbiotools = false
      test_sheet.each do |row|
        isbiotools = true if row[0].is_a?(String) && row[0].match(DIGEST_MATCH_PATTERN)
      end
      isbiotools
    end
  end

  # Returns an array of rows: a header row ["Accession","Ion Scores"] followed
  # by one [accession, score] pair per "Digest Matches" line. The accession is
  # the last whitespace-delimited token of the first cell in the row directly
  # above the "Digest Matches" line.
  #
  # Raises ArgumentError when that preceding line is missing or has no
  # parseable protein name.
  def get_rows
    sheet = @inputBook.worksheet 0
    n_rows = sheet.dimensions[1]

    protein_rows = [["Accession", "Ion Scores"]]

    (0...n_rows).each do |row_i|
      cell = sheet.row(row_i)[0]
      # Numeric or empty cells cannot be a "Digest Matches" line.
      next unless cell.is_a?(String)

      digmatch = cell.match(DIGEST_MATCH_PATTERN)
      next if digmatch.nil?

      # The protein description is expected on the line above; row 0 has none.
      text = row_i > 0 ? sheet.row(row_i - 1)[0] : nil
      m = text.to_s.match(/\s(\S*)\s*$/)
      if m.nil?
        raise ArgumentError, "Badly formed protein line in biotools file ... could not parse protein name from #{text}"
      end

      protein_rows << [m[1], digmatch[1]]
    end

    protein_rows
  end

end
@@ -0,0 +1,84 @@
1
+ #
2
+ # This file is part of protk
3
+ # Created by Ira Cooke 15/12/2010
4
+ #
5
+ # Runs system commands and provides methods for monitoring output
6
+ #
7
+
8
+
9
+ require 'open4'
10
+ require 'protk/constants'
11
+
12
# Runs system commands and forwards their output to the protk environment log.
class CommandRunner

  # The protk environment in which to run commands
  #
  attr :env

  # environment:: object used for logging; it must respond to log(message, level).
  def initialize(environment)
    @env=environment
  end

  # Runs the given command in a local shell
  #
  # Every stdout line is logged at :info, followed by every stderr line at :warn.
  # The exit is logged at :error when the status is non-zero, at :info otherwise.
  # Returns the status reported by Open4::popen4.
  #
  def run_local(command_string)
    @env.log("Command: #{command_string} started",:info)
    status = Open4::popen4("#{command_string} ") do |pid, stdin, stdout, stderr|
      puts "PID #{pid}"

      # Drain stderr on its own thread while stdout is being read. Reading the
      # two pipes one after the other can deadlock once the child fills the
      # stderr pipe buffer before it closes stdout.
      stderr_reader = Thread.new { stderr.readlines }

      stdout.each { |line| @env.log(line.chomp,:info) }

      # Logged after stdout so the log keeps its stdout-then-stderr order.
      stderr_reader.value.each { |line| @env.log(line.chomp,:warn) }
    end

    if ( status!=0 )
      # We terminated with some error code so log as an error
      @env.log( "Command: #{command_string} exited with status #{status.to_s}",:error)
    else
      @env.log( "Command: #{command_string} exited with status #{status.to_s}",:info)
    end
    status
  end

  # Runs the given command as a background job
  # At present this sends the job to a PBS system, but in future we might support other types of background jobs
  #
  # command_string:: command placed on the last line of the job script.
  # job_params::     hash read for :jobid, :vmem (default "900mb") and :queue
  #                  (default "lowmem"); the caller's hash is not modified.
  # jobscript_path:: where the generated job script is written.
  # autodelete::     accepted for compatibility; currently has no effect
  #                  (the self-deleting command is disabled).
  #
  # Returns the status of the qsub submission (see run_local).
  #
  def run_batch(command_string,job_params,jobscript_path,autodelete)
    @env.log("Creating batch file for command: #{command_string}",:info)

    jobid = job_params[:jobid]
    vmem = job_params[:vmem] || "900mb"
    queue = job_params[:queue] || "lowmem"

    # Script lines start in column 0 and the script has no trailing newline.
    job_script = [
      "#!/bin/bash",
      "#PBS -N #{jobid}",
      "#PBS -e pbs.#{jobid}.err",
      "#PBS -o pbs.#{jobid}.log",
      "#PBS -l nodes=1:ppn=1,vmem=#{vmem}",
      "#PBS -q #{queue}",
      "#{command_string}"
    ].join("\n")

    File.open(jobscript_path, 'w') { |f| f.write(job_script) }

    self.run_local("qsub #{jobscript_path}")
  end

end