protk 1.1.0.pre

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. data/README.md +85 -0
  2. data/bin/annotate_ids.rb +59 -0
  3. data/bin/big_search.rb +41 -0
  4. data/bin/correct_omssa_retention_times.rb +27 -0
  5. data/bin/feature_finder.rb +76 -0
  6. data/bin/file_convert.rb +157 -0
  7. data/bin/generate_omssa_loc.rb +42 -0
  8. data/bin/interprophet.rb +91 -0
  9. data/bin/make_decoy.rb +64 -0
  10. data/bin/manage_db.rb +123 -0
  11. data/bin/mascot_search.rb +187 -0
  12. data/bin/mascot_to_pepxml.rb +44 -0
  13. data/bin/msgfplus_search.rb +191 -0
  14. data/bin/omssa_search.rb +205 -0
  15. data/bin/peptide_prophet.rb +245 -0
  16. data/bin/pepxml_to_table.rb +78 -0
  17. data/bin/protein_prophet.rb +140 -0
  18. data/bin/protk_setup.rb +31 -0
  19. data/bin/repair_run_summary.rb +113 -0
  20. data/bin/tandem_search.rb +292 -0
  21. data/bin/template_search.rb +144 -0
  22. data/bin/unimod_to_loc.rb +118 -0
  23. data/bin/xls_to_table.rb +46 -0
  24. data/ext/protk/extconf.rb +3 -0
  25. data/ext/protk/protk.c +235 -0
  26. data/lib/protk/big_search_rakefile.rake +16 -0
  27. data/lib/protk/big_search_tool.rb +23 -0
  28. data/lib/protk/bio_sptr_extensions.rb +210 -0
  29. data/lib/protk/biotools_excel_converter.rb +60 -0
  30. data/lib/protk/command_runner.rb +84 -0
  31. data/lib/protk/constants.rb +296 -0
  32. data/lib/protk/data/FeatureFinderCentroided.ini +63 -0
  33. data/lib/protk/data/apt-get_packages.yaml +47 -0
  34. data/lib/protk/data/brew_packages.yaml +10 -0
  35. data/lib/protk/data/default_config.yml +20 -0
  36. data/lib/protk/data/predefined_db.crap.yaml +19 -0
  37. data/lib/protk/data/predefined_db.sphuman.yaml +25 -0
  38. data/lib/protk/data/predefined_db.swissprot_annotation.yaml +20 -0
  39. data/lib/protk/data/predefined_db.swissprot_fasta_annotation.yaml +20 -0
  40. data/lib/protk/data/tandem_params.xml +56 -0
  41. data/lib/protk/data/taxonomy_template.xml +9 -0
  42. data/lib/protk/data/unimod.xml +16780 -0
  43. data/lib/protk/eupathdb_gene_information_table.rb +158 -0
  44. data/lib/protk/galaxy_stager.rb +24 -0
  45. data/lib/protk/galaxy_util.rb +9 -0
  46. data/lib/protk/manage_db_rakefile.rake +484 -0
  47. data/lib/protk/manage_db_tool.rb +181 -0
  48. data/lib/protk/mascot_util.rb +63 -0
  49. data/lib/protk/omssa_util.rb +57 -0
  50. data/lib/protk/plasmodb.rb +50 -0
  51. data/lib/protk/prophet_tool.rb +85 -0
  52. data/lib/protk/protein_annotator.rb +646 -0
  53. data/lib/protk/protxml.rb +137 -0
  54. data/lib/protk/randomize.rb +7 -0
  55. data/lib/protk/search_tool.rb +182 -0
  56. data/lib/protk/setup_rakefile.rake +245 -0
  57. data/lib/protk/setup_tool.rb +19 -0
  58. data/lib/protk/spreadsheet_extensions.rb +78 -0
  59. data/lib/protk/swissprot_database.rb +38 -0
  60. data/lib/protk/tool.rb +182 -0
  61. data/lib/protk/xtandem_defaults.rb +11 -0
  62. data/lib/protk.rb +18 -0
  63. metadata +256 -0
@@ -0,0 +1,646 @@
1
+ require 'rubygems'
2
+ require 'spreadsheet'
3
+ require 'protk/swissprot_database'
4
+ require 'protk/bio_sptr_extensions'
5
+ require 'protk/protxml'
6
+ require 'protk/spreadsheet_extensions'
7
+ require 'protk/biotools_excel_converter'
8
+ require 'protk/plasmodb'
9
+ require 'protk/constants'
10
+
11
+
12
+ class ProteinAnnotator < Object
13
+
14
+
15
# Creates the annotator together with the Constants instance that is later
# exposed through #env and used for logging in #convert.
def initialize
  @genv = Constants.new
end
18
+
19
# Returns the Constants instance stored in @genv by #initialize.
def env
  @genv
end
22
+
23
# Opens the Excel workbook at +inputFile+ for reading and returns whatever
# copyBook(numrows) yields for it.
#
# NOTE(review): copyBook is not defined in this file; it is presumably added
# to the workbook class by protk/spreadsheet_extensions - confirm there how
# +numrows+ (default 0) is interpreted.
def outputBookFromExcelInput(inputFile, numrows = 0)
  source_book = Spreadsheet.open(inputFile.to_s)
  source_book.copyBook(numrows)
end
31
+
32
+
33
+
34
# Builds a workbook from a prot.xml file.
#
# Despite the method name, the pep.xml file is NOT read: +inputFilePep+ is
# accepted only so existing callers keep working, and the result is exactly
# what outputBookFromProtXML produces for +inputFileProt+. The previous body
# was a line-for-line copy of that method, so this now delegates to it.
#
# inputFileProt - path of the prot.xml file (passed on to outputBookFromProtXML)
# inputFilePep  - ignored
# numrows       - maximum number of rows to copy; 0 copies every row
#
# Returns the workbook created by outputBookFromProtXML.
def outputBookFromProtXMLAndPepXML(inputFileProt, inputFilePep, numrows = 0)
  outputBookFromProtXML(inputFileProt, numrows)
end
66
+
67
+
68
+
69
+
70
# Takes a prot.xml file as input and returns a new Excel workbook whose first
# worksheet contains the rows reported by ProtXML#as_rows, in their original
# order.
#
# NOTE(review): the column layout (an 'Accession' column plus indistinguishable
# proteins, peptide count, protein probability, peptide list and percent
# coverage, according to the notes that used to sit here) is decided inside
# ProtXML#as_rows, which is not visible in this file - confirm against
# protk/protxml. Any error for a file without proteins would also have to come
# from ProtXML; nothing in this method raises one.
#
# inputFile       - path handed to ProtXML.new
# numrows         - maximum number of rows (as returned by as_rows) to copy;
#                   0, or a value >= the number of available rows, copies all
# min_probability - cutoff handed to ProtXML#as_rows; defaults to 0.6, the
#                   value that used to be hard-coded here
#
# Returns a Spreadsheet::Workbook.
def outputBookFromProtXML(inputFile, numrows = 0, min_probability = 0.6)
  rows = ProtXML.new(inputFile).as_rows(min_probability)

  # Truncate only when a smaller, non-zero row count was requested
  rows = rows[0...numrows] unless numrows == 0 || numrows >= rows.length

  # Create a new workbook from scratch for writing
  outputBook = Spreadsheet::Workbook.new
  outputSheet = outputBook.create_worksheet

  # Inserting each row at index 0 while walking the list backwards leaves the
  # sheet in the original order, without mutating the array from as_rows.
  rows.reverse_each { |row| outputSheet.insert_row(0, row) }

  outputBook
end
112
+
113
+
114
+
115
+
116
+
117
+
118
+
119
# Converts a BioTools-exported Excel file into a freshly created workbook.
#
# The rows come from BioToolsExcelConverter#get_rows and are written to the
# first worksheet in the order that method returns them.
# +numrows+ is accepted for signature parity with the other outputBookFrom*
# methods but is not used here: every row is always copied.
#
# Returns the new Spreadsheet::Workbook.
def outputBookFromBiotoolsExcel(inputFile, numrows = 0)
  biotools_rows = BioToolsExcelConverter.new(inputFile).get_rows

  # Start from an empty workbook with a single worksheet
  workbook = Spreadsheet::Workbook.new
  sheet = workbook.create_worksheet

  # Reversed list + insert-at-top == original order in the sheet
  biotools_rows.reverse!
  biotools_rows.each do |cells|
    sheet.insert_row(0, cells)
  end

  workbook
end
139
+
140
# Takes a WarpLC Protein List XML file and returns an Excel workbook with a
# single column holding the 'Accession' attribute of every
# //ProteinReport/Protein element. The first cell of the column is the header
# 'Accession'.
#
# inputFile - path of the WarpLC protein list (XML)
# numrows   - maximum number of cells to keep, counting the header cell;
#             0, or a value larger than the column, keeps everything
#
# Returns a Spreadsheet::Workbook.
# Raises ArgumentError if no Protein elements are found, which also happens
# when the file is not a WarpLC protein list. (The old code used +throw+, which
# surfaced as an "uncaught throw" ArgumentError-family error; ArgumentError
# keeps rescuers of that type working.)
def outputBookFromWarpLCInput(inputFile, numrows = 0)
  # The block form closes the file handle even if parsing raises.
  xmldoc = File.open(inputFile) { |file| REXML::Document.new(file) }

  proteins = REXML::XPath.match(xmldoc, "//ProteinReport/Protein")

  # XPath.match returns an empty Array rather than nil when nothing matches,
  # so emptiness is the condition that actually signals "no proteins".
  if proteins.nil? || proteins.empty?
    raise ArgumentError, "No proteins found in the WarpLC Proteinlist file #{inputFile}"
  end

  accessions = proteins.map { |el| el.attributes['Accession'] }
  accessions.unshift("Accession")

  # Figure out how many rows to convert if not specified
  accessions = accessions[0...numrows] unless numrows == 0 || numrows > accessions.length

  # Create a new workbook from scratch for writing
  outputBook = Spreadsheet::Workbook.new
  outputSheet = outputBook.create_worksheet

  # insert_column is not a method defined in this file; presumably it comes
  # from protk/spreadsheet_extensions.
  outputSheet.insert_column(accessions, 0)

  outputBook
end
171
+
172
# Decides whether the input should be treated as an Excel workbook.
#
# Returns true when +input_type+ is the string "excel", or when +fileName+
# has the extension ".xls" (compared case-insensitively). Returns false
# otherwise, in which case callers assume the file is XML.
#
# File.extname is used instead of splitting on "." so that an extension-less
# file that happens to be called "xls" is not mistaken for a workbook.
def isExcelFile(fileName, input_type)
  return true if input_type == "excel"

  File.extname(fileName.to_s).casecmp(".xls") == 0
end
186
+
187
# Decides whether the input should be treated as a prot.xml file.
#
# Returns true when +input_type+ is the string "protXML", or when +fileName+
# ends with ".prot.xml"; false otherwise.
#
# end_with? anchors at the very end of the string. The former regex used "$",
# which also matches before any newline, so a name such as
# "a.prot.xml\nsomething" was accepted.
def isProtXMLFile(fileName, input_type)
  return true if input_type == "protXML"

  fileName.to_s.end_with?(".prot.xml")
end
199
+
200
# Reports whether +fileName+ is a BioTools export by handing the question to
# BioToolsExcelConverter.isBiotools. +input_type+ is not consulted.
def isBioToolsFile(fileName, input_type)
  return BioToolsExcelConverter.isBiotools(fileName)
end
203
+
204
# Replaces, in column +colIndex+ of every row of +workSheet+, each cell equal
# to +from+ with +to+. (#convert uses this to turn the "true"/"false" values
# of an 'OK' column into status labels.)
#
# workSheet - any object whose #rows returns a list of indexable rows
# colIndex  - index of the cell to inspect in each row
# from      - value to look for (compared with ==)
# to        - replacement value
#
# Entries of +rows+ that are nil are skipped instead of raising
# NoMethodError, matching how row_is_empty tolerates nil rows.
#
# Returns workSheet.rows.
def renameValuesInColumn(workSheet, colIndex, from, to)
  workSheet.rows.each do |row|
    next if row.nil?

    row[colIndex] = to if row[colIndex] == from
  end
end
212
+
213
# Returns true when any cell of +row+ has the string form "Accession"
# (compared via to_s), false otherwise. A nil row has no cells and therefore
# yields false, the same tolerance row_is_empty gives nil rows.
def hasAccession(row)
  return false if row.nil?

  row.any? { |cell| cell.to_s == "Accession" }
end
222
+
223
# Returns true when +row+ is nil, has no cells, or every cell is nil or has an
# empty string form; returns false as soon as one cell with content is seen.
def row_is_empty(row)
  return true if row.nil?

  # nil.to_s is "", so a single to_s check covers both nil and blank cells.
  row.all? { |cell| cell.to_s.empty? }
end
236
+
237
# Reads a protein list, annotates every accession with data from the Swissprot
# database (falling back to PlasmoDB for ids Swissprot does not know), turns
# several columns into hyperlinks and writes the result to +outputFile+.
#
# inputFile           - input path; loaded as Excel (BioTools or plain), as
#                       prot.xml, or otherwise as a WarpLC protein list,
#                       depending on isExcelFile / isProtXMLFile
# outputFile          - path the finished workbook is written to
# input_type          - optional hint ("excel" or "protXML") forwarded to the
#                       file-type predicates
# output_type         - accepted but not used by this method
# numrows             - row limit forwarded to the outputBookFrom* loaders
# accessionColumnName - header of the column holding the protein ids
# entrezIDColumnName  - header of an optional Entrez id column; when present
#                       it is hyperlinked and an iHOP link column is added
# hiddenColumns       - list of strings; every column whose header contains
#                       one of them is hidden
#
# Returns the result of writing the workbook.
# Raises ArgumentError when no column is headed +accessionColumnName+. (The
# old code used +throw+, which surfaced as an "uncaught throw"
# ArgumentError-family error.)
#
# NOTE(review): insert_column, delete_row and insert_row on the worksheet are
# not defined in this file; presumably they come from
# protk/spreadsheet_extensions - confirm their index semantics there.
def convert(inputFile, outputFile, input_type = nil, output_type = "xls", numrows = 0, accessionColumnName = "Accession", entrezIDColumnName = "Entrez.ID", hiddenColumns = [])

  @genv.log("Converting #{inputFile} to #{outputFile}", :info)

  Spreadsheet.client_encoding = 'UTF-8'

  # ---- Load the input into a workbook ----
  if isExcelFile(inputFile, input_type)
    if isBioToolsFile(inputFile, input_type)
      # Logged only once the file is known to be BioTools (this message used
      # to be emitted for every Excel input).
      @genv.log("Excel file was biotools", :info)
      outputBook = outputBookFromBiotoolsExcel(inputFile, numrows)
    else
      @genv.log("Excel file was non biotools", :info)
      outputBook = outputBookFromExcelInput(inputFile, numrows)
    end
  elsif isProtXMLFile(inputFile, input_type)
    @genv.log("Got a Prot XML File as Input", :info)
    outputBook = outputBookFromProtXML(inputFile, numrows)
  else
    @genv.log("File is not prot.xml or excel .. trying WarpLCResult", :info)
    outputBook = outputBookFromWarpLCInput(inputFile, numrows)
  end
  outputSheet = outputBook.worksheet 0

  # ---- Chop off and save any rows prior to the header and remove any empty rows ----
  rows_for_deletion = []
  keep_rows = []
  header_row = nil
  rowi = 0
  outputSheet.each do |row|
    empty = row_is_empty(row)

    header_row = rowi if !empty && header_row.nil? && hasAccession(row)
    rows_for_deletion.push(rowi) if empty || header_row.nil?
    # Rows seen before the header are put back on top at the very end
    keep_rows.push(row) if header_row.nil?

    rowi += 1
  end

  # Every deletion shifts the remaining rows up by one, hence the offset
  rows_for_deletion.each_with_index do |row_index, already_deleted|
    outputSheet.delete_row(row_index - already_deleted)
  end

  # ---- Locate the accession column and the insertion point for new columns ----
  header = outputSheet.row 0
  lastcolIndex = 0
  accessionColumn = nil
  accessionColumnIndex = nil

  (0...header.length).each do |i|
    if header[i] == accessionColumnName
      accessionColumn = outputSheet.column i
      accessionColumnIndex = i
    end

    # First blank header cell marks where the annotation columns go
    lastcolIndex = i if header[i] == "" && lastcolIndex == 0

    if header[i] == "OK"
      header[i] = "Status"
      renameValuesInColumn(outputSheet, i, "true", "Validated")
      renameValuesInColumn(outputSheet, i, "false", "Contaminant")
    end
  end

  # If we didn't find an empty column then use the index of the last header cell
  lastcolIndex = header.length - 1 if lastcolIndex == 0

  if accessionColumn.nil?
    raise ArgumentError, "No Accession column in input excel file. One column must have the header 'Accession'"
  end

  ids = accessionColumn.map { |id| id.nil? ? "" : id }
  # Remove the 0th value because it is the header
  ids.shift

  #### Now grab some additional column information from uniprot ####

  # Each pair is [method called on the Swissprot entry, header of the new column].
  annotation_columns = [
    ['recname', "Primary Name"],
    ['cd', "CD Antigen Name"],
    ['altnames', "Alternate Names"],
    ['location', "Subcellular Location"],
    ['function', "Known Function"],
    ['similarity', "Similarity"],
    ['tissues', "Tissue Specificity"],
    ['disease', "Disease Association"],
    ['domain', "Domain"],
    ['subunit', "Sub Unit"],
    ['nextbio', "NextBio"],
    ['ipi', "IPI"],
    ['intact', "Interactions"],
    ['pride', 'Pride'],
    ['ensembl', 'Ensembl'],
    ['num_transmem', "Transmembrane Regions"],
    ['signalp', 'Signal Peptide']
  ]
  newColumnKeys = annotation_columns.map { |key, _title| key }
  # Column values keyed by lookup name; every column starts with its header
  newColumns = {}
  annotation_columns.each { |key, title| newColumns[key] = [title] }

  @genv.log("Initializing database", :info)

  swissprotdb = SwissprotDatabase.new(@genv)
  @genv.log("Retrieving data for #{ids.length} entries from Swissprot database ", :info)

  # accs and plasmodbids are parallel to ids: exactly one entry per id
  accs = []
  plasmodbids = []
  found_plasmodb_ids = false

  $stdout.putc "\n"
  ids.each do |uniprot_id|
    # Progress indicator: one dot per id
    $stdout.putc "."
    $stdout.flush

    sptr_entry = swissprotdb.get_entry_for_name(uniprot_id)

    if sptr_entry.nil?
      @genv.log("No entry for #{uniprot_id} in uniprot database", :warn)
      newColumnKeys.each { |key| newColumns[key].push("") }
      accs.push("")

      # Bit of a hack. If the id is not sp and not decoy we assume it is plasmodb
      if uniprot_id.to_s.start_with?("decoy_")
        # Keep the arrays aligned: decoys previously pushed nothing here,
        # which shifted the PlasmoDB data and links of every later row.
        plasmodbids.push("")
      else
        plasmodbids.push(uniprot_id)
        found_plasmodb_ids = true
      end
    else
      accs.push(sptr_entry.accession)
      plasmodbids.push("")

      newColumnKeys.each do |key|
        val = sptr_entry.send(key)
        str = if val.nil?
                ""
              elsif val.is_a?(Array)
                val.join(";")
              else
                val.to_s
              end
        newColumns[key].push(str)
      end
    end
  end
  $stdout.putc "\n"

  # ---- Trying PlasmoDB for unknown IDs ----
  if found_plasmodb_ids
    # puts, not putc: putc with a String writes only its first character
    $stdout.puts "Searching PlasmoDB for unknown Id's"
    @genv.log "Searching PlasmoDB for unknown Id's", :info

    plasmodb = PlasmoDB.new(@genv)

    plasmodbids.each_with_index do |plasmodb_id, id_index|
      next if plasmodb_id == ""

      row_index = id_index + 1 # Starts from 1 because of the header
      p plasmodb_id

      plasmodb_entry = plasmodb.get_entry_for_name(plasmodb_id)
      next if plasmodb_entry.nil?

      newColumns['recname'][row_index] = plasmodb_entry['Product Description']

      # Prefer annotated GO terms; the string "null" marks a missing value
      if plasmodb_entry['Annotated GO Component'] != "null"
        newColumns['location'][row_index] = plasmodb_entry['Annotated GO Component']
      else
        newColumns['location'][row_index] = plasmodb_entry['Predicted GO Component']
      end

      if plasmodb_entry['Annotated GO Function'] != "null"
        newColumns['function'][row_index] = plasmodb_entry['Annotated GO Function']
      else
        newColumns['function'][row_index] = plasmodb_entry['Predicted GO Function']
      end

      newColumns['signalp'][row_index] = plasmodb_entry['SignalP Peptide']
      newColumns['num_transmem'][row_index] = plasmodb_entry['# TM Domains']
    end
  end

  @genv.log("Done", :info)

  # Inserting in reverse at a fixed index yields the columns in list order
  newColumnKeys.reverse_each do |key|
    outputSheet.insert_column(newColumns[key], lastcolIndex)
  end

  # Re-read the header so it reflects the columns that were just inserted
  header = outputSheet.row 0

  # ---- Now hide some columns ----
  (0...header.length).each do |i|
    title = header[i].to_s
    # Literal true: the TRUE constant no longer exists on current Rubies
    outputSheet.column(i).hidden = true if hiddenColumns.any? { |h| title.include?(h) }
  end

  # ---- Now add hyperlinks to various columns ----
  @genv.log("Creating Hyperlinks", :info)

  # Figure out column indexes for all the hyperlinked columns
  ipiColumnIndex = nil
  intactColumnIndex = nil
  prideColumnIndex = nil
  ensemblColumnIndex = nil
  nextbioColumnIndex = nil
  entrezIDColumn = nil
  entrezIDColumnIndex = nil
  entrezIDs = nil

  (0...header.length).each do |i|
    accessionColumnIndex = i if header[i] == accessionColumnName
    ipiColumnIndex = i if header[i] == "IPI"
    intactColumnIndex = i if header[i] == "Interactions"
    prideColumnIndex = i if header[i] == "Pride"
    ensemblColumnIndex = i if header[i] == "Ensembl"
    nextbioColumnIndex = i if header[i] == "NextBio"

    if header[i] == entrezIDColumnName
      entrezIDColumnIndex = i
      entrezIDColumn = outputSheet.column i
      # Snapshot of the raw values (header included) before cells become links
      entrezIDs = entrezIDColumn.map { |id| id }
    end
  end

  # Create a format for the hyperlinks
  hyperlink_format = Spreadsheet::Format.new({ :color => :blue, :weight => :bold, :size => 10 })

  # Add hyperlink format to the appropriate columns
  outputSheet.column(accessionColumnIndex).default_format = hyperlink_format
  outputSheet.column(nextbioColumnIndex).default_format = hyperlink_format
  outputSheet.column(ipiColumnIndex).default_format = hyperlink_format
  outputSheet.column(intactColumnIndex).default_format = hyperlink_format
  outputSheet.column(prideColumnIndex).default_format = hyperlink_format
  outputSheet.column(ensemblColumnIndex).default_format = hyperlink_format

  unless entrezIDColumn.nil?
    outputSheet.column(entrezIDColumnIndex).default_format = hyperlink_format
  end

  # Create all the hyperlinks (row 0 is the header, so data starts at row 1
  # and lines up with index rowi-1 of ids / accs / plasmodbids)
  (1...outputSheet.rows.length).each do |rowi|

    if plasmodbids[rowi - 1] != ""
      # Assume plasmodb .. and use plasmodb url
      outputSheet.row(rowi)[accessionColumnIndex] = Spreadsheet::Link.new("http://www.plasmodb.org/plasmo/showRecord.do?name=GeneRecordClasses.GeneRecordClass&project_id=&primary_key=#{ids[rowi-1]}", plasmodbids[rowi - 1])
    else
      # Otherwise assume sp
      outputSheet.row(rowi)[accessionColumnIndex] = Spreadsheet::Link.new("http://www.uniprot.org/uniprot/#{accs[rowi-1]}.html", ids[rowi - 1])
    end

    outputSheet.row(rowi)[nextbioColumnIndex] = Spreadsheet::Link.new("http://www.nextbio.com/b/home/home.nb?id=#{newColumns['nextbio'][rowi]}&type=feature", newColumns['nextbio'][rowi])
    outputSheet.row(rowi)[ipiColumnIndex] = Spreadsheet::Link.new("http://www.ebi.ac.uk/cgi-bin/dbfetch?db=IPI&id=#{newColumns['ipi'][rowi]}", newColumns['ipi'][rowi])
    outputSheet.row(rowi)[intactColumnIndex] = Spreadsheet::Link.new("http://www.ebi.ac.uk/intact/pages/interactions/interactions.xhtml?query=#{newColumns['intact'][rowi]}*", newColumns['intact'][rowi])
    outputSheet.row(rowi)[prideColumnIndex] = Spreadsheet::Link.new("http://www.ebi.ac.uk/pride/searchSummary.do?queryTypeSelected=identification%20accession%20number&identificationAccessionNumber=#{newColumns['pride'][rowi]}", newColumns['pride'][rowi])
    outputSheet.row(rowi)[ensemblColumnIndex] = Spreadsheet::Link.new("http://www.ensembl.org/Homo_sapiens/Transcript/Summary?db=core;t=#{newColumns['ensembl'][rowi]}", newColumns['ensembl'][rowi])
    outputSheet.row(rowi).height = 24

    if !entrezIDColumn.nil? && !entrezIDs[rowi].nil?
      outputSheet.row(rowi)[entrezIDColumnIndex] = Spreadsheet::Link.new("http://www.ncbi.nlm.nih.gov/gene/#{entrezIDs[rowi].to_i.to_s}", entrezIDs[rowi].to_i.to_s)
    end
  end

  # Change the names of any columns to nicer values if you need to
  outputSheet.row(0)[accessionColumnIndex] = "Uniprot Link"
  outputSheet.row(0)[entrezIDColumnIndex] = "Entrez.ID" unless entrezIDColumn.nil?

  # Having hyperlinked existing columns we now add any additional columns (hyperlinks based on existing data)
  # Note that all the column indexes will now be invalid which is why this is done near the end

  # Insert an entrez ID based iHOP literature search link if possible
  unless entrezIDColumn.nil?

    @genv.log("Creating iHOP literature search link", :info)

    ihopURLs = entrezIDs.map do |entrezid|
      "http://www.ihop-net.org/UniPub/iHOP/in?dbrefs_1=NCBI_GENE__ID|#{entrezid.to_i.to_s}"
    end

    # Insert this column after the ensembl Link (which is before other literature based stuff)
    columnIndex = ensemblColumnIndex + 1
    outputSheet.insert_column(ihopURLs, columnIndex)

    # Create the links (row 0 is overwritten with the header text below)
    (0...outputSheet.rows.length).each do |rowi|
      outputSheet.row(rowi)[columnIndex] = Spreadsheet::Link.new(ihopURLs[rowi], entrezIDs[rowi].to_i.to_s)
    end

    # Format the links
    outputSheet.column(columnIndex).default_format = hyperlink_format

    # And give the header a proper name
    outputSheet.row(0)[columnIndex] = "iHOP literature search"
  end

  @genv.log("Formatting header", :info)

  # Format the Header row
  headerFormat = Spreadsheet::Format.new({ :weight => :bold, :size => 11 })
  outputSheet.row(0).default_format = headerFormat

  # Here we put in a little workaround for a problem with the Spreadsheet gem.
  # If the text "false" is in a column it will substitute nil for the false value and then fail when attempting to convert nil to an integer.
  # We workaround by changing the word "true" to "positive" and false to "negative"
  outputSheet.rows.each do |row|
    # The rows list can contain nil for rows that were never written
    next if row.nil?

    row.each_index do |ri|
      if row[ri].nil?
        p "Encountered a nil value in the sheet converting to empty string"
        row[ri] = ""
      end

      if row[ri] == true
        row[ri] = "positive"
      elsif row[ri] == false
        row[ri] = "negative"
      end
    end
  end

  # Put the pre-header rows back on top, in their original order
  keep_rows.reverse_each do |row|
    outputSheet.insert_row(0, row)
  end

  # Finally write the results
  @genv.log("Writing New Workbook #{outputFile}", :info)
  outputBook.write outputFile
end
645
+
646
+ end