rbbt-marq 1.0.9 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/MARQ/GEO.rb CHANGED
@@ -1,594 +1,602 @@
1
1
  require 'MARQ'
2
- require 'rbbt/util/open'
3
2
  require 'rbbt/sources/organism'
4
3
 
4
+ # Work with GEO datasets
5
5
  module GEO
6
6
 
7
- CACHE_DIR = File.join(MARQ.cachedir,'GEO')
8
- FileUtils.mkdir_p CACHE_DIR unless File.exists? CACHE_DIR
9
-
10
- GEO_SOFT="http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&view=full&form=text&acc="
11
- def self.get_soft(item)
12
- item = item.strip
13
- cache_file = File.join(CACHE_DIR, item + '.soft')
14
- if File.exist?( cache_file )
15
- File.open(cache_file).read
16
- else
17
- content = Open.read(GEO_SOFT + item, :nocache => true)
18
- fout = File.open(cache_file,'w')
19
- fout.write content
20
- fout.close
21
- content
22
- end
23
- end
7
+ # Get information from Entrez
8
+ module Remote
24
9
 
25
- #{{{ Eutils
26
- module Eutils
27
10
  def self.organism_platforms(org)
28
11
  name = Organism.name(org)
29
12
  Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=\"#{name}\"[Organism:exp]+AND+%22gpl%22[Filter]&retmax=10000").
30
13
  scan(/<Id>(\d+?)<\/Id>/).collect{|id| id.first}.collect{|id| "GPL#{id.sub(/^100*/,'')}"}
31
14
  end
32
15
 
33
- def self.GPL_datasets(platform)
16
+ def self.platform_datasets(platform)
34
17
  Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=#{platform}[Accession]&retmax=2000").
35
18
  scan(/<Id>(\d+?)<\/Id>/).collect{|id| id.first}.select{|id| !id.match(/^(1|2)000/) }.collect{|id| "GDS#{id}"}
36
19
  end
37
20
 
38
- def self.GSE_dataset?(gse)
21
+ def self.dataset_platform(dataset)
22
+ if dataset =~ /GSE/
23
+ Open.read("http://www.ncbi.nlm.nih.gov/projects/geo/query/acc.cgi?acc=#{dataset}").scan(/GPL\d+/).uniq.sort.join("_")
24
+ else
25
+ Open.read("http://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=#{dataset}").scan(/GPL\d+/).uniq.sort.join("_")
26
+ end
27
+ end
28
+
29
+ def self.series_dataset?(gse)
39
30
  Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=geo&term=#{gse}[Accession]&retmax=2000").
40
31
  match(/<Id>(\d+?)<\/Id>/) != nil
41
32
  end
42
33
 
43
34
  end
44
35
 
36
+ CACHE_DIR = File.join(MARQ.cachedir,'GEO')
37
+ FileUtils.mkdir_p CACHE_DIR unless File.exists? CACHE_DIR
45
38
 
46
39
 
47
- #{{{ Helper functions
48
-
40
+ # Parse information in .soft files
41
+ module SOFT
49
42
 
50
- def self.consecutive?(ids)
51
- ids.collect{|id| id.to_i}.sort[0..19] == (1..20).to_a
52
- end
43
+ GEO_SOFT="http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&view=full&form=text&acc="
53
44
 
54
- def self.numerical?(ids)
55
- ids.compact.select{|id| ! id.match(/^\d+$/)}.uniq.length < ids.length.to_f / 10
56
- end
45
+ # Download a soft file. Uses cache
46
+ def self.get_soft(item)
47
+ item = item.strip
48
+ cache_file = File.join(CACHE_DIR, item + '.soft')
49
+ if File.exist?( cache_file )
50
+ File.open(cache_file).read
51
+ else
52
+ content = Open.read(GEO_SOFT + item, :nocache => true)
53
+ raise "SOFT file error" if content !~ /!/
54
+ fout = File.open(cache_file,'w')
55
+ fout.write content
56
+ fout.close
57
+ content
58
+ end
59
+ end
57
60
 
58
- def self.dna_sequence?(ids)
59
- ids.compact.select{|id| ! id.strip.match(/^[ATCG]+$/i)}.empty?
60
- end
61
+ #{{{ Guess the format of the IDS
61
62
 
63
+ @@formats = {}
62
64
 
63
- ID_FIX = {
64
- :mgi_unigene => proc{|gene| if gene then gene.match(/^Mm./) ? gene : "Mm." + gene end},
65
- :human_unigene => proc{|gene| if gene then gene.match(/^Hs./) ? gene : "Hs." + gene end},
66
- }
65
+ ID_FIX = {
66
+ :mgi_unigene => proc{|gene| if gene then gene.match(/^Mm./) ? gene : "Mm." + gene end},
67
+ :human_unigene => proc{|gene| if gene then gene.match(/^Hs./) ? gene : "Hs." + gene end},
68
+ }
67
69
 
68
- @@formats = {}
69
- def self.guessIds(genes,org, name = nil)
70
- @@formats[org] ||= Organism.id_formats(org)
71
- if consecutive?(genes) || dna_sequence?(genes) || (numerical?(genes) && (name.nil? || !name.match(/entrez/i)))
72
- id = nil
73
- else
74
- fix = ID_FIX[(org + "_" + name.downcase).to_sym] if name
75
- if fix
76
- genes = genes.collect{|gene| fix.call(gene)}
77
- end
78
- id = Organism.guessIdFormat(@@formats[org], genes)
70
+ # Id list is in sequence
71
+ def self.consecutive?(ids)
72
+ ids.collect{|id| id.to_i}.sort[0..19] == (1..20).to_a
79
73
  end
80
-
81
- id
82
- end
83
-
84
- @@r = nil
85
- def self.r
86
- if @@r.nil?
87
74
 
88
- # FIXME: RSruby does not install very well, this require id hidden here.
89
- require 'rsruby'
75
+ # Id list is numerical
76
+ def self.numerical?(ids)
77
+ ids.compact.select{|id| ! id.match(/^\d+$/)}.uniq.length < ids.length.to_f / 10
78
+ end
90
79
 
91
- RSRuby.instance.source(MARQ.rootdir + '/R/MA.R')
92
- RSRuby.instance.source(MARQ.rootdir + '/R/GEO.R')
93
- @@r = RSRuby.instance
80
+ # ID are DNA bases
81
+ def self.dna_sequence?(ids)
82
+ ids.compact.select{|id| ! id.strip.match(/^[ATCG]+$/i)}.empty?
94
83
  end
95
- @@r
96
- end
97
84
 
85
+ # Guess the format of the id in the list. The name parameter can be used to
86
+ # identify some exceptions
87
+ def self.guessIds(genes,org, name = nil)
88
+ @@formats[org] ||= Organism.id_formats(org)
89
+ if consecutive?(genes) || dna_sequence?(genes) || (numerical?(genes) && (name.nil? || !name.match(/entrez/i)))
90
+ id = nil
91
+ else
92
+ fix = ID_FIX[(org + "_" + name.downcase).to_sym] if name
93
+ if fix
94
+ genes = genes.collect{|gene| fix.call(gene)}
95
+ end
96
+ id = Organism.guessIdFormat(@@formats[org], genes)
97
+ end
98
98
 
99
- #{{{ Process
99
+ id
100
+ end
100
101
 
101
- def self.get_GPL(name, prefix, id_field = nil)
102
- r.GEO_GPL_process(name, prefix, id_field, CACHE_DIR)
103
- end
104
102
 
105
- def self.get_GDS(name, prefix, id_field = nil, id_file = nil)
106
- r.GEO_GDS_process(name, prefix, id_field, id_file, CACHE_DIR)
107
- end
108
103
 
109
- def self.get_GSE(gsms, conditions, do_log, prefix, id_file = nil, fields= nil, title = nil, description = nil)
110
- r.GEO_GSE_process(gsms, conditions, prefix, do_log, id_file, fields, title, description, CACHE_DIR)
111
- end
104
+ def self.GSE(series)
105
+ soft = get_soft(series)
112
106
 
113
- def self.GSE_info(series)
114
- soft = get_soft(series)
115
- raise "SOFT file error" if soft !~ /!/
107
+ if match = soft.scan(/!Series_platform_id\s*=?\s*(.*)/)
108
+ platform = match.flatten.collect{|p| p.strip}
109
+ else
110
+ raise "No Platform information"
111
+ end
116
112
 
117
- if match = soft.scan(/!Series_platform_id\s*=?\s*(.*)/)
118
- platform = match.flatten.collect{|p| p.strip}
119
- else
120
- raise "No Platform information"
121
- end
113
+ if soft.match(/!Series_title \s*=?\s*(.*)/)
114
+ title = $1
115
+ else
116
+ raise "No Title information"
117
+ end
122
118
 
123
- if soft.match(/!Series_title \s*=?\s*(.*)/)
124
- title = $1
125
- else
126
- raise "No Title information"
127
- end
119
+ if soft.match(/!Series_summary \s*=?\s*(.*)/)
120
+ matches = soft.scan(/!Series_summary \s*=?\s*(.*)/).to_a
121
+ description = matches.collect{|m| m.to_s.strip.sub(/!Series_summary \s*=?\s*/,'')}.join("\n")
122
+ else
123
+ raise "No Summary information"
124
+ end
128
125
 
129
- if soft.match(/!Series_summary \s*=?\s*(.*)/)
130
- matches = soft.scan(/!Series_summary \s*=?\s*(.*)/).to_a
131
- description = matches.collect{|m| m.to_s.strip.sub(/!Series_summary \s*=?\s*/,'')}.join("\n")
132
- else
133
- raise "No Summary information"
134
- end
126
+ if soft.match(/!Series_sample_id \s*=?\s*(.*)/)
127
+ matches = soft.scan(/!Series_sample_id \s*=?\s*(.*)/).to_a
128
+ samples = matches.collect{|m| m.to_s.strip.sub(/!Series_sample_id \s*=?\s*/,'')}
129
+ else
130
+ raise "No Summary information"
131
+ end
135
132
 
136
- if soft.match(/!Series_sample_id \s*=?\s*(.*)/)
137
- matches = soft.scan(/!Series_sample_id \s*=?\s*(.*)/).to_a
138
- samples = matches.collect{|m| m.to_s.strip.sub(/!Series_sample_id \s*=?\s*/,'')}
139
- else
140
- raise "No Summary information"
133
+ {
134
+ :platform => platform.join("_"),
135
+ :description =>description.strip,
136
+ :title => title.strip,
137
+ :samples => samples,
138
+ }
141
139
  end
142
140
 
143
- {
144
- :platform => platform.join("_"),
145
- :description =>description.strip,
146
- :title => title.strip,
147
- :samples => samples,
148
- }
149
- end
141
+ def self.GSM(array)
142
+ soft = get_soft(array)
150
143
 
151
- def self.GSM_info(array)
152
- soft = get_soft(array)
144
+ if soft.match(/!Sample_title\s*=?\s*(.*)/)
145
+ title = $1
146
+ else
147
+ raise "No Title information"
148
+ end
153
149
 
154
- if soft.match(/!Sample_title\s*=?\s*(.*)/)
155
- title = $1
156
- else
157
- raise "No Title information"
158
- end
159
150
 
151
+ if soft.match(/!Sample_description \s*=?\s*(.*)/)
152
+ description = $1
153
+ else
154
+ raise "No Description information"
155
+ end
160
156
 
161
- if soft.match(/!Sample_description \s*=?\s*(.*)/)
162
- description = $1
163
- else
164
- raise "No Description information"
157
+ {
158
+
159
+ :description =>description.strip,
160
+ :title => title.strip,
161
+ }
165
162
  end
166
163
 
167
- {
168
-
169
- :description =>description.strip,
170
- :title => title.strip,
171
- }
172
- end
164
+ def self.GPL(platform)
165
+ if !File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")) &&
166
+ !File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
167
+ begin
168
+ if platform =~ /_/
169
+ organism = GPL(platform.match(/(.*?)_/)[1])[:organism]
173
170
 
174
- def self.GPL_id_fields(platform)
175
- soft = get_soft(platform)
176
- data = soft.split(/!platform_table_begin/s)[1].collect{|l| l.chomp.split(/\t/)}
177
- data.shift
178
- data.shift
179
- end
180
-
181
- def self.GPL_info(platform)
182
- if !File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")) &&
183
- !File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
184
- begin
185
- if platform =~ /_/
186
- organism = GPL_info(platform.match(/(.*?)_/)[1])[:organism]
187
-
188
- info = {
189
- :organism => organism,
190
- :title => "Merged platforms #{ platform }",
191
- }
192
- return info
193
- end
194
- soft = get_soft(platform)
171
+ info = {
172
+ :organism => organism,
173
+ :title => "Merged platforms #{ platform }",
174
+ }
175
+ return info
176
+ end
177
+ soft = get_soft(platform)
195
178
 
196
179
 
197
- raise "SOFT file error" if soft !~ /!/
180
+ raise "SOFT file error" if soft !~ /!/
198
181
 
199
- organisms = soft.scan(/!Platform_organism\s*=\s*(.*)/).collect{|v| v.first.strip}
182
+ organisms = soft.scan(/!Platform_organism\s*=\s*(.*)/).collect{|v| v.first.strip}
200
183
 
201
- if organisms.empty?
202
- raise "No Organism information"
203
- else
204
- # This might happen actually GPL2529
205
- organisms.delete('Schizosaccharomyces pombe') if organisms.include?('Saccharomyces cerevisiae')
206
- org_name = organisms.first
207
- end
184
+ if organisms.empty?
185
+ raise "No Organism information"
186
+ else
187
+ # This might happen actually GPL2529
188
+ organisms.delete('Schizosaccharomyces pombe') if organisms.include?('Saccharomyces cerevisiae')
189
+ org_name = organisms.first
190
+ end
208
191
 
209
192
 
210
- title = ""
211
- if soft.match(/!Platform_title\s*=\s*(.*)/)
212
- title = $1
213
- end
193
+ title = ""
194
+ if soft.match(/!Platform_title\s*=\s*(.*)/)
195
+ title = $1
196
+ end
214
197
 
215
- org = Organism.name2org(org_name)
216
- raise "Organism not identified" if org.nil?
217
-
218
- if soft.match(/!platform_table_begin/)
219
- data = soft.split(/!platform_table_begin/s)[1].collect{|l| l.chomp.split(/\t/)}
220
- data.shift
221
- names = data.shift
222
- total = data.first.length
223
- genes = data.sort_by{ rand }[1..1000].collect{|v| v.first}
224
-
225
- id = guessIds(genes,org, names.first)
226
- other = nil
227
- other_pos = 0
228
- other_count = 0
229
- other_name = 0
230
- if id.nil?
231
- (1..total - 1).to_a.each{|num|
232
- genes = data.collect{|v| v[num]}
233
- other = guessIds(genes,org, name = names[num])
234
-
235
- if other && other[1] > other_count
236
- other_pos = num
237
- other_count = other[1]
238
- other_name = names[num]
239
- end
240
- }
198
+ org = Organism.name2org(org_name)
199
+ raise "Organism not identified: #{org_name}" if org.nil?
200
+
201
+ if soft.match(/!platform_table_begin/)
202
+ data = soft.split(/!platform_table_begin/s)[1].collect{|l| l.chomp.split(/\t/)}
203
+ data.shift
204
+ names = data.shift
205
+ total = data.first.length
206
+ genes = data.sort_by{ rand }[1..1000].collect{|v| v.first}
207
+
208
+ id = guessIds(genes,org, names.first)
209
+ other = nil
210
+ other_pos = 0
211
+ other_count = 0
212
+ other_name = 0
213
+ if id.nil?
214
+ (1..total - 1).to_a.each{|num|
215
+ genes = data.collect{|v| v[num]}
216
+ other = guessIds(genes,org, name = names[num])
217
+
218
+ if other && other[1] > other_count
219
+ other_pos = num
220
+ other_count = other[1]
221
+ other_name = names[num]
222
+ end
223
+ }
224
+ end
225
+ else
226
+ raise "Soft file incomplete"
241
227
  end
242
- else
243
- raise "Soft file incomplete"
244
- end
245
228
 
246
- info = {:organism => org, :BioMart_ID => id ? id.first : nil, :title => title }
247
- info[:other_ID_field] = [other_pos + 1, other_name] if other_pos > 0
229
+ info = {:organism => org, :BioMart_ID => id ? id.first : nil, :title => title }
230
+ info[:other_ID_field] = [other_pos + 1, other_name] if other_pos > 0
248
231
 
249
232
 
250
- Open.write(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml"), info.to_yaml)
251
- rescue Exception
252
- puts $!.message
253
- puts $!.backtrace
254
- Open.write(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"), $!.message)
233
+ Open.write(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml"), info.to_yaml)
234
+ rescue Exception
235
+ puts $!.message
236
+ puts $!.backtrace
237
+ Open.write(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"), $!.message)
238
+ end
255
239
  end
240
+
241
+ raise "Platform info for #{ platform } is not available and could not be automatically produced." if File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
242
+
243
+ YAML::load(File.open(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")))
256
244
  end
257
245
 
258
- raise "Platform info for #{ platform } is not available and could not be automatically produced." if File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
259
246
 
260
- YAML::load(File.open(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")))
261
247
  end
262
248
 
263
- def self.GDS_info(name)
264
- begin
265
- title, description = Open.read(dataset_path(name) + '.description').split(/\n--\n/).values_at(0,1)
266
- {:title => title.strip, :description => description.strip}
267
- rescue Exception
268
- puts $!.message
269
- {:title => "" , :description => "" }
270
- end
271
249
 
272
- end
250
+ #{{{ Process
273
251
 
252
+ # Use R to load and process the datasets
253
+ module Process
274
254
 
275
- #{{{ Misc Info
255
+ # R library wrapper
256
+ module R
257
+ @@r = nil
276
258
 
277
- def self.clean(name)
278
- name.sub(/_cross_platform/,'') if name
279
- end
259
+ # Get the R instance
260
+ def self.r
261
+ if @@r.nil?
280
262
 
281
- def self.platform_path(platform)
282
- File.join(MARQ.datadir, "GEO/#{clean(platform)}")
283
- end
263
+ # FIXME: RSruby does not install very well, so this require is hidden here.
264
+ require 'rsruby'
284
265
 
285
- def self.dataset_path(dataset, platform = nil)
286
- if platform
287
- return Dir.glob(File.join(platform_path(clean(platform)),"/*/#{ dataset }")).first.match(/(.*)\./)[1]
288
- else
289
- files = Dir.glob(File.join(MARQ.datadir, "GEO/GPL*/*/#{ dataset }.*"))
290
- if files.any?
291
- return files.first.match(/(.*)\./)[1]
292
- else
293
- return ""
266
+ RSRuby.instance.source(MARQ.rootdir + '/R/MA.R')
267
+ RSRuby.instance.source(MARQ.rootdir + '/R/GEO.R')
268
+ RSRuby.instance.source(MARQ.rootdir + '/R/GEOquery_patch.R')
269
+ @@r = RSRuby.instance
270
+ end
271
+ @@r
294
272
  end
295
- end
296
- end
297
273
 
298
- def self.is_cross_platform?(dataset)
299
- dataset =~ /_cross_platform/
300
- end
274
+ # Use R to load GPL info
275
+ def self.GPL(name, prefix, id_field = nil)
276
+ r.GEO_GPL_process(name, prefix, id_field, CACHE_DIR)
277
+ end
301
278
 
302
- def self.has_cross_platform?(dataset = nil, platform = nil)
303
- platform = clean(platform)
304
- raise "Dataset #{ dataset } not found" if dataset && dataset_path(dataset, platform).nil?
305
- raise "Platform #{ platform } not found" if platform && platform_path(platform).nil?
306
- if dataset
307
- File.exists?(dataset_path(dataset, platform) + "_cross_platform.orders")
308
- else
309
- Dir.glob(File.join(platform_path(platform), '*', '*_cross_platform.orders')).any?
310
- end
311
- end
279
+ # Use R to load and process the dataset
280
+ def self.GDS(name, prefix, id_field = nil, id_file = nil)
281
+ r.GEO_GDS_process(name, prefix, id_field, id_file, CACHE_DIR)
282
+ end
312
283
 
284
+ # Use R to load and process the series
285
+ def self.GSE(gsms, conditions, do_log, prefix, id_file = nil, fields= nil, title = nil, description = nil)
286
+ r.GEO_GSE_process(gsms, conditions, prefix, do_log, id_file, fields, title, description, CACHE_DIR)
287
+ end
288
+ end
313
289
 
314
- def self.platform_datasets(platform)
315
- Dir.glob(File.join(platform_path(platform),"*/*.orders")).collect{|f| File.basename(f).sub(/.orders$/,'')}.select{|d| !is_cross_platform?(d)}
316
- end
290
+ def self.translate(org, list)
291
+ begin
292
+ ID.translate_DB(org, list)
293
+ rescue
294
+ puts "DB translation failed, resorting to index"
295
+ ID.translate_index(org, list)
296
+ end
297
+ end
317
298
 
318
- def self.dataset_platform(dataset)
319
- dataset_path(dataset).match(/(GPL\d+)/)
320
- $1
321
- end
299
+ # Rearrange the lines of a file with the given order. The order specifies, for
300
+ # each position in the original file, where it should end up in the final file
301
+ def self.rearange(order, file, missing = "NA")
302
+ orig_lines = []
303
+ File.open(file).each_line{|l| orig_lines << l}
322
304
 
323
- def self.organism_platforms(organism)
324
- Dir.glob(File.join(MARQ.datadir, "GEO/GPL*")).collect{|f|
325
- File.basename(f)
326
- }.select{|platform|
327
- GPL_info(platform)[:organism] == organism &&
328
- platform_datasets(platform).any?
329
- }
330
- end
305
+ return if orig_lines.empty?
306
+ columns = orig_lines.first.split(/\t/).length
331
307
 
332
- #{{{ Processing
308
+ lines = Array.new(order.length)
333
309
 
334
- def self.process_GDS(dataset, platform, field = nil)
335
- puts "Processing GDS #{ dataset }. Platform #{ platform }"
310
+ orig_lines.each_with_index{|l,i|
311
+ next if order[i].nil?
312
+ lines[order[i]] = l.chomp
313
+ }
336
314
 
337
- puts "-- Original"
338
- prefix = File.join(platform_path(platform), 'GDS', dataset.to_s)
339
- GEO.get_GDS(dataset, prefix, field, nil)
315
+ lines = lines.collect{|l| l || [missing]*columns*"\t"}
340
316
 
341
- # Was there an error?
342
- if File.exist?(prefix + '.skip')
343
- FileUtils.cp(prefix + '.skip', prefix + '_cross_platform.skip')
344
- return
317
+ fout = File.open(file, 'w')
318
+ fout.puts(lines.join("\n"))
319
+ fout.close
345
320
  end
346
321
 
347
- if File.exist?(File.join(platform,'cross_platform'))
348
- puts "-- Translated to cross_platform format"
349
- GEO.get_GDS(dataset, prefix + '_cross_platform', field, File.join(platform_path(platform), 'translations'))
322
+ # Fix possible discrepancies in ids between series and platforms
323
+ def self.fix_GSE_ids(platform_codes_file, prefix)
324
+ platform_codes = File.open(platform_codes_file).collect{|l| l.chomp}
325
+ platform_order = {}
326
+
327
+ platform_codes.each_with_index{|code, i|
328
+ platform_order[code] = i
329
+ }
330
+
331
+ series_codes = File.open(prefix + '.codes').collect{|l| l.chomp}
332
+
333
+ platform_positions = platform_order.values_at(*series_codes)
334
+
335
+ # Fill with nil for missing positions
336
+ platform_positions[platform_codes.length - 1] ||= nil
337
+
338
+ %w(t logratios orders pvalues).each{|ext|
339
+ rearange(platform_positions, prefix + '.' + ext)
340
+ }
341
+
342
+ Open.write(prefix + '.swap', platform_positions.join("\n"))
350
343
  end
351
- end
352
344
 
353
- # Rearange the lines of a file with the given order. The order specifies, for
354
- # each position in the original file, where it should en in the final file
355
- def self.rearange(order, file, missing = "NA")
356
- orig_lines = []
357
- File.open(file).each_line{|l| orig_lines << l}
358
345
 
359
- return if orig_lines.empty?
360
- columns = orig_lines.first.split(/\t/).length
361
-
362
- lines = Array.new(order.length)
346
+ # Process a dataset. Need to specify the platform. The field parameter can
347
+ # be used to use a different column for the field.
348
+ #
349
+ # Deprecated in favor of using the original first column and using a
350
+ # different one only for translation
351
+ def self.GDS(dataset, platform, field = nil)
352
+ puts "Processing GDS #{ dataset }. Platform #{ platform }"
353
+ platform_path = GEO.platform_path(platform)
363
354
 
364
- orig_lines.each_with_index{|l,i|
365
- next if order[i].nil?
366
- lines[order[i]] = l.chomp
367
- }
355
+ puts "-- Original"
356
+ prefix = File.join(platform_path, 'GDS', dataset.to_s)
357
+ R.GDS(dataset, prefix, field, nil)
368
358
 
369
- lines = lines.collect{|l| l || [missing]*columns*"\t"}
359
+ # Was there an error?
360
+ if File.exist?(prefix + '.skip')
361
+ FileUtils.cp(prefix + '.skip', prefix + '_cross_platform.skip')
362
+ return
363
+ end
370
364
 
371
- fout = File.open(file, 'w')
372
- fout.puts(lines.join("\n"))
373
- fout.close
374
- end
365
+ if File.exist?(File.join(platform,'cross_platform'))
366
+ puts "-- Translated to cross_platform format"
367
+ R.GDS(dataset, prefix + '_cross_platform', field, File.join(platform_path, 'translations'))
368
+ end
369
+ end
375
370
 
376
- # Fix possible discrepancies in ids between series and platforms
377
- def self.fix_GSE_ids(platform_codes_file, prefix)
378
- platform_codes = File.open(platform_codes_file).collect{|l| l.chomp}
379
- platform_order = {}
380
-
381
- platform_codes.each_with_index{|code, i|
382
- platform_order[code] = i
383
- }
371
+ # Process a series. The info parameter is a hash with the :arrays,
372
+ # :platform, :log2 and :fields keys
373
+ def self.GSE(series, info)
374
+ return if Dir.glob(File.join(info[:platform], 'GSE', series) + '.*').any?
384
375
 
385
- series_codes = File.open(prefix + '.codes').collect{|l| l.chomp}
386
376
 
387
- platform_positions = platform_order.values_at(*series_codes)
377
+ gsms = []
378
+ conditions = {}
379
+ info[:arrays].each{|gsm, cond|
380
+ gsms << gsm
381
+ cond.each{|condition, value|
382
+ conditions[condition] ||= []
383
+ conditions[condition] << value
384
+ }
385
+ }
386
+ platform = info[:platform]
387
+ do_log = nil
388
+ do_log = !info[:log2] if info[:log2]
389
+ fields = info[:fields]
390
+
391
+ puts "Processing GSE #{ series }. Platform #{ platform }"
392
+
393
+ platform_path = GEO::platform_path(platform)
394
+ prefix = File.join(platform_path, 'GSE', series.to_s)
395
+ puts "-- Original"
396
+ R.GSE(gsms, conditions, do_log, prefix, nil, fields, info[:title], info[:description])
397
+
398
+ # Was there an error?
399
+ if File.exist?(prefix + '.skip')
400
+ FileUtils.cp(prefix + '.skip', prefix + '_cross_platform.skip')
401
+ return
402
+ end
388
403
 
389
- # Fill with nil for missing positions
390
- platform_positions[platform_codes.length - 1] ||= nil
404
+ if platform =~ /_/
405
+ FileUtils.cp(prefix + '.codes', File.join(platform_path,'codes'))
406
+ codes = Open.read(File.join(platform_path, 'codes')).collect{|l| l.chomp}
407
+ organism = SOFT::GPL(platform.match(/(.*?)_/)[1])[:organism]
408
+ translations = translate(organism, codes)
409
+ Open.write(File.join(platform_path, 'translations'), translations.collect{|v| v || "NO MATCH"}.join("\n"))
410
+ Open.write(File.join(platform_path, 'cross_platform'), translations.compact.sort.uniq.join("\n"))
411
+ else
412
+ # Are the codes of the series equivalent to the ones in the platform?
413
+ if File.open(File.join(platform_path,'codes')).collect{|l| l.chomp} != File.open(prefix + '.codes').collect{|l| l.chomp}
414
+ fix_GSE_ids(File.join(platform_path, 'codes'),prefix);
415
+ FileUtils.cp(File.join(platform_path, 'codes'),prefix + '.codes')
416
+ end
417
+ end
391
418
 
392
- %w(t logratios orders pvalues).each{|ext|
393
- rearange(platform_positions, prefix + '.' + ext)
394
- }
395
419
 
396
- Open.write(prefix + '.swap', platform_positions.join("\n"))
397
- end
420
+ if File.exist?(File.join(platform,'translations'))
421
+ FileUtils.cp(File.join(platform,'translations'), prefix + '.translations')
422
+ if File.exist?(prefix + '.swap')
423
+ orders = Open.read(prefix + '.swap').collect{|l| l.chomp}
424
+ inverse_orders = Array.new(orders.length)
425
+ orders.each_with_index{|pos,i|
426
+ next if pos !~ /\d/
427
+ inverse_orders[pos.to_i] = i
428
+ }
429
+ rearange(inverse_orders, prefix + '.translations', "NO MATCH")
430
+ end
431
+ puts "-- Translated to cross_platform format"
432
+ R.GSE(gsms, conditions, do_log, prefix + '_cross_platform', prefix + '.translations',fields, info[:title], info[:description])
433
+ fix_GSE_ids(File.join(platform_path, 'cross_platform'),prefix + '_cross_platform');
434
+ FileUtils.cp(File.join(platform_path, 'cross_platform'),prefix + '_cross_platform.codes')
435
+ FileUtils.rm(prefix + '.translations') if File.exist?(prefix + '.translations')
436
+ end
437
+ FileUtils.rm(prefix + '.swap') if File.exist?(prefix + '.swap')
438
+ end
439
+
440
+ # Load GPL data. Translates IDS of the platform probes using AILUN and our
441
+ # system (called biomart for clarity)
442
+ def self.GPL(platform)
443
+ path = GEO::platform_path(platform)
444
+ return if File.exist? path
445
+
446
+ if platform =~ /_/
447
+ FileUtils.mkdir(path)
448
+ FileUtils.mkdir(path + '/GSE')
449
+ FileUtils.mkdir(path + '/GDS')
450
+ return
451
+ end
398
452
 
453
+ info = SOFT.GPL(platform)
454
+ organism = info[:organism]
399
455
 
456
+ field = info[:other_ID_field]
457
+ id = info[:BioMart_ID]
458
+ org = info[:organism]
459
+ field = nil if field == ""
460
+ id = nil if id == ""
400
461
 
401
- def self.process_GSE(series, info)
402
- return if Dir.glob(File.join(info[:platform], 'GSE', series) + '.*').any?
403
462
 
404
- gsms = []
405
- conditions = {}
406
- info[:arrays].each{|gsm, cond|
407
- gsms << gsm
408
- cond.each{|condition, value|
409
- conditions[condition] ||= []
410
- conditions[condition] << value
463
+ puts "Processing Platform #{ platform }"
464
+ [platform,
465
+ File.join(path, 'GDS'),
466
+ File.join(path, 'GSE'),
467
+ ].each{|d|
468
+ FileUtils.mkdir d unless File.exist? d
411
469
  }
412
- }
413
- platform = info[:platform]
414
- do_log = nil
415
- do_log = !info[:log2] if info[:log2]
416
- fields = info[:fields]
417
470
 
418
- puts "Processing GSE #{ series }. Platform #{ platform }"
471
+ R.GPL(platform, path, nil)
472
+ FileUtils.mv path + '.codes', File.join(path, 'codes')
419
473
 
420
- prefix = File.join(platform_path(platform), 'GSE', series.to_s)
421
- puts "-- Original"
422
- GEO.get_GSE(gsms, conditions, do_log, prefix, nil, fields, info[:title], info[:description])
423
474
 
424
- # Was there an error?
425
- if File.exist?(prefix + '.skip')
426
- FileUtils.cp(prefix + '.skip', prefix + '_cross_platform.skip')
427
- return
428
- end
475
+ # AILUN translations
476
+ codes = Open.read(File.join(path, 'codes')).collect{|l| l.chomp}
477
+ ailun = ID.AILUN_translate(platform, codes)
478
+ Open.write(File.join(path, 'ailun'), ailun.collect{|v| v || "NO MATCH"}.join("\n")) if ailun.compact.length > codes.length.to_f / 10
429
479
 
430
- if platform =~ /_/
431
- FileUtils.cp(prefix + '.codes', File.join(platform_path(platform),'codes'))
432
- codes = Open.read(File.join(platform_path(platform), 'codes')).collect{|l| l.chomp}
433
- organism = GEO::GPL_info(platform.match(/(.*?)_/)[1])[:organism]
434
- translations = ID.translate(organism, codes)
435
- Open.write(File.join(platform_path(platform), 'translations'), translations.collect{|v| v || "NO MATCH"}.join("\n"))
436
- Open.write(File.join(platform_path(platform), 'cross_platform'), translations.compact.sort.uniq.join("\n"))
437
- else
438
- # Are the codes of the series equivalent to the ones in the platform?
439
- if File.open(File.join(platform_path(platform),'codes')).collect{|l| l.chomp} != File.open(prefix + '.codes').collect{|l| l.chomp}
440
- fix_GSE_ids(File.join(platform_path(platform), 'codes'),prefix);
441
- FileUtils.cp(File.join(platform_path(platform), 'codes'),prefix + '.codes')
480
+ # BioMart translations
481
+ biomart = []
482
+ if id || field
483
+ if id
484
+ codes = Open.read(File.join(path, 'codes')).collect{|l| l.chomp}
485
+ else
486
+ if field
487
+ R.GPL(platform, path, field[0])
488
+ FileUtils.mv path + '.codes', File.join(path, 'other')
489
+ end
442
490
 
491
+ fix = GEO::SOFT::ID_FIX[(organism + "_" + field[1].downcase).to_sym]
492
+ codes = Open.read(File.join(path, 'other')).collect{|l|
493
+ code = l.chomp
494
+ code = fix.call(code) if fix
495
+ code
496
+ }
497
+ end
498
+
499
+ biomart = translate(organism, codes)
500
+ Open.write(File.join(path, 'biomart'), biomart.collect{|v| v || "NO MATCH"}.join("\n")) if biomart.compact.length > codes.length.to_f / 10
443
501
  end
444
- end
445
502
 
503
+ # Select Best and save
504
+ translations = []
505
+ if ailun.compact.uniq.length > biomart.compact.uniq.length
506
+ id_type = ID::DEFAULT_FORMATS[organism] || ID::DEFAULT_FORMAT_ALL || id || field || "Entrez Gene Id"
507
+ if id_type.to_s !~ /Entrez/i
508
+ translations = translate(org,ailun.collect{|gene| gene || "NO MATCH"})
509
+ else
510
+ translations = ailun
511
+ end
512
+ else
513
+ translations = biomart
514
+ end
446
515
 
447
- if File.exist?(File.join(platform,'translations'))
448
- FileUtils.cp(File.join(platform,'translations'), prefix + '.translations')
449
- if File.exist?(prefix + '.swap')
450
- orders = Open.read(prefix + '.swap').collect{|l| l.chomp}
451
- inverse_orders = Array.new(orders.length)
452
- orders.each_with_index{|pos,i|
453
- next if pos !~ /\d/
454
- inverse_orders[pos.to_i] = i
455
- }
456
- rearange(inverse_orders, prefix + '.translations', "NO MATCH")
516
+ if translations.compact.length > codes.length.to_f / 10
517
+ Open.write(File.join(path, 'translations'), translations.collect{|v| v || "NO MATCH"}.join("\n"))
518
+ Open.write(File.join(path, 'cross_platform'), translations.compact.sort.uniq.join("\n"))
457
519
  end
458
- puts "-- Translated to cross_platform format"
459
- GEO.get_GSE(gsms, conditions, do_log, prefix + '_cross_platform', prefix + '.translations',fields, info[:title], info[:description])
460
- fix_GSE_ids(File.join(platform_path(platform), 'cross_platform'),prefix + '_cross_platform');
461
- FileUtils.cp(File.join(platform_path(platform), 'cross_platform'),prefix + '_cross_platform.codes')
462
- FileUtils.rm(prefix + '.translations') if File.exist?(prefix + '.translations')
520
+
463
521
  end
464
- FileUtils.rm(prefix + '.swap') if File.exist?(prefix + '.swap')
522
+
465
523
  end
466
524
 
467
- def self.process_platform(platform)
468
- path = platform_path(platform)
469
- return if File.exist? path
470
525
 
471
- if platform =~ /_/
472
- FileUtils.mkdir(path)
473
- FileUtils.mkdir(path + '/GSE')
474
- FileUtils.mkdir(path + '/GDS')
475
- return
476
- end
477
526
 
478
- info = GEO::GPL_info(platform)
479
- organism = info[:organism]
480
-
481
- field = info[:other_ID_field]
482
- id = info[:BioMart_ID]
483
- org = info[:organism]
484
- field = nil if field == ""
485
- id = nil if id == ""
486
-
487
-
488
- puts "Processing Platform #{ platform }"
489
- [platform,
490
- File.join(platform_path(platform), 'GDS'),
491
- File.join(platform_path(platform), 'GSE'),
492
- ].each{|d|
493
- FileUtils.mkdir d unless File.exist? d
494
- }
527
+ #{{{ Local data store info
495
528
 
496
- get_GPL(platform, platform_path(platform), nil)
497
- FileUtils.mv platform_path(platform) + '.codes', File.join(platform_path(platform), 'codes')
498
-
529
+ def self.clean(name)
530
+ name.sub(/_cross_platform/,'') if name
531
+ end
499
532
 
500
- # AILUN translations
501
- codes = Open.read(File.join(platform_path(platform), 'codes')).collect{|l| l.chomp}
502
- ailun = ID.AILUN_translate(platform, codes)
503
- Open.write(File.join(platform_path(platform), 'ailun'), ailun.collect{|v| v || "NO MATCH"}.join("\n")) if ailun.compact.length > codes.length.to_f / 10
504
533
 
505
- # BioMart translations
506
- biomart = []
507
- if id || field
508
- if id
509
- codes = Open.read(File.join(platform_path(platform), 'codes')).collect{|l| l.chomp}
510
- else
511
- if field
512
- get_GPL(platform, platform_path(platform), field[0])
513
- FileUtils.mv platform_path(platform) + '.codes', File.join(platform_path(platform), 'other')
514
- end
534
+ def self.platform_path(platform)
535
+ File.join(MARQ.datadir, "GEO/#{clean(platform)}")
536
+ end
515
537
 
516
- fix = ID_FIX[(organism + "_" + field[1].downcase).to_sym]
517
- codes = Open.read(File.join(platform_path(platform), 'other')).collect{|l|
518
- code = l.chomp
519
- code = fix.call(code) if fix
520
- code
521
- }
522
- end
523
538
 
524
- biomart = ID.translate(organism, codes)
525
- Open.write(File.join(platform_path(platform), 'biomart'), biomart.collect{|v| v || "NO MATCH"}.join("\n")) if biomart.compact.length > codes.length.to_f / 10
526
- end
539
+ def self.is_cross_platform?(dataset)
540
+ dataset =~ /_cross_platform/
541
+ end
527
542
 
528
- # Select Best and save
529
- translations = []
530
- if ailun.compact.uniq.length > biomart.compact.uniq.length
531
- id_type = ID::DEFAULT_FORMATS[organism] || ID::DEFAULT_FORMAT_ALL || id || field || "Entrez Gene Id"
532
- if id_type.to_s !~ /Entrez/i
533
- translations = ID.translate(org,ailun.collect{|gene| gene || "NO MATCH"})
534
- else
535
- translations = ailun
536
- end
543
+ def self.has_cross_platform?(dataset = nil, platform = nil)
544
+ platform = clean(platform)
545
+ raise "Dataset #{ dataset } not found" if dataset && dataset_path(dataset, platform).nil?
546
+ raise "Platform #{ platform } not found" if platform && platform_path(platform).nil?
547
+ if dataset
548
+ File.exists?(dataset_path(dataset, platform) + "_cross_platform.orders")
537
549
  else
538
- translations = biomart
550
+ Dir.glob(File.join(platform_path(platform), '*', '*_cross_platform.orders')).any?
539
551
  end
552
+ end
540
553
 
541
- if translations.compact.length > codes.length.to_f / 10
542
- Open.write(File.join(platform_path(platform), 'translations'), translations.collect{|v| v || "NO MATCH"}.join("\n"))
543
- Open.write(File.join(platform_path(platform), 'cross_platform'), translations.compact.sort.uniq.join("\n"))
554
+ def self.dataset_path(dataset, platform = nil)
555
+ if platform
556
+ files = Dir.glob(File.join(platform_path(clean(platform)),"/*/#{ dataset }"))
557
+ else
558
+ files = Dir.glob(File.join(MARQ.datadir, "GEO/GPL*/*/#{ dataset }.*"))
544
559
  end
560
+ return nil if files.empty?
561
+ return files.first.match(/(.*)\./)[1]
562
+ end
545
563
 
564
+ def self.organism_platforms(organism)
565
+ Dir.glob(File.join(MARQ.datadir, "GEO/GPL*")).collect{|f|
566
+ File.basename(f)
567
+ }.select{|platform|
568
+ SOFT.GPL(platform)[:organism] == organism &&
569
+ platform_datasets(platform).any?
570
+ }
546
571
  end
547
572
 
548
573
 
549
- def self.process_platform_datasets(platform, force = false)
550
- raise "Platform #{ platform } not ready" unless File.exist? platform_path(platform)
551
574
 
552
- info = YAML::load(File.open(File.join(MARQ.datadir, "GEO/platforms/#{platform}.yaml")))
575
+ def self.platform_datasets(platform)
576
+ Dir.glob(File.join(platform_path(platform),"*/*.orders")).collect{|f| File.basename(f).sub(/.orders$/,'')}.select{|d| !is_cross_platform?(d)}
577
+ end
553
578
 
554
- datasets = GEO::Eutils::GPL_datasets(platform)
555
- datasets.each{|dataset|
556
- next if Dir.glob(File.join(platform_path(platform), 'GDS', dataset) + '.*').any? && ! force
557
- process_GDS(dataset, platform, nil)
558
- }
579
+ def self.dataset_platform(dataset)
580
+ dataset_path(dataset).match(/(GPL\d+)/)
581
+ $1
559
582
  end
560
583
 
584
+ def self.GDS_info(name)
585
+ begin
586
+ title, description = Open.read(dataset_path(name) + '.description').split(/\n--\n/).values_at(0,1)
587
+ {:title => title.strip, :description => description.strip}
588
+ rescue Exception
589
+ puts $!.message
590
+ {:title => "" , :description => "" }
591
+ end
592
+
593
+ end
594
+
595
+
561
596
  end
562
597
 
563
- if __FILE__ == $0
564
598
 
565
- p GEO.GPL_info('GPL920_GPL927')
566
- p GEO.GPL_id_fields('GPL920')
567
- puts GEO.GSE_info('GSE962')
568
- puts GEO.GSE_info('GSE8982')
569
- puts GEO::Eutils.GSE_dataset?('GSE8982')
570
- puts GEO::Eutils.GSE_dataset?('GSE962')
571
-
572
- exit
573
-
574
- #puts GEO::dataset_path('GDS1103').inspect
575
- #puts GEO::dataset_platform('GDS1103').inspect
576
-
577
- # puts GEO.dataset_path('GDS2931')
578
- # puts GEO.platform_datasets('GPL91')
579
- # puts GEO.platform_datasets('GPL91').select{|d| GEO.has_cross_platform?(d)}
580
- #
581
- # gpls = Open.read('ftp://ftp.ncbi.nih.gov/pub/geo/DATA/supplementary/PLATFORMS.txt').collect{|l|
582
- # l.chomp.split.first
583
- # }
584
- #
585
- # %w(GPL85).each{|gpl|
586
- # puts gpl
587
- # puts GEO::GPL_info(gpl).inspect if gpl =~ /GPL/
588
- # }
589
- #
590
- #puts GEO::GSM_info('GSM70604').inspect
591
-
592
- p GEO::Eutils.organism_platforms('human')
599
+ if __FILE__ == $0
593
600
 
594
601
  end
602
+