rbbt-marq 1.0.9 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/MARQ/GEO.rb CHANGED
@@ -1,594 +1,602 @@
1
1
  require 'MARQ'
2
- require 'rbbt/util/open'
3
2
  require 'rbbt/sources/organism'
4
3
 
4
+ # Work with GEO datasets
5
5
  module GEO
6
6
 
7
- CACHE_DIR = File.join(MARQ.cachedir,'GEO')
8
- FileUtils.mkdir_p CACHE_DIR unless File.exists? CACHE_DIR
9
-
10
- GEO_SOFT="http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&view=full&form=text&acc="
11
- def self.get_soft(item)
12
- item = item.strip
13
- cache_file = File.join(CACHE_DIR, item + '.soft')
14
- if File.exist?( cache_file )
15
- File.open(cache_file).read
16
- else
17
- content = Open.read(GEO_SOFT + item, :nocache => true)
18
- fout = File.open(cache_file,'w')
19
- fout.write content
20
- fout.close
21
- content
22
- end
23
- end
7
+ # Get information from Entrez
8
+ module Remote
24
9
 
25
- #{{{ Eutils
26
- module Eutils
27
10
  def self.organism_platforms(org)
28
11
  name = Organism.name(org)
29
12
  Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=\"#{name}\"[Organism:exp]+AND+%22gpl%22[Filter]&retmax=10000").
30
13
  scan(/<Id>(\d+?)<\/Id>/).collect{|id| id.first}.collect{|id| "GPL#{id.sub(/^100*/,'')}"}
31
14
  end
32
15
 
33
- def self.GPL_datasets(platform)
16
+ def self.platform_datasets(platform)
34
17
  Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=#{platform}[Accession]&retmax=2000").
35
18
  scan(/<Id>(\d+?)<\/Id>/).collect{|id| id.first}.select{|id| !id.match(/^(1|2)000/) }.collect{|id| "GDS#{id}"}
36
19
  end
37
20
 
38
- def self.GSE_dataset?(gse)
21
+ def self.dataset_platform(dataset)
22
+ if dataset =~ /GSE/
23
+ Open.read("http://www.ncbi.nlm.nih.gov/projects/geo/query/acc.cgi?acc=#{dataset}").scan(/GPL\d+/).uniq.sort.join("_")
24
+ else
25
+ Open.read("http://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=#{dataset}").scan(/GPL\d+/).uniq.sort.join("_")
26
+ end
27
+ end
28
+
29
+ def self.series_dataset?(gse)
39
30
  Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=geo&term=#{gse}[Accession]&retmax=2000").
40
31
  match(/<Id>(\d+?)<\/Id>/) != nil
41
32
  end
42
33
 
43
34
  end
44
35
 
36
+ CACHE_DIR = File.join(MARQ.cachedir,'GEO')
37
+ FileUtils.mkdir_p CACHE_DIR unless File.exists? CACHE_DIR
45
38
 
46
39
 
47
- #{{{ Helper functions
48
-
40
+ # Parse information in .soft files
41
+ module SOFT
49
42
 
50
- def self.consecutive?(ids)
51
- ids.collect{|id| id.to_i}.sort[0..19] == (1..20).to_a
52
- end
43
+ GEO_SOFT="http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&view=full&form=text&acc="
53
44
 
54
- def self.numerical?(ids)
55
- ids.compact.select{|id| ! id.match(/^\d+$/)}.uniq.length < ids.length.to_f / 10
56
- end
45
+ # Download a soft file. Uses cache
46
+ def self.get_soft(item)
47
+ item = item.strip
48
+ cache_file = File.join(CACHE_DIR, item + '.soft')
49
+ if File.exist?( cache_file )
50
+ File.open(cache_file).read
51
+ else
52
+ content = Open.read(GEO_SOFT + item, :nocache => true)
53
+ raise "SOFT file error" if content !~ /!/
54
+ fout = File.open(cache_file,'w')
55
+ fout.write content
56
+ fout.close
57
+ content
58
+ end
59
+ end
57
60
 
58
- def self.dna_sequence?(ids)
59
- ids.compact.select{|id| ! id.strip.match(/^[ATCG]+$/i)}.empty?
60
- end
61
+ #{{{ Guess the format of the IDS
61
62
 
63
+ @@formats = {}
62
64
 
63
- ID_FIX = {
64
- :mgi_unigene => proc{|gene| if gene then gene.match(/^Mm./) ? gene : "Mm." + gene end},
65
- :human_unigene => proc{|gene| if gene then gene.match(/^Hs./) ? gene : "Hs." + gene end},
66
- }
65
+ ID_FIX = {
66
+ :mgi_unigene => proc{|gene| if gene then gene.match(/^Mm./) ? gene : "Mm." + gene end},
67
+ :human_unigene => proc{|gene| if gene then gene.match(/^Hs./) ? gene : "Hs." + gene end},
68
+ }
67
69
 
68
- @@formats = {}
69
- def self.guessIds(genes,org, name = nil)
70
- @@formats[org] ||= Organism.id_formats(org)
71
- if consecutive?(genes) || dna_sequence?(genes) || (numerical?(genes) && (name.nil? || !name.match(/entrez/i)))
72
- id = nil
73
- else
74
- fix = ID_FIX[(org + "_" + name.downcase).to_sym] if name
75
- if fix
76
- genes = genes.collect{|gene| fix.call(gene)}
77
- end
78
- id = Organism.guessIdFormat(@@formats[org], genes)
70
+ # Id list is in sequence
71
+ def self.consecutive?(ids)
72
+ ids.collect{|id| id.to_i}.sort[0..19] == (1..20).to_a
79
73
  end
80
-
81
- id
82
- end
83
-
84
- @@r = nil
85
- def self.r
86
- if @@r.nil?
87
74
 
88
- # FIXME: RSruby does not install very well, so this require is hidden here.
89
- require 'rsruby'
75
+ # Id list is numerical
76
+ def self.numerical?(ids)
77
+ ids.compact.select{|id| ! id.match(/^\d+$/)}.uniq.length < ids.length.to_f / 10
78
+ end
90
79
 
91
- RSRuby.instance.source(MARQ.rootdir + '/R/MA.R')
92
- RSRuby.instance.source(MARQ.rootdir + '/R/GEO.R')
93
- @@r = RSRuby.instance
80
+ # IDs are DNA bases
81
+ def self.dna_sequence?(ids)
82
+ ids.compact.select{|id| ! id.strip.match(/^[ATCG]+$/i)}.empty?
94
83
  end
95
- @@r
96
- end
97
84
 
85
+ # Guess the format of the id in the list. The name parameter can be used to
86
+ # identify some exceptions
87
+ def self.guessIds(genes,org, name = nil)
88
+ @@formats[org] ||= Organism.id_formats(org)
89
+ if consecutive?(genes) || dna_sequence?(genes) || (numerical?(genes) && (name.nil? || !name.match(/entrez/i)))
90
+ id = nil
91
+ else
92
+ fix = ID_FIX[(org + "_" + name.downcase).to_sym] if name
93
+ if fix
94
+ genes = genes.collect{|gene| fix.call(gene)}
95
+ end
96
+ id = Organism.guessIdFormat(@@formats[org], genes)
97
+ end
98
98
 
99
- #{{{ Process
99
+ id
100
+ end
100
101
 
101
- def self.get_GPL(name, prefix, id_field = nil)
102
- r.GEO_GPL_process(name, prefix, id_field, CACHE_DIR)
103
- end
104
102
 
105
- def self.get_GDS(name, prefix, id_field = nil, id_file = nil)
106
- r.GEO_GDS_process(name, prefix, id_field, id_file, CACHE_DIR)
107
- end
108
103
 
109
- def self.get_GSE(gsms, conditions, do_log, prefix, id_file = nil, fields= nil, title = nil, description = nil)
110
- r.GEO_GSE_process(gsms, conditions, prefix, do_log, id_file, fields, title, description, CACHE_DIR)
111
- end
104
+ def self.GSE(series)
105
+ soft = get_soft(series)
112
106
 
113
- def self.GSE_info(series)
114
- soft = get_soft(series)
115
- raise "SOFT file error" if soft !~ /!/
107
+ if match = soft.scan(/!Series_platform_id\s*=?\s*(.*)/)
108
+ platform = match.flatten.collect{|p| p.strip}
109
+ else
110
+ raise "No Platform information"
111
+ end
116
112
 
117
- if match = soft.scan(/!Series_platform_id\s*=?\s*(.*)/)
118
- platform = match.flatten.collect{|p| p.strip}
119
- else
120
- raise "No Platform information"
121
- end
113
+ if soft.match(/!Series_title \s*=?\s*(.*)/)
114
+ title = $1
115
+ else
116
+ raise "No Title information"
117
+ end
122
118
 
123
- if soft.match(/!Series_title \s*=?\s*(.*)/)
124
- title = $1
125
- else
126
- raise "No Title information"
127
- end
119
+ if soft.match(/!Series_summary \s*=?\s*(.*)/)
120
+ matches = soft.scan(/!Series_summary \s*=?\s*(.*)/).to_a
121
+ description = matches.collect{|m| m.to_s.strip.sub(/!Series_summary \s*=?\s*/,'')}.join("\n")
122
+ else
123
+ raise "No Summary information"
124
+ end
128
125
 
129
- if soft.match(/!Series_summary \s*=?\s*(.*)/)
130
- matches = soft.scan(/!Series_summary \s*=?\s*(.*)/).to_a
131
- description = matches.collect{|m| m.to_s.strip.sub(/!Series_summary \s*=?\s*/,'')}.join("\n")
132
- else
133
- raise "No Summary information"
134
- end
126
+ if soft.match(/!Series_sample_id \s*=?\s*(.*)/)
127
+ matches = soft.scan(/!Series_sample_id \s*=?\s*(.*)/).to_a
128
+ samples = matches.collect{|m| m.to_s.strip.sub(/!Series_sample_id \s*=?\s*/,'')}
129
+ else
130
+ raise "No Summary information"
131
+ end
135
132
 
136
- if soft.match(/!Series_sample_id \s*=?\s*(.*)/)
137
- matches = soft.scan(/!Series_sample_id \s*=?\s*(.*)/).to_a
138
- samples = matches.collect{|m| m.to_s.strip.sub(/!Series_sample_id \s*=?\s*/,'')}
139
- else
140
- raise "No Summary information"
133
+ {
134
+ :platform => platform.join("_"),
135
+ :description =>description.strip,
136
+ :title => title.strip,
137
+ :samples => samples,
138
+ }
141
139
  end
142
140
 
143
- {
144
- :platform => platform.join("_"),
145
- :description =>description.strip,
146
- :title => title.strip,
147
- :samples => samples,
148
- }
149
- end
141
+ def self.GSM(array)
142
+ soft = get_soft(array)
150
143
 
151
- def self.GSM_info(array)
152
- soft = get_soft(array)
144
+ if soft.match(/!Sample_title\s*=?\s*(.*)/)
145
+ title = $1
146
+ else
147
+ raise "No Title information"
148
+ end
153
149
 
154
- if soft.match(/!Sample_title\s*=?\s*(.*)/)
155
- title = $1
156
- else
157
- raise "No Title information"
158
- end
159
150
 
151
+ if soft.match(/!Sample_description \s*=?\s*(.*)/)
152
+ description = $1
153
+ else
154
+ raise "No Description information"
155
+ end
160
156
 
161
- if soft.match(/!Sample_description \s*=?\s*(.*)/)
162
- description = $1
163
- else
164
- raise "No Description information"
157
+ {
158
+
159
+ :description =>description.strip,
160
+ :title => title.strip,
161
+ }
165
162
  end
166
163
 
167
- {
168
-
169
- :description =>description.strip,
170
- :title => title.strip,
171
- }
172
- end
164
+ def self.GPL(platform)
165
+ if !File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")) &&
166
+ !File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
167
+ begin
168
+ if platform =~ /_/
169
+ organism = GPL(platform.match(/(.*?)_/)[1])[:organism]
173
170
 
174
- def self.GPL_id_fields(platform)
175
- soft = get_soft(platform)
176
- data = soft.split(/!platform_table_begin/s)[1].collect{|l| l.chomp.split(/\t/)}
177
- data.shift
178
- data.shift
179
- end
180
-
181
- def self.GPL_info(platform)
182
- if !File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")) &&
183
- !File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
184
- begin
185
- if platform =~ /_/
186
- organism = GPL_info(platform.match(/(.*?)_/)[1])[:organism]
187
-
188
- info = {
189
- :organism => organism,
190
- :title => "Merged platforms #{ platform }",
191
- }
192
- return info
193
- end
194
- soft = get_soft(platform)
171
+ info = {
172
+ :organism => organism,
173
+ :title => "Merged platforms #{ platform }",
174
+ }
175
+ return info
176
+ end
177
+ soft = get_soft(platform)
195
178
 
196
179
 
197
- raise "SOFT file error" if soft !~ /!/
180
+ raise "SOFT file error" if soft !~ /!/
198
181
 
199
- organisms = soft.scan(/!Platform_organism\s*=\s*(.*)/).collect{|v| v.first.strip}
182
+ organisms = soft.scan(/!Platform_organism\s*=\s*(.*)/).collect{|v| v.first.strip}
200
183
 
201
- if organisms.empty?
202
- raise "No Organism information"
203
- else
204
- # This might happen actually GPL2529
205
- organisms.delete('Schizosaccharomyces pombe') if organisms.include?('Saccharomyces cerevisiae')
206
- org_name = organisms.first
207
- end
184
+ if organisms.empty?
185
+ raise "No Organism information"
186
+ else
187
+ # This might happen actually GPL2529
188
+ organisms.delete('Schizosaccharomyces pombe') if organisms.include?('Saccharomyces cerevisiae')
189
+ org_name = organisms.first
190
+ end
208
191
 
209
192
 
210
- title = ""
211
- if soft.match(/!Platform_title\s*=\s*(.*)/)
212
- title = $1
213
- end
193
+ title = ""
194
+ if soft.match(/!Platform_title\s*=\s*(.*)/)
195
+ title = $1
196
+ end
214
197
 
215
- org = Organism.name2org(org_name)
216
- raise "Organism not identified" if org.nil?
217
-
218
- if soft.match(/!platform_table_begin/)
219
- data = soft.split(/!platform_table_begin/s)[1].collect{|l| l.chomp.split(/\t/)}
220
- data.shift
221
- names = data.shift
222
- total = data.first.length
223
- genes = data.sort_by{ rand }[1..1000].collect{|v| v.first}
224
-
225
- id = guessIds(genes,org, names.first)
226
- other = nil
227
- other_pos = 0
228
- other_count = 0
229
- other_name = 0
230
- if id.nil?
231
- (1..total - 1).to_a.each{|num|
232
- genes = data.collect{|v| v[num]}
233
- other = guessIds(genes,org, name = names[num])
234
-
235
- if other && other[1] > other_count
236
- other_pos = num
237
- other_count = other[1]
238
- other_name = names[num]
239
- end
240
- }
198
+ org = Organism.name2org(org_name)
199
+ raise "Organism not identified: #{org_name}" if org.nil?
200
+
201
+ if soft.match(/!platform_table_begin/)
202
+ data = soft.split(/!platform_table_begin/s)[1].collect{|l| l.chomp.split(/\t/)}
203
+ data.shift
204
+ names = data.shift
205
+ total = data.first.length
206
+ genes = data.sort_by{ rand }[1..1000].collect{|v| v.first}
207
+
208
+ id = guessIds(genes,org, names.first)
209
+ other = nil
210
+ other_pos = 0
211
+ other_count = 0
212
+ other_name = 0
213
+ if id.nil?
214
+ (1..total - 1).to_a.each{|num|
215
+ genes = data.collect{|v| v[num]}
216
+ other = guessIds(genes,org, name = names[num])
217
+
218
+ if other && other[1] > other_count
219
+ other_pos = num
220
+ other_count = other[1]
221
+ other_name = names[num]
222
+ end
223
+ }
224
+ end
225
+ else
226
+ raise "Soft file incomplete"
241
227
  end
242
- else
243
- raise "Soft file incomplete"
244
- end
245
228
 
246
- info = {:organism => org, :BioMart_ID => id ? id.first : nil, :title => title }
247
- info[:other_ID_field] = [other_pos + 1, other_name] if other_pos > 0
229
+ info = {:organism => org, :BioMart_ID => id ? id.first : nil, :title => title }
230
+ info[:other_ID_field] = [other_pos + 1, other_name] if other_pos > 0
248
231
 
249
232
 
250
- Open.write(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml"), info.to_yaml)
251
- rescue Exception
252
- puts $!.message
253
- puts $!.backtrace
254
- Open.write(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"), $!.message)
233
+ Open.write(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml"), info.to_yaml)
234
+ rescue Exception
235
+ puts $!.message
236
+ puts $!.backtrace
237
+ Open.write(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"), $!.message)
238
+ end
255
239
  end
240
+
241
+ raise "Platform info for #{ platform } is not available and could not be automatically produced." if File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
242
+
243
+ YAML::load(File.open(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")))
256
244
  end
257
245
 
258
- raise "Platform info for #{ platform } is not available and could not be automatically produced." if File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
259
246
 
260
- YAML::load(File.open(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")))
261
247
  end
262
248
 
263
- def self.GDS_info(name)
264
- begin
265
- title, description = Open.read(dataset_path(name) + '.description').split(/\n--\n/).values_at(0,1)
266
- {:title => title.strip, :description => description.strip}
267
- rescue Exception
268
- puts $!.message
269
- {:title => "" , :description => "" }
270
- end
271
249
 
272
- end
250
+ #{{{ Process
273
251
 
252
+ # Use R to load and process the datasets
253
+ module Process
274
254
 
275
- #{{{ Misc Info
255
+ # R library wrapper
256
+ module R
257
+ @@r = nil
276
258
 
277
- def self.clean(name)
278
- name.sub(/_cross_platform/,'') if name
279
- end
259
+ # Get the R instance
260
+ def self.r
261
+ if @@r.nil?
280
262
 
281
- def self.platform_path(platform)
282
- File.join(MARQ.datadir, "GEO/#{clean(platform)}")
283
- end
263
+ # FIXME: RSruby does not install very well, so this require is hidden here.
264
+ require 'rsruby'
284
265
 
285
- def self.dataset_path(dataset, platform = nil)
286
- if platform
287
- return Dir.glob(File.join(platform_path(clean(platform)),"/*/#{ dataset }")).first.match(/(.*)\./)[1]
288
- else
289
- files = Dir.glob(File.join(MARQ.datadir, "GEO/GPL*/*/#{ dataset }.*"))
290
- if files.any?
291
- return files.first.match(/(.*)\./)[1]
292
- else
293
- return ""
266
+ RSRuby.instance.source(MARQ.rootdir + '/R/MA.R')
267
+ RSRuby.instance.source(MARQ.rootdir + '/R/GEO.R')
268
+ RSRuby.instance.source(MARQ.rootdir + '/R/GEOquery_patch.R')
269
+ @@r = RSRuby.instance
270
+ end
271
+ @@r
294
272
  end
295
- end
296
- end
297
273
 
298
- def self.is_cross_platform?(dataset)
299
- dataset =~ /_cross_platform/
300
- end
274
+ # Use R to load GPL info
275
+ def self.GPL(name, prefix, id_field = nil)
276
+ r.GEO_GPL_process(name, prefix, id_field, CACHE_DIR)
277
+ end
301
278
 
302
- def self.has_cross_platform?(dataset = nil, platform = nil)
303
- platform = clean(platform)
304
- raise "Dataset #{ dataset } not found" if dataset && dataset_path(dataset, platform).nil?
305
- raise "Platform #{ platform } not found" if platform && platform_path(platform).nil?
306
- if dataset
307
- File.exists?(dataset_path(dataset, platform) + "_cross_platform.orders")
308
- else
309
- Dir.glob(File.join(platform_path(platform), '*', '*_cross_platform.orders')).any?
310
- end
311
- end
279
+ # Use R to load and process the dataset
280
+ def self.GDS(name, prefix, id_field = nil, id_file = nil)
281
+ r.GEO_GDS_process(name, prefix, id_field, id_file, CACHE_DIR)
282
+ end
312
283
 
284
+ # Use R to load and process the series
285
+ def self.GSE(gsms, conditions, do_log, prefix, id_file = nil, fields= nil, title = nil, description = nil)
286
+ r.GEO_GSE_process(gsms, conditions, prefix, do_log, id_file, fields, title, description, CACHE_DIR)
287
+ end
288
+ end
313
289
 
314
- def self.platform_datasets(platform)
315
- Dir.glob(File.join(platform_path(platform),"*/*.orders")).collect{|f| File.basename(f).sub(/.orders$/,'')}.select{|d| !is_cross_platform?(d)}
316
- end
290
+ def self.translate(org, list)
291
+ begin
292
+ ID.translate_DB(org, list)
293
+ rescue
294
+ puts "DB translation failed, resorting to index"
295
+ ID.translate_index(org, list)
296
+ end
297
+ end
317
298
 
318
- def self.dataset_platform(dataset)
319
- dataset_path(dataset).match(/(GPL\d+)/)
320
- $1
321
- end
299
+ # Rearrange the lines of a file with the given order. The order specifies, for
300
+ # each position in the original file, where it should end up in the final file
301
+ def self.rearange(order, file, missing = "NA")
302
+ orig_lines = []
303
+ File.open(file).each_line{|l| orig_lines << l}
322
304
 
323
- def self.organism_platforms(organism)
324
- Dir.glob(File.join(MARQ.datadir, "GEO/GPL*")).collect{|f|
325
- File.basename(f)
326
- }.select{|platform|
327
- GPL_info(platform)[:organism] == organism &&
328
- platform_datasets(platform).any?
329
- }
330
- end
305
+ return if orig_lines.empty?
306
+ columns = orig_lines.first.split(/\t/).length
331
307
 
332
- #{{{ Processing
308
+ lines = Array.new(order.length)
333
309
 
334
- def self.process_GDS(dataset, platform, field = nil)
335
- puts "Processing GDS #{ dataset }. Platform #{ platform }"
310
+ orig_lines.each_with_index{|l,i|
311
+ next if order[i].nil?
312
+ lines[order[i]] = l.chomp
313
+ }
336
314
 
337
- puts "-- Original"
338
- prefix = File.join(platform_path(platform), 'GDS', dataset.to_s)
339
- GEO.get_GDS(dataset, prefix, field, nil)
315
+ lines = lines.collect{|l| l || [missing]*columns*"\t"}
340
316
 
341
- # Was there an error?
342
- if File.exist?(prefix + '.skip')
343
- FileUtils.cp(prefix + '.skip', prefix + '_cross_platform.skip')
344
- return
317
+ fout = File.open(file, 'w')
318
+ fout.puts(lines.join("\n"))
319
+ fout.close
345
320
  end
346
321
 
347
- if File.exist?(File.join(platform,'cross_platform'))
348
- puts "-- Translated to cross_platform format"
349
- GEO.get_GDS(dataset, prefix + '_cross_platform', field, File.join(platform_path(platform), 'translations'))
322
+ # Fix possible discrepancies in ids between series and platforms
323
+ def self.fix_GSE_ids(platform_codes_file, prefix)
324
+ platform_codes = File.open(platform_codes_file).collect{|l| l.chomp}
325
+ platform_order = {}
326
+
327
+ platform_codes.each_with_index{|code, i|
328
+ platform_order[code] = i
329
+ }
330
+
331
+ series_codes = File.open(prefix + '.codes').collect{|l| l.chomp}
332
+
333
+ platform_positions = platform_order.values_at(*series_codes)
334
+
335
+ # Fill with nil for missing positions
336
+ platform_positions[platform_codes.length - 1] ||= nil
337
+
338
+ %w(t logratios orders pvalues).each{|ext|
339
+ rearange(platform_positions, prefix + '.' + ext)
340
+ }
341
+
342
+ Open.write(prefix + '.swap', platform_positions.join("\n"))
350
343
  end
351
- end
352
344
 
353
- # Rearrange the lines of a file with the given order. The order specifies, for
354
- # each position in the original file, where it should end up in the final file
355
- def self.rearange(order, file, missing = "NA")
356
- orig_lines = []
357
- File.open(file).each_line{|l| orig_lines << l}
358
345
 
359
- return if orig_lines.empty?
360
- columns = orig_lines.first.split(/\t/).length
361
-
362
- lines = Array.new(order.length)
346
+ # Process a dataset. Need to specify the platform. The field parameter can
347
+ # be used to use a different column for the field.
348
+ #
349
+ # Deprecated in favor of using the original first column and using a
350
+ # different one only for translation
351
+ def self.GDS(dataset, platform, field = nil)
352
+ puts "Processing GDS #{ dataset }. Platform #{ platform }"
353
+ platform_path = GEO.platform_path(platform)
363
354
 
364
- orig_lines.each_with_index{|l,i|
365
- next if order[i].nil?
366
- lines[order[i]] = l.chomp
367
- }
355
+ puts "-- Original"
356
+ prefix = File.join(platform_path, 'GDS', dataset.to_s)
357
+ R.GDS(dataset, prefix, field, nil)
368
358
 
369
- lines = lines.collect{|l| l || [missing]*columns*"\t"}
359
+ # Was there an error?
360
+ if File.exist?(prefix + '.skip')
361
+ FileUtils.cp(prefix + '.skip', prefix + '_cross_platform.skip')
362
+ return
363
+ end
370
364
 
371
- fout = File.open(file, 'w')
372
- fout.puts(lines.join("\n"))
373
- fout.close
374
- end
365
+ if File.exist?(File.join(platform,'cross_platform'))
366
+ puts "-- Translated to cross_platform format"
367
+ R.GDS(dataset, prefix + '_cross_platform', field, File.join(platform_path, 'translations'))
368
+ end
369
+ end
375
370
 
376
- # Fix possible discrepancies in ids between series and platforms
377
- def self.fix_GSE_ids(platform_codes_file, prefix)
378
- platform_codes = File.open(platform_codes_file).collect{|l| l.chomp}
379
- platform_order = {}
380
-
381
- platform_codes.each_with_index{|code, i|
382
- platform_order[code] = i
383
- }
371
+ # Process a series. The info parameter is a hash with the :arrays,
372
+ # :platform, :log2 and :fields keys
373
+ def self.GSE(series, info)
374
+ return if Dir.glob(File.join(info[:platform], 'GSE', series) + '.*').any?
384
375
 
385
- series_codes = File.open(prefix + '.codes').collect{|l| l.chomp}
386
376
 
387
- platform_positions = platform_order.values_at(*series_codes)
377
+ gsms = []
378
+ conditions = {}
379
+ info[:arrays].each{|gsm, cond|
380
+ gsms << gsm
381
+ cond.each{|condition, value|
382
+ conditions[condition] ||= []
383
+ conditions[condition] << value
384
+ }
385
+ }
386
+ platform = info[:platform]
387
+ do_log = nil
388
+ do_log = !info[:log2] if info[:log2]
389
+ fields = info[:fields]
390
+
391
+ puts "Processing GSE #{ series }. Platform #{ platform }"
392
+
393
+ platform_path = GEO::platform_path(platform)
394
+ prefix = File.join(platform_path, 'GSE', series.to_s)
395
+ puts "-- Original"
396
+ R.GSE(gsms, conditions, do_log, prefix, nil, fields, info[:title], info[:description])
397
+
398
+ # Was there an error?
399
+ if File.exist?(prefix + '.skip')
400
+ FileUtils.cp(prefix + '.skip', prefix + '_cross_platform.skip')
401
+ return
402
+ end
388
403
 
389
- # Fill with nil for missing positions
390
- platform_positions[platform_codes.length - 1] ||= nil
404
+ if platform =~ /_/
405
+ FileUtils.cp(prefix + '.codes', File.join(platform_path,'codes'))
406
+ codes = Open.read(File.join(platform_path, 'codes')).collect{|l| l.chomp}
407
+ organism = SOFT::GPL(platform.match(/(.*?)_/)[1])[:organism]
408
+ translations = translate(organism, codes)
409
+ Open.write(File.join(platform_path, 'translations'), translations.collect{|v| v || "NO MATCH"}.join("\n"))
410
+ Open.write(File.join(platform_path, 'cross_platform'), translations.compact.sort.uniq.join("\n"))
411
+ else
412
+ # Are the codes of the series equivalent to the ones in the platform?
413
+ if File.open(File.join(platform_path,'codes')).collect{|l| l.chomp} != File.open(prefix + '.codes').collect{|l| l.chomp}
414
+ fix_GSE_ids(File.join(platform_path, 'codes'),prefix);
415
+ FileUtils.cp(File.join(platform_path, 'codes'),prefix + '.codes')
416
+ end
417
+ end
391
418
 
392
- %w(t logratios orders pvalues).each{|ext|
393
- rearange(platform_positions, prefix + '.' + ext)
394
- }
395
419
 
396
- Open.write(prefix + '.swap', platform_positions.join("\n"))
397
- end
420
+ if File.exist?(File.join(platform,'translations'))
421
+ FileUtils.cp(File.join(platform,'translations'), prefix + '.translations')
422
+ if File.exist?(prefix + '.swap')
423
+ orders = Open.read(prefix + '.swap').collect{|l| l.chomp}
424
+ inverse_orders = Array.new(orders.length)
425
+ orders.each_with_index{|pos,i|
426
+ next if pos !~ /\d/
427
+ inverse_orders[pos.to_i] = i
428
+ }
429
+ rearange(inverse_orders, prefix + '.translations', "NO MATCH")
430
+ end
431
+ puts "-- Translated to cross_platform format"
432
+ R.GSE(gsms, conditions, do_log, prefix + '_cross_platform', prefix + '.translations',fields, info[:title], info[:description])
433
+ fix_GSE_ids(File.join(platform_path, 'cross_platform'),prefix + '_cross_platform');
434
+ FileUtils.cp(File.join(platform_path, 'cross_platform'),prefix + '_cross_platform.codes')
435
+ FileUtils.rm(prefix + '.translations') if File.exist?(prefix + '.translations')
436
+ end
437
+ FileUtils.rm(prefix + '.swap') if File.exist?(prefix + '.swap')
438
+ end
439
+
440
+ # Load GPL data. Translates IDS of the platform probes using AILUN and our
441
+ # system (called biomart for clarity)
442
+ def self.GPL(platform)
443
+ path = GEO::platform_path(platform)
444
+ return if File.exist? path
445
+
446
+ if platform =~ /_/
447
+ FileUtils.mkdir(path)
448
+ FileUtils.mkdir(path + '/GSE')
449
+ FileUtils.mkdir(path + '/GDS')
450
+ return
451
+ end
398
452
 
453
+ info = SOFT.GPL(platform)
454
+ organism = info[:organism]
399
455
 
456
+ field = info[:other_ID_field]
457
+ id = info[:BioMart_ID]
458
+ org = info[:organism]
459
+ field = nil if field == ""
460
+ id = nil if id == ""
400
461
 
401
- def self.process_GSE(series, info)
402
- return if Dir.glob(File.join(info[:platform], 'GSE', series) + '.*').any?
403
462
 
404
- gsms = []
405
- conditions = {}
406
- info[:arrays].each{|gsm, cond|
407
- gsms << gsm
408
- cond.each{|condition, value|
409
- conditions[condition] ||= []
410
- conditions[condition] << value
463
+ puts "Processing Platform #{ platform }"
464
+ [platform,
465
+ File.join(path, 'GDS'),
466
+ File.join(path, 'GSE'),
467
+ ].each{|d|
468
+ FileUtils.mkdir d unless File.exist? d
411
469
  }
412
- }
413
- platform = info[:platform]
414
- do_log = nil
415
- do_log = !info[:log2] if info[:log2]
416
- fields = info[:fields]
417
470
 
418
- puts "Processing GSE #{ series }. Platform #{ platform }"
471
+ R.GPL(platform, path, nil)
472
+ FileUtils.mv path + '.codes', File.join(path, 'codes')
419
473
 
420
- prefix = File.join(platform_path(platform), 'GSE', series.to_s)
421
- puts "-- Original"
422
- GEO.get_GSE(gsms, conditions, do_log, prefix, nil, fields, info[:title], info[:description])
423
474
 
424
- # Was there an error?
425
- if File.exist?(prefix + '.skip')
426
- FileUtils.cp(prefix + '.skip', prefix + '_cross_platform.skip')
427
- return
428
- end
475
+ # AILUN translations
476
+ codes = Open.read(File.join(path, 'codes')).collect{|l| l.chomp}
477
+ ailun = ID.AILUN_translate(platform, codes)
478
+ Open.write(File.join(path, 'ailun'), ailun.collect{|v| v || "NO MATCH"}.join("\n")) if ailun.compact.length > codes.length.to_f / 10
429
479
 
430
- if platform =~ /_/
431
- FileUtils.cp(prefix + '.codes', File.join(platform_path(platform),'codes'))
432
- codes = Open.read(File.join(platform_path(platform), 'codes')).collect{|l| l.chomp}
433
- organism = GEO::GPL_info(platform.match(/(.*?)_/)[1])[:organism]
434
- translations = ID.translate(organism, codes)
435
- Open.write(File.join(platform_path(platform), 'translations'), translations.collect{|v| v || "NO MATCH"}.join("\n"))
436
- Open.write(File.join(platform_path(platform), 'cross_platform'), translations.compact.sort.uniq.join("\n"))
437
- else
438
- # Are the codes of the series equivalent to the ones in the platform?
439
- if File.open(File.join(platform_path(platform),'codes')).collect{|l| l.chomp} != File.open(prefix + '.codes').collect{|l| l.chomp}
440
- fix_GSE_ids(File.join(platform_path(platform), 'codes'),prefix);
441
- FileUtils.cp(File.join(platform_path(platform), 'codes'),prefix + '.codes')
480
+ # BioMart translations
481
+ biomart = []
482
+ if id || field
483
+ if id
484
+ codes = Open.read(File.join(path, 'codes')).collect{|l| l.chomp}
485
+ else
486
+ if field
487
+ R.GPL(platform, path, field[0])
488
+ FileUtils.mv path + '.codes', File.join(path, 'other')
489
+ end
442
490
 
491
+ fix = GEO::SOFT::ID_FIX[(organism + "_" + field[1].downcase).to_sym]
492
+ codes = Open.read(File.join(path, 'other')).collect{|l|
493
+ code = l.chomp
494
+ code = fix.call(code) if fix
495
+ code
496
+ }
497
+ end
498
+
499
+ biomart = translate(organism, codes)
500
+ Open.write(File.join(path, 'biomart'), biomart.collect{|v| v || "NO MATCH"}.join("\n")) if biomart.compact.length > codes.length.to_f / 10
443
501
  end
444
- end
445
502
 
503
+ # Select Best and save
504
+ translations = []
505
+ if ailun.compact.uniq.length > biomart.compact.uniq.length
506
+ id_type = ID::DEFAULT_FORMATS[organism] || ID::DEFAULT_FORMAT_ALL || id || field || "Entrez Gene Id"
507
+ if id_type.to_s !~ /Entrez/i
508
+ translations = translate(org,ailun.collect{|gene| gene || "NO MATCH"})
509
+ else
510
+ translations = ailun
511
+ end
512
+ else
513
+ translations = biomart
514
+ end
446
515
 
447
- if File.exist?(File.join(platform,'translations'))
448
- FileUtils.cp(File.join(platform,'translations'), prefix + '.translations')
449
- if File.exist?(prefix + '.swap')
450
- orders = Open.read(prefix + '.swap').collect{|l| l.chomp}
451
- inverse_orders = Array.new(orders.length)
452
- orders.each_with_index{|pos,i|
453
- next if pos !~ /\d/
454
- inverse_orders[pos.to_i] = i
455
- }
456
- rearange(inverse_orders, prefix + '.translations', "NO MATCH")
516
+ if translations.compact.length > codes.length.to_f / 10
517
+ Open.write(File.join(path, 'translations'), translations.collect{|v| v || "NO MATCH"}.join("\n"))
518
+ Open.write(File.join(path, 'cross_platform'), translations.compact.sort.uniq.join("\n"))
457
519
  end
458
- puts "-- Translated to cross_platform format"
459
- GEO.get_GSE(gsms, conditions, do_log, prefix + '_cross_platform', prefix + '.translations',fields, info[:title], info[:description])
460
- fix_GSE_ids(File.join(platform_path(platform), 'cross_platform'),prefix + '_cross_platform');
461
- FileUtils.cp(File.join(platform_path(platform), 'cross_platform'),prefix + '_cross_platform.codes')
462
- FileUtils.rm(prefix + '.translations') if File.exist?(prefix + '.translations')
520
+
463
521
  end
464
- FileUtils.rm(prefix + '.swap') if File.exist?(prefix + '.swap')
522
+
465
523
  end
466
524
 
467
- def self.process_platform(platform)
468
- path = platform_path(platform)
469
- return if File.exist? path
470
525
 
471
- if platform =~ /_/
472
- FileUtils.mkdir(path)
473
- FileUtils.mkdir(path + '/GSE')
474
- FileUtils.mkdir(path + '/GDS')
475
- return
476
- end
477
526
 
478
- info = GEO::GPL_info(platform)
479
- organism = info[:organism]
480
-
481
- field = info[:other_ID_field]
482
- id = info[:BioMart_ID]
483
- org = info[:organism]
484
- field = nil if field == ""
485
- id = nil if id == ""
486
-
487
-
488
- puts "Processing Platform #{ platform }"
489
- [platform,
490
- File.join(platform_path(platform), 'GDS'),
491
- File.join(platform_path(platform), 'GSE'),
492
- ].each{|d|
493
- FileUtils.mkdir d unless File.exist? d
494
- }
527
+ #{{{ Local data store info
495
528
 
496
- get_GPL(platform, platform_path(platform), nil)
497
- FileUtils.mv platform_path(platform) + '.codes', File.join(platform_path(platform), 'codes')
498
-
529
# Strips the first "_cross_platform" marker from a dataset or platform name.
#
# name:: a String name, or nil/false.
#
# Returns a new String without the first "_cross_platform" occurrence, or
# nil when +name+ is nil or false.
def self.clean(name)
  return nil unless name

  name.sub(/_cross_platform/, '')
end
499
532
 
500
- # AILUN translations
501
- codes = Open.read(File.join(platform_path(platform), 'codes')).collect{|l| l.chomp}
502
- ailun = ID.AILUN_translate(platform, codes)
503
- Open.write(File.join(platform_path(platform), 'ailun'), ailun.collect{|v| v || "NO MATCH"}.join("\n")) if ailun.compact.length > codes.length.to_f / 10
504
533
 
505
- # BioMart translations
506
- biomart = []
507
- if id || field
508
- if id
509
- codes = Open.read(File.join(platform_path(platform), 'codes')).collect{|l| l.chomp}
510
- else
511
- if field
512
- get_GPL(platform, platform_path(platform), field[0])
513
- FileUtils.mv platform_path(platform) + '.codes', File.join(platform_path(platform), 'other')
514
- end
534
# Builds the path of a platform's directory in the local data store.
#
# platform:: platform name; any "_cross_platform" marker is removed with
#            +clean+ before the path is built.
#
# Returns the String "<MARQ.datadir>/GEO/<cleaned platform>". The directory
# is not checked for existence.
def self.platform_path(platform)
  data_root = MARQ.datadir
  File.join(data_root, "GEO/#{clean(platform)}")
end
515
537
 
516
- fix = ID_FIX[(organism + "_" + field[1].downcase).to_sym]
517
- codes = Open.read(File.join(platform_path(platform), 'other')).collect{|l|
518
- code = l.chomp
519
- code = fix.call(code) if fix
520
- code
521
- }
522
- end
523
538
 
524
- biomart = ID.translate(organism, codes)
525
- Open.write(File.join(platform_path(platform), 'biomart'), biomart.collect{|v| v || "NO MATCH"}.join("\n")) if biomart.compact.length > codes.length.to_f / 10
526
- end
539
# Tells whether a dataset name carries the "_cross_platform" marker.
#
# dataset:: dataset name to inspect.
#
# Returns the Integer offset at which the marker starts (truthy), or nil
# when the marker is absent.
def self.is_cross_platform?(dataset)
  marker = /_cross_platform/
  dataset =~ marker
end
527
542
 
528
- # Select Best and save
529
- translations = []
530
- if ailun.compact.uniq.length > biomart.compact.uniq.length
531
- id_type = ID::DEFAULT_FORMATS[organism] || ID::DEFAULT_FORMAT_ALL || id || field || "Entrez Gene Id"
532
- if id_type.to_s !~ /Entrez/i
533
- translations = ID.translate(org,ailun.collect{|gene| gene || "NO MATCH"})
534
- else
535
- translations = ailun
536
- end
543
# Tells whether cross-platform data exists in the local data store.
#
# dataset::  optional dataset name. When given, the check is whether the file
#            "<dataset path>_cross_platform.orders" exists.
# platform:: optional platform name; a "_cross_platform" marker is removed
#            with +clean+ first. When no dataset is given, the check is
#            whether any "*_cross_platform.orders" file exists one directory
#            below the platform's path.
#
# Returns true or false.
# Raises RuntimeError ("Dataset ... not found") when +dataset+ is given but
# +dataset_path+ cannot locate it, and RuntimeError ("Platform ... not
# found") when +platform_path+ yields nil for a given platform.
def self.has_cross_platform?(dataset = nil, platform = nil)
  platform = clean(platform)

  # Resolve the dataset once and reuse the result for the check below.
  path = dataset ? dataset_path(dataset, platform) : nil

  raise "Dataset #{ dataset } not found" if dataset && path.nil?
  raise "Platform #{ platform } not found" if platform && platform_path(platform).nil?

  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  return File.exist?(path + "_cross_platform.orders") if dataset

  Dir.glob(File.join(platform_path(platform), '*', '*_cross_platform.orders')).any?
end
540
553
 
541
- if translations.compact.length > codes.length.to_f / 10
542
- Open.write(File.join(platform_path(platform), 'translations'), translations.collect{|v| v || "NO MATCH"}.join("\n"))
543
- Open.write(File.join(platform_path(platform), 'cross_platform'), translations.compact.sort.uniq.join("\n"))
554
# Locates a dataset in the local data store.
#
# dataset::  dataset name; files named "<dataset>.<something>" are searched.
# platform:: optional platform name (a "_cross_platform" marker is removed
#            with +clean+). When given, only that platform's directory is
#            searched; otherwise every "GEO/GPL*" directory under
#            MARQ.datadir is.
#
# Returns the path of the first matching file with everything from its last
# dot removed (i.e. the dataset's path prefix), or nil when no file matches.
def self.dataset_path(dataset, platform = nil)
  base = platform ? platform_path(clean(platform)) : File.join(MARQ.datadir, "GEO/GPL*")

  # Both branches look one directory below the platform for "<dataset>.*".
  # The platform branch used to search for a file named exactly +dataset+,
  # which never matches the "<dataset>.<ext>" files the other branch expects.
  files = Dir.glob(File.join(base, "*/#{ dataset }.*"))
  return nil if files.empty?

  # Greedy match: keep everything before the last dot of the path.
  files.first[/(.*)\./, 1]
end
545
563
 
564
# Lists the locally stored platforms of an organism that have datasets.
#
# organism:: value compared with the :organism entry of SOFT.GPL(platform).
#
# Returns an Array with the names of the "GEO/GPL*" entries under
# MARQ.datadir whose organism equals +organism+ and for which
# +platform_datasets+ returns at least one dataset.
def self.organism_platforms(organism)
  stored = Dir.glob(File.join(MARQ.datadir, "GEO/GPL*")).collect { |dir| File.basename(dir) }

  stored.select do |platform|
    # Organism is checked first; datasets are only listed for matching platforms.
    next false unless SOFT.GPL(platform)[:organism] == organism

    platform_datasets(platform).any?
  end
end
547
572
 
548
573
 
549
- def self.process_platform_datasets(platform, force = false)
550
- raise "Platform #{ platform } not ready" unless File.exist? platform_path(platform)
551
574
 
552
- info = YAML::load(File.open(File.join(MARQ.datadir, "GEO/platforms/#{platform}.yaml")))
575
# Lists the datasets stored locally for a platform.
#
# platform:: platform name, resolved to a directory with +platform_path+.
#
# Returns an Array with the names of the "*.orders" files found one
# directory below the platform path, with the ".orders" suffix removed and
# the names flagged by +is_cross_platform?+ left out.
def self.platform_datasets(platform)
  order_files = Dir.glob(File.join(platform_path(platform),"*/*.orders"))
  names = order_files.collect { |file| File.basename(file).sub(/.orders$/,'') }
  names.reject { |name| is_cross_platform?(name) }
end
553
578
 
554
- datasets = GEO::Eutils::GPL_datasets(platform)
555
- datasets.each{|dataset|
556
- next if Dir.glob(File.join(platform_path(platform), 'GDS', dataset) + '.*').any? && ! force
557
- process_GDS(dataset, platform, nil)
558
- }
579
# Finds the platform a locally stored dataset belongs to.
#
# dataset:: dataset name, located with +dataset_path+.
#
# Returns the first platform identifier found in the dataset's path. Merged
# platform names ("GPL920_GPL927") are returned whole rather than truncated
# to their first component. Returns nil when the dataset cannot be located
# or its path holds no platform identifier.
def self.dataset_platform(dataset)
  path = dataset_path(dataset)
  return nil if path.nil?

  # One or more GPL ids joined by "_"; no reliance on the $1 global.
  path[/GPL\d+(?:_GPL\d+)*/]
end
560
583
 
584
# Reads the title and description stored for a local dataset.
#
# name:: dataset name; its "<dataset path>.description" file is read with
#        Open.read and split on a line containing only "--".
#
# Returns a Hash with :title and :description, both stripped Strings. A
# missing part is returned as "" (so a file without the "--" separator still
# yields its title). When reading fails with a StandardError, the error
# message is printed and both values are "".
#
# Only StandardError is rescued, so Interrupt and SystemExit propagate
# instead of being silently swallowed.
def self.GDS_info(name)
  content = Open.read(dataset_path(name) + '.description')
  title, description = content.split(/\n--\n/).values_at(0, 1)

  # to_s guards against a missing title or description (nil).
  { :title => title.to_s.strip, :description => description.to_s.strip }
rescue StandardError => e
  puts e.message
  { :title => "", :description => "" }
end
594
+
595
+
561
596
  end
562
597
 
563
- if __FILE__ == $0
564
598
 
565
- p GEO.GPL_info('GPL920_GPL927')
566
- p GEO.GPL_id_fields('GPL920')
567
- puts GEO.GSE_info('GSE962')
568
- puts GEO.GSE_info('GSE8982')
569
- puts GEO::Eutils.GSE_dataset?('GSE8982')
570
- puts GEO::Eutils.GSE_dataset?('GSE962')
571
-
572
- exit
573
-
574
- #puts GEO::dataset_path('GDS1103').inspect
575
- #puts GEO::dataset_platform('GDS1103').inspect
576
-
577
- # puts GEO.dataset_path('GDS2931')
578
- # puts GEO.platform_datasets('GPL91')
579
- # puts GEO.platform_datasets('GPL91').select{|d| GEO.has_cross_platform?(d)}
580
- #
581
- # gpls = Open.read('ftp://ftp.ncbi.nih.gov/pub/geo/DATA/supplementary/PLATFORMS.txt').collect{|l|
582
- # l.chomp.split.first
583
- # }
584
- #
585
- # %w(GPL85).each{|gpl|
586
- # puts gpl
587
- # puts GEO::GPL_info(gpl).inspect if gpl =~ /GPL/
588
- # }
589
- #
590
- #puts GEO::GSM_info('GSM70604').inspect
591
-
592
- p GEO::Eutils.organism_platforms('human')
599
# Script entry point: nothing is executed when this file is run directly.
if __FILE__ == $PROGRAM_NAME
end
602
+