rbbt-marq 1.0.9 → 1.1.0
- data/R/GEO.R +68 -124
- data/R/GEOquery_patch.R +44 -0
- data/R/MARQ.R +76 -0
- data/install_scripts/GEO/Rakefile +111 -74
- data/lib/MARQ/GEO.rb +459 -451
- data/lib/MARQ/ID.rb +1 -0
- data/lib/MARQ/main.rb +82 -2
- metadata +13 -2
data/lib/MARQ/GEO.rb
CHANGED
@@ -1,594 +1,602 @@
 require 'MARQ'
-require 'rbbt/util/open'
 require 'rbbt/sources/organism'

+# Work with GEO datasets
 module GEO

-
-
-
-GEO_SOFT="http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&view=full&form=text&acc="
-def self.get_soft(item)
-item = item.strip
-cache_file = File.join(CACHE_DIR, item + '.soft')
-if File.exist?( cache_file )
-File.open(cache_file).read
-else
-content = Open.read(GEO_SOFT + item, :nocache => true)
-fout = File.open(cache_file,'w')
-fout.write content
-fout.close
-content
-end
-end
+# Get information from Entrez
+module Remote

-#{{{ Eutils
-module Eutils
 def self.organism_platforms(org)
 name = Organism.name(org)
 Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=\"#{name}\"[Organism:exp]+AND+%22gpl%22[Filter]&retmax=10000").
 scan(/<Id>(\d+?)<\/Id>/).collect{|id| id.first}.collect{|id| "GPL#{id.sub(/^100*/,'')}"}
 end

-def self.
+def self.platform_datasets(platform)
 Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=#{platform}[Accession]&retmax=2000").
 scan(/<Id>(\d+?)<\/Id>/).collect{|id| id.first}.select{|id| !id.match(/^(1|2)000/) }.collect{|id| "GDS#{id}"}
 end

-def self.
+def self.dataset_platform(dataset)
+if dataset =~ /GSE/
+Open.read("http://www.ncbi.nlm.nih.gov/projects/geo/query/acc.cgi?acc=#{dataset}").scan(/GPL\d+/).uniq.sort.join("_")
+else
+Open.read("http://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=#{dataset}").scan(/GPL\d+/).uniq.sort.join("_")
+end
+end
+
+def self.series_dataset?(gse)
 Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=geo&term=#{gse}[Accession]&retmax=2000").
 match(/<Id>(\d+?)<\/Id>/) != nil
 end

 end

+CACHE_DIR = File.join(MARQ.cachedir,'GEO')
+FileUtils.mkdir_p CACHE_DIR unless File.exists? CACHE_DIR


-#
-
+# Parse information in .soft files
+module SOFT

-
-ids.collect{|id| id.to_i}.sort[0..19] == (1..20).to_a
-end
+GEO_SOFT="http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&view=full&form=text&acc="

-
-
-
+# Download a soft file. Uses cache
+def self.get_soft(item)
+item = item.strip
+cache_file = File.join(CACHE_DIR, item + '.soft')
+if File.exist?( cache_file )
+File.open(cache_file).read
+else
+content = Open.read(GEO_SOFT + item, :nocache => true)
+raise "SOFT file error" if content !~ /!/
+fout = File.open(cache_file,'w')
+fout.write content
+fout.close
+content
+end
+end

-
-ids.compact.select{|id| ! id.strip.match(/^[ATCG]+$/i)}.empty?
-end
+#{{{ Guess the format of the IDS

+@@formats = {}

-
-
-
-
+ID_FIX = {
+:mgi_unigene => proc{|gene| if gene then gene.match(/^Mm./) ? gene : "Mm." + gene end},
+:human_unigene => proc{|gene| if gene then gene.match(/^Hs./) ? gene : "Hs." + gene end},
+}

-
-
-
-if consecutive?(genes) || dna_sequence?(genes) || (numerical?(genes) && (name.nil? || !name.match(/entrez/i)))
-id = nil
-else
-fix = ID_FIX[(org + "_" + name.downcase).to_sym] if name
-if fix
-genes = genes.collect{|gene| fix.call(gene)}
-end
-id = Organism.guessIdFormat(@@formats[org], genes)
+# Id list is in sequence
+def self.consecutive?(ids)
+ids.collect{|id| id.to_i}.sort[0..19] == (1..20).to_a
 end
-
-id
-end
-
-@@r = nil
-def self.r
-if @@r.nil?

-
-
+# Id list is numerical
+def self.numerical?(ids)
+ids.compact.select{|id| ! id.match(/^\d+$/)}.uniq.length < ids.length.to_f / 10
+end

-
-
-
+# ID are DNA bases
+def self.dna_sequence?(ids)
+ids.compact.select{|id| ! id.strip.match(/^[ATCG]+$/i)}.empty?
 end
-@@r
-end

+# Guess the format of the id in the list. The name parameter can be used to
+# identify some exceptions
+def self.guessIds(genes,org, name = nil)
+@@formats[org] ||= Organism.id_formats(org)
+if consecutive?(genes) || dna_sequence?(genes) || (numerical?(genes) && (name.nil? || !name.match(/entrez/i)))
+id = nil
+else
+fix = ID_FIX[(org + "_" + name.downcase).to_sym] if name
+if fix
+genes = genes.collect{|gene| fix.call(gene)}
+end
+id = Organism.guessIdFormat(@@formats[org], genes)
+end

-
+id
+end

-def self.get_GPL(name, prefix, id_field = nil)
-r.GEO_GPL_process(name, prefix, id_field, CACHE_DIR)
-end

-def self.get_GDS(name, prefix, id_field = nil, id_file = nil)
-r.GEO_GDS_process(name, prefix, id_field, id_file, CACHE_DIR)
-end

-
-
-end
+def self.GSE(series)
+soft = get_soft(series)

-
-
-
+if match = soft.scan(/!Series_platform_id\s*=?\s*(.*)/)
+platform = match.flatten.collect{|p| p.strip}
+else
+raise "No Platform information"
+end

-
-
-
-
-
+if soft.match(/!Series_title \s*=?\s*(.*)/)
+title = $1
+else
+raise "No Title information"
+end

-
-
-
-
-
+if soft.match(/!Series_summary \s*=?\s*(.*)/)
+matches = soft.scan(/!Series_summary \s*=?\s*(.*)/).to_a
+description = matches.collect{|m| m.to_s.strip.sub(/!Series_summary \s*=?\s*/,'')}.join("\n")
+else
+raise "No Summary information"
+end

-
-
-
-
-
-
+if soft.match(/!Series_sample_id \s*=?\s*(.*)/)
+matches = soft.scan(/!Series_sample_id \s*=?\s*(.*)/).to_a
+samples = matches.collect{|m| m.to_s.strip.sub(/!Series_sample_id \s*=?\s*/,'')}
+else
+raise "No Summary information"
+end

-
-
-
-
-
+{
+:platform => platform.join("_"),
+:description =>description.strip,
+:title => title.strip,
+:samples => samples,
+}
 end

-
-
-:description =>description.strip,
-:title => title.strip,
-:samples => samples,
-}
-end
+def self.GSM(array)
+soft = get_soft(array)

-
-
+if soft.match(/!Sample_title\s*=?\s*(.*)/)
+title = $1
+else
+raise "No Title information"
+end

-if soft.match(/!Sample_title\s*=?\s*(.*)/)
-title = $1
-else
-raise "No Title information"
-end

+if soft.match(/!Sample_description \s*=?\s*(.*)/)
+description = $1
+else
+raise "No Description information"
+end

-
-
-
-
+{
+
+:description =>description.strip,
+:title => title.strip,
+}
 end

-
-
-
-
-
-
+def self.GPL(platform)
+if !File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")) &&
+!File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
+begin
+if platform =~ /_/
+organism = GPL(platform.match(/(.*?)_/)[1])[:organism]

-
-
-
-
-
-
-
-def self.GPL_info(platform)
-if !File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")) &&
-!File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
-begin
-if platform =~ /_/
-organism = GPL_info(platform.match(/(.*?)_/)[1])[:organism]
-
-info = {
-:organism => organism,
-:title => "Merged platforms #{ platform }",
-}
-return info
-end
-soft = get_soft(platform)
+info = {
+:organism => organism,
+:title => "Merged platforms #{ platform }",
+}
+return info
+end
+soft = get_soft(platform)


-
+raise "SOFT file error" if soft !~ /!/

-
+organisms = soft.scan(/!Platform_organism\s*=\s*(.*)/).collect{|v| v.first.strip}

-
-
-
-
-
-
-
+if organisms.empty?
+raise "No Organism information"
+else
+# This might happen actually GPL2529
+organisms.delete('Schizosaccharomyces pombe') if organisms.include?('Saccharomyces cerevisiae')
+org_name = organisms.first
+end


-
-
-
-
+title = ""
+if soft.match(/!Platform_title\s*=\s*(.*)/)
+title = $1
+end

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+org = Organism.name2org(org_name)
+raise "Organism not identified: #{org_name}" if org.nil?
+
+if soft.match(/!platform_table_begin/)
+data = soft.split(/!platform_table_begin/s)[1].collect{|l| l.chomp.split(/\t/)}
+data.shift
+names = data.shift
+total = data.first.length
+genes = data.sort_by{ rand }[1..1000].collect{|v| v.first}
+
+id = guessIds(genes,org, names.first)
+other = nil
+other_pos = 0
+other_count = 0
+other_name = 0
+if id.nil?
+(1..total - 1).to_a.each{|num|
+genes = data.collect{|v| v[num]}
+other = guessIds(genes,org, name = names[num])
+
+if other && other[1] > other_count
+other_pos = num
+other_count = other[1]
+other_name = names[num]
+end
+}
+end
+else
+raise "Soft file incomplete"
 end
-else
-raise "Soft file incomplete"
-end

-
-
+info = {:organism => org, :BioMart_ID => id ? id.first : nil, :title => title }
+info[:other_ID_field] = [other_pos + 1, other_name] if other_pos > 0


-
-
-
-
-
+Open.write(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml"), info.to_yaml)
+rescue Exception
+puts $!.message
+puts $!.backtrace
+Open.write(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"), $!.message)
+end
 end
+
+raise "Platform info for #{ platform } is not available and could not be automatically produced." if File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
+
+YAML::load(File.open(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")))
 end

-raise "Platform info for #{ platform } is not available and could not be automatically produced." if File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))

-YAML::load(File.open(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")))
 end

-def self.GDS_info(name)
-begin
-title, description = Open.read(dataset_path(name) + '.description').split(/\n--\n/).values_at(0,1)
-{:title => title.strip, :description => description.strip}
-rescue Exception
-puts $!.message
-{:title => "" , :description => "" }
-end

-
+#{{{ Process

+# Use R to load and process the datasets
+module Process

-
+# R library wrapper
+module R
+@@r = nil

-
-
-
+# Get the R instance
+def self.r
+if @@r.nil?

-
-
-end
+# FIXME: RSruby does not install very well, this require id hidden here.
+require 'rsruby'

-
-
-
-
-
-
-return files.first.match(/(.*)\./)[1]
-else
-return ""
+RSRuby.instance.source(MARQ.rootdir + '/R/MA.R')
+RSRuby.instance.source(MARQ.rootdir + '/R/GEO.R')
+RSRuby.instance.source(MARQ.rootdir + '/R/GEOquery_patch.R')
+@@r = RSRuby.instance
+end
+@@r
 end
-end
-end

-
-
-
+# Use R to load GPL info
+def self.GPL(name, prefix, id_field = nil)
+r.GEO_GPL_process(name, prefix, id_field, CACHE_DIR)
+end

-
-
-
-
-if dataset
-File.exists?(dataset_path(dataset, platform) + "_cross_platform.orders")
-else
-Dir.glob(File.join(platform_path(platform), '*', '*_cross_platform.orders')).any?
-end
-end
+# Use R to load process the dataset
+def self.GDS(name, prefix, id_field = nil, id_file = nil)
+r.GEO_GDS_process(name, prefix, id_field, id_file, CACHE_DIR)
+end

+# Use R to load process the series
+def self.GSE(gsms, conditions, do_log, prefix, id_file = nil, fields= nil, title = nil, description = nil)
+r.GEO_GSE_process(gsms, conditions, prefix, do_log, id_file, fields, title, description, CACHE_DIR)
+end
+end

-
-
-
+def self.translate(org, list)
+begin
+ID.translate_DB(org, list)
+rescue
+puts "DB translation failed, resorting to index"
+ID.translate_index(org, list)
+end
+end

-
-
-
-
+# Rearange the lines of a file with the given order. The order specifies, for
+# each position in the original file, where it should en in the final file
+def self.rearange(order, file, missing = "NA")
+orig_lines = []
+File.open(file).each_line{|l| orig_lines << l}

-
-
-File.basename(f)
-}.select{|platform|
-GPL_info(platform)[:organism] == organism &&
-platform_datasets(platform).any?
-}
-end
+return if orig_lines.empty?
+columns = orig_lines.first.split(/\t/).length

-
+lines = Array.new(order.length)

-
-
+orig_lines.each_with_index{|l,i|
+next if order[i].nil?
+lines[order[i]] = l.chomp
+}

-
-prefix = File.join(platform_path(platform), 'GDS', dataset.to_s)
-GEO.get_GDS(dataset, prefix, field, nil)
+lines = lines.collect{|l| l || [missing]*columns*"\t"}

-
-
-
-return
+fout = File.open(file, 'w')
+fout.puts(lines.join("\n"))
+fout.close
 end

-
-
-
+# Fix possible discrepancies in ids between series and platforms
+def self.fix_GSE_ids(platform_codes_file, prefix)
+platform_codes = File.open(platform_codes_file).collect{|l| l.chomp}
+platform_order = {}
+
+platform_codes.each_with_index{|code, i|
+platform_order[code] = i
+}
+
+series_codes = File.open(prefix + '.codes').collect{|l| l.chomp}
+
+platform_positions = platform_order.values_at(*series_codes)
+
+# Fill with nil for missing positions
+platform_positions[platform_codes.length - 1] ||= nil
+
+%w(t logratios orders pvalues).each{|ext|
+rearange(platform_positions, prefix + '.' + ext)
+}
+
+Open.write(prefix + '.swap', platform_positions.join("\n"))
 end
-end

-# Rearange the lines of a file with the given order. The order specifies, for
-# each position in the original file, where it should en in the final file
-def self.rearange(order, file, missing = "NA")
-orig_lines = []
-File.open(file).each_line{|l| orig_lines << l}

-
-
-
-
+# Process a dataset. Need to specify the platform. The field parameter can
+# be used to use a different column for the field.
+#
+# Deprecated in favor of using the original firt column and using a
+# different one only for translation
+def self.GDS(dataset, platform, field = nil)
+puts "Processing GDS #{ dataset }. Platform #{ platform }"
+platform_path = GEO.platform_path(platform)

-
-
-
-}
+puts "-- Original"
+prefix = File.join(platform_path, 'GDS', dataset.to_s)
+R.GDS(dataset, prefix, field, nil)

-
+# Was there an error?
+if File.exist?(prefix + '.skip')
+FileUtils.cp(prefix + '.skip', prefix + '_cross_platform.skip')
+return
+end

-
-
-
-
+if File.exist?(File.join(platform,'cross_platform'))
+puts "-- Translated to cross_platform format"
+R.GDS(dataset, prefix + '_cross_platform', field, File.join(platform_path, 'translations'))
+end
+end

-
-
-
-
-
-platform_codes.each_with_index{|code, i|
-platform_order[code] = i
-}
+# Process a series. The info parameters is a hash with the :array,
+# :platform, :log2 and :fields keys
+def self.GSE(series, info)
+return if Dir.glob(File.join(info[:platform], 'GSE', series) + '.*').any?

-series_codes = File.open(prefix + '.codes').collect{|l| l.chomp}

-
+gsms = []
+conditions = {}
+info[:arrays].each{|gsm, cond|
+gsms << gsm
+cond.each{|condition, value|
+conditions[condition] ||= []
+conditions[condition] << value
+}
+}
+platform = info[:platform]
+do_log = nil
+do_log = !info[:log2] if info[:log2]
+fields = info[:fields]
+
+puts "Processing GSE #{ series }. Platform #{ platform }"
+
+platform_path = GEO::platform_path(platform)
+prefix = File.join(platform_path, 'GSE', series.to_s)
+puts "-- Original"
+R.GSE(gsms, conditions, do_log, prefix, nil, fields, info[:title], info[:description])
+
+# Was there an error?
+if File.exist?(prefix + '.skip')
+FileUtils.cp(prefix + '.skip', prefix + '_cross_platform.skip')
+return
+end

-
-
+if platform =~ /_/
+FileUtils.cp(prefix + '.codes', File.join(platform_path,'codes'))
+codes = Open.read(File.join(platform_path, 'codes')).collect{|l| l.chomp}
+organism = SOFT::GPL(platform.match(/(.*?)_/)[1])[:organism]
+translations = translate(organism, codes)
+Open.write(File.join(platform_path, 'translations'), translations.collect{|v| v || "NO MATCH"}.join("\n"))
+Open.write(File.join(platform_path, 'cross_platform'), translations.compact.sort.uniq.join("\n"))
+else
+# Are the codes of the series equivalent to the ones in the platform?
+if File.open(File.join(platform_path,'codes')).collect{|l| l.chomp} != File.open(prefix + '.codes').collect{|l| l.chomp}
+fix_GSE_ids(File.join(platform_path, 'codes'),prefix);
+FileUtils.cp(File.join(platform_path, 'codes'),prefix + '.codes')
+end
+end

-%w(t logratios orders pvalues).each{|ext|
-rearange(platform_positions, prefix + '.' + ext)
-}

-
-
+if File.exist?(File.join(platform,'translations'))
+FileUtils.cp(File.join(platform,'translations'), prefix + '.translations')
+if File.exist?(prefix + '.swap')
+orders = Open.read(prefix + '.swap').collect{|l| l.chomp}
+inverse_orders = Array.new(orders.length)
+orders.each_with_index{|pos,i|
+next if pos !~ /\d/
+inverse_orders[pos.to_i] = i
+}
+rearange(inverse_orders, prefix + '.translations', "NO MATCH")
+end
+puts "-- Translated to cross_platform format"
+R.GSE(gsms, conditions, do_log, prefix + '_cross_platform', prefix + '.translations',fields, info[:title], info[:description])
+fix_GSE_ids(File.join(platform_path, 'cross_platform'),prefix + '_cross_platform');
+FileUtils.cp(File.join(platform_path, 'cross_platform'),prefix + '_cross_platform.codes')
+FileUtils.rm(prefix + '.translations') if File.exist?(prefix + '.translations')
+end
+FileUtils.rm(prefix + '.swap') if File.exist?(prefix + '.swap')
+end
+
+# Load GPL data. Translates IDS of the platform probes using AILUN and our
+# system (called biomart for clarity)
+def self.GPL(platform)
+path = GEO::platform_path(platform)
+return if File.exist? path
+
+if platform =~ /_/
+FileUtils.mkdir(path)
+FileUtils.mkdir(path + '/GSE')
+FileUtils.mkdir(path + '/GDS')
+return
+end

+info = SOFT.GPL(platform)
+organism = info[:organism]

+field = info[:other_ID_field]
+id = info[:BioMart_ID]
+org = info[:organism]
+field = nil if field == ""
+id = nil if id == ""

-def self.process_GSE(series, info)
-return if Dir.glob(File.join(info[:platform], 'GSE', series) + '.*').any?

-
-
-
-
-
-
-conditions[condition] << value
+puts "Processing Platform #{ platform }"
+[platform,
+File.join(path, 'GDS'),
+File.join(path, 'GSE'),
+].each{|d|
+FileUtils.mkdir d unless File.exist? d
 }
-}
-platform = info[:platform]
-do_log = nil
-do_log = !info[:log2] if info[:log2]
-fields = info[:fields]

-
+R.GPL(platform, path, nil)
+FileUtils.mv path + '.codes', File.join(path, 'codes')

-prefix = File.join(platform_path(platform), 'GSE', series.to_s)
-puts "-- Original"
-GEO.get_GSE(gsms, conditions, do_log, prefix, nil, fields, info[:title], info[:description])

-
-
-
-
-end
+# AILUN translations
+codes = Open.read(File.join(path, 'codes')).collect{|l| l.chomp}
+ailun = ID.AILUN_translate(platform, codes)
+Open.write(File.join(path, 'ailun'), ailun.collect{|v| v || "NO MATCH"}.join("\n")) if ailun.compact.length > codes.length.to_f / 10

-
-
-
-
-
-
-
-
-
-
-fix_GSE_ids(File.join(platform_path(platform), 'codes'),prefix);
-FileUtils.cp(File.join(platform_path(platform), 'codes'),prefix + '.codes')
+# BioMart translations
+biomart = []
+if id || field
+if id
+codes = Open.read(File.join(path, 'codes')).collect{|l| l.chomp}
+else
+if field
+R.GPL(platform, path, field[0])
+FileUtils.mv path + '.codes', File.join(path, 'other')
+end

+fix = GEO::SOFT::ID_FIX[(organism + "_" + field[1].downcase).to_sym]
+codes = Open.read(File.join(path, 'other')).collect{|l|
+code = l.chomp
+code = fix.call(code) if fix
+code
+}
+end
+
+biomart = translate(organism, codes)
+Open.write(File.join(path, 'biomart'), biomart.collect{|v| v || "NO MATCH"}.join("\n")) if biomart.compact.length > codes.length.to_f / 10
 end
-end

+# Select Best and save
+translations = []
+if ailun.compact.uniq.length > biomart.compact.uniq.length
+id_type = ID::DEFAULT_FORMATS[organism] || ID::DEFAULT_FORMAT_ALL || id || field || "Entrez Gene Id"
+if id_type.to_s !~ /Entrez/i
+translations = translate(org,ailun.collect{|gene| gene || "NO MATCH"})
+else
+translations = ailun
+end
+else
+translations = biomart
+end

-
-
-
-orders = Open.read(prefix + '.swap').collect{|l| l.chomp}
-inverse_orders = Array.new(orders.length)
-orders.each_with_index{|pos,i|
-next if pos !~ /\d/
-inverse_orders[pos.to_i] = i
-}
-rearange(inverse_orders, prefix + '.translations', "NO MATCH")
+if translations.compact.length > codes.length.to_f / 10
+Open.write(File.join(path, 'translations'), translations.collect{|v| v || "NO MATCH"}.join("\n"))
+Open.write(File.join(path, 'cross_platform'), translations.compact.sort.uniq.join("\n"))
 end
-
-GEO.get_GSE(gsms, conditions, do_log, prefix + '_cross_platform', prefix + '.translations',fields, info[:title], info[:description])
-fix_GSE_ids(File.join(platform_path(platform), 'cross_platform'),prefix + '_cross_platform');
-FileUtils.cp(File.join(platform_path(platform), 'cross_platform'),prefix + '_cross_platform.codes')
-FileUtils.rm(prefix + '.translations') if File.exist?(prefix + '.translations')
+
 end
-
+
 end

-def self.process_platform(platform)
-path = platform_path(platform)
-return if File.exist? path

-if platform =~ /_/
-FileUtils.mkdir(path)
-FileUtils.mkdir(path + '/GSE')
-FileUtils.mkdir(path + '/GDS')
-return
-end

-
-organism = info[:organism]
-
-field = info[:other_ID_field]
-id = info[:BioMart_ID]
-org = info[:organism]
-field = nil if field == ""
-id = nil if id == ""
-
-
-puts "Processing Platform #{ platform }"
-[platform,
-File.join(platform_path(platform), 'GDS'),
-File.join(platform_path(platform), 'GSE'),
-].each{|d|
-FileUtils.mkdir d unless File.exist? d
-}
+#{{{ Local data store info

-
-
-
+def self.clean(name)
+name.sub(/_cross_platform/,'') if name
+end

-# AILUN translations
-codes = Open.read(File.join(platform_path(platform), 'codes')).collect{|l| l.chomp}
-ailun = ID.AILUN_translate(platform, codes)
-Open.write(File.join(platform_path(platform), 'ailun'), ailun.collect{|v| v || "NO MATCH"}.join("\n")) if ailun.compact.length > codes.length.to_f / 10

-
-
-
-if id
-codes = Open.read(File.join(platform_path(platform), 'codes')).collect{|l| l.chomp}
-else
-if field
-get_GPL(platform, platform_path(platform), field[0])
-FileUtils.mv platform_path(platform) + '.codes', File.join(platform_path(platform), 'other')
-end
+def self.platform_path(platform)
+File.join(MARQ.datadir, "GEO/#{clean(platform)}")
+end

-fix = ID_FIX[(organism + "_" + field[1].downcase).to_sym]
-codes = Open.read(File.join(platform_path(platform), 'other')).collect{|l|
-code = l.chomp
-code = fix.call(code) if fix
-code
-}
-end

-
-
-
+def self.is_cross_platform?(dataset)
+dataset =~ /_cross_platform/
+end

-
-
-if
-
-
-
-else
-translations = ailun
-end
+def self.has_cross_platform?(dataset = nil, platform = nil)
+platform = clean(platform)
+raise "Dataset #{ dataset } not found" if dataset && dataset_path(dataset, platform).nil?
+raise "Platform #{ platform } not found" if platform && platform_path(platform).nil?
+if dataset
+File.exists?(dataset_path(dataset, platform) + "_cross_platform.orders")
 else
-
+Dir.glob(File.join(platform_path(platform), '*', '*_cross_platform.orders')).any?
 end
+end

-
-
-
+def self.dataset_path(dataset, platform = nil)
+if platform
+files = Dir.glob(File.join(platform_path(clean(platform)),"/*/#{ dataset }"))
+else
+files = Dir.glob(File.join(MARQ.datadir, "GEO/GPL*/*/#{ dataset }.*"))
 end
+return nil if files.empty?
+return files.first.match(/(.*)\./)[1]
+end

+def self.organism_platforms(organism)
+Dir.glob(File.join(MARQ.datadir, "GEO/GPL*")).collect{|f|
+File.basename(f)
+}.select{|platform|
+SOFT.GPL(platform)[:organism] == organism &&
+platform_datasets(platform).any?
+}
 end


-def self.process_platform_datasets(platform, force = false)
-raise "Platform #{ platform } not ready" unless File.exist? platform_path(platform)

-
+def self.platform_datasets(platform)
+Dir.glob(File.join(platform_path(platform),"*/*.orders")).collect{|f| File.basename(f).sub(/.orders$/,'')}.select{|d| !is_cross_platform?(d)}
+end

-
-
-
-process_GDS(dataset, platform, nil)
-}
+def self.dataset_platform(dataset)
+dataset_path(dataset).match(/(GPL\d+)/)
+$1
 end

+def self.GDS_info(name)
+begin
+title, description = Open.read(dataset_path(name) + '.description').split(/\n--\n/).values_at(0,1)
+{:title => title.strip, :description => description.strip}
+rescue Exception
+puts $!.message
+{:title => "" , :description => "" }
+end
+
+end
+
+
 end

-if __FILE__ == $0

-
-p GEO.GPL_id_fields('GPL920')
-puts GEO.GSE_info('GSE962')
-puts GEO.GSE_info('GSE8982')
-puts GEO::Eutils.GSE_dataset?('GSE8982')
-puts GEO::Eutils.GSE_dataset?('GSE962')
-
-exit
-
-#puts GEO::dataset_path('GDS1103').inspect
-#puts GEO::dataset_platform('GDS1103').inspect
-
-# puts GEO.dataset_path('GDS2931')
-# puts GEO.platform_datasets('GPL91')
-# puts GEO.platform_datasets('GPL91').select{|d| GEO.has_cross_platform?(d)}
-#
-# gpls = Open.read('ftp://ftp.ncbi.nih.gov/pub/geo/DATA/supplementary/PLATFORMS.txt').collect{|l|
-# l.chomp.split.first
-# }
-#
-# %w(GPL85).each{|gpl|
-# puts gpl
-# puts GEO::GPL_info(gpl).inspect if gpl =~ /GPL/
-# }
-#
-#puts GEO::GSM_info('GSM70604').inspect
-
-p GEO::Eutils.organism_platforms('human')
+if __FILE__ == $0

 end
+
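
For orientation, a rough usage sketch of the reorganized module layout introduced above (GEO::Remote for Entrez/eutils lookups, GEO::SOFT for .soft parsing, GEO::Process for the R-backed processing). It only calls methods that appear in this diff; the require path, the organism code, and the comments on return values are illustrative assumptions, not documented API.

require 'MARQ'
require 'MARQ/GEO'   # assumed load path for data/lib/MARQ/GEO.rb

platform = GEO::Remote.organism_platforms('human').first   # e.g. "GPL96", via NCBI eutils
info     = GEO::SOFT.GPL(platform)                          # {:organism => ..., :BioMart_ID => ..., :title => ...}
GEO::Process.GPL(platform)                                  # prepare local platform dir, codes and translations
GEO::Remote.platform_datasets(platform).each do |gds|
  GEO::Process.GDS(gds, platform)                           # writes processed files under MARQ.datadir/GEO/<platform>/GDS
end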