rbbt-marq 1.0.9 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/R/GEO.R +68 -124
- data/R/GEOquery_patch.R +44 -0
- data/R/MARQ.R +76 -0
- data/install_scripts/GEO/Rakefile +111 -74
- data/lib/MARQ/GEO.rb +459 -451
- data/lib/MARQ/ID.rb +1 -0
- data/lib/MARQ/main.rb +82 -2
- metadata +13 -2
data/lib/MARQ/GEO.rb
CHANGED
@@ -1,594 +1,602 @@
|
|
1
1
|
require 'MARQ'
|
2
|
-
require 'rbbt/util/open'
|
3
2
|
require 'rbbt/sources/organism'
|
4
3
|
|
4
|
+
# Work with GEO datasets
|
5
5
|
module GEO
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
GEO_SOFT="http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&view=full&form=text&acc="
|
11
|
-
def self.get_soft(item)
|
12
|
-
item = item.strip
|
13
|
-
cache_file = File.join(CACHE_DIR, item + '.soft')
|
14
|
-
if File.exist?( cache_file )
|
15
|
-
File.open(cache_file).read
|
16
|
-
else
|
17
|
-
content = Open.read(GEO_SOFT + item, :nocache => true)
|
18
|
-
fout = File.open(cache_file,'w')
|
19
|
-
fout.write content
|
20
|
-
fout.close
|
21
|
-
content
|
22
|
-
end
|
23
|
-
end
|
7
|
+
# Get information from Entrez
|
8
|
+
module Remote
|
24
9
|
|
25
|
-
#{{{ Eutils
|
26
|
-
module Eutils
|
27
10
|
# List the GPL platform accessions that Entrez eSearch (db=gds) reports for
# an organism.
#
# org - organism code, resolved to its name through Organism.name.
#
# Returns an array of "GPL<n>" strings; the leading "1" plus zero padding of
# each returned Id is removed to obtain <n>.
def self.organism_platforms(org)
  name = Organism.name(org)
  url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=\"#{name}\"[Organism:exp]+AND+%22gpl%22[Filter]&retmax=10000"
  Open.read(url).scan(/<Id>(\d+?)<\/Id>/).collect{|found|
    "GPL#{found.first.sub(/^100*/,'')}"
  }
end
|
32
15
|
|
33
|
-
def self.
|
16
|
+
# List the GDS datasets that Entrez eSearch (db=gds) reports for a platform
# accession.
#
# Returned Ids starting with "1000" or "2000" are dropped; the remaining
# ones are prefixed with "GDS".
def self.platform_datasets(platform)
  url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=#{platform}[Accession]&retmax=2000"
  ids = Open.read(url).scan(/<Id>(\d+?)<\/Id>/).collect{|found| found.first}
  ids.reject{|id| id.match(/^(1|2)000/) }.collect{|id| "GDS#{id}"}
end
|
37
20
|
|
38
|
-
def self.
|
21
|
+
# Find the platform(s) of a dataset by scraping its GEO web page.
#
# Accessions containing "GSE" are looked up on the acc.cgi page, anything
# else on the GDSbrowser page. Every distinct GPL<n> found in the page is
# kept; the result is those ids sorted as strings and joined with "_".
def self.dataset_platform(dataset)
  url = if dataset =~ /GSE/
    "http://www.ncbi.nlm.nih.gov/projects/geo/query/acc.cgi?acc=#{dataset}"
  else
    "http://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=#{dataset}"
  end
  Open.read(url).scan(/GPL\d+/).uniq.sort.join("_")
end
|
28
|
+
|
29
|
+
# True when an Entrez eSearch (db=geo) for the series accession returns at
# least one <Id> element.
def self.series_dataset?(gse)
  response = Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=geo&term=#{gse}[Accession]&retmax=2000")
  !response.match(/<Id>(\d+?)<\/Id>/).nil?
end
|
42
33
|
|
43
34
|
end
|
44
35
|
|
36
|
+
# Cache directory for GEO data (get_soft stores downloaded .soft files here);
# created at load time when it does not exist yet.
CACHE_DIR = File.join(MARQ.cachedir,'GEO')
# File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
FileUtils.mkdir_p CACHE_DIR unless File.exist? CACHE_DIR
|
45
38
|
|
46
39
|
|
47
|
-
#
|
48
|
-
|
40
|
+
# Parse information in .soft files
|
41
|
+
module SOFT
|
49
42
|
|
50
|
-
|
51
|
-
ids.collect{|id| id.to_i}.sort[0..19] == (1..20).to_a
|
52
|
-
end
|
43
|
+
GEO_SOFT="http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&view=full&form=text&acc="
|
53
44
|
|
54
|
-
|
55
|
-
|
56
|
-
|
45
|
+
# Download a soft file. Uses cache
|
46
|
+
# item - GEO accession; surrounding whitespace is stripped before use.
#
# Returns the SOFT text, read from CACHE_DIR/<item>.soft when that file
# exists and otherwise fetched from GEO_SOFT + item and then cached.
# Raises "SOFT file error" when the fetched text contains no '!' character;
# nothing is written to the cache in that case.
def self.get_soft(item)
  item = item.strip
  cache_file = File.join(CACHE_DIR, item + '.soft')

  # File.read opens and closes the handle itself.
  return File.read(cache_file) if File.exist?( cache_file )

  content = Open.read(GEO_SOFT + item, :nocache => true)
  raise "SOFT file error" if content !~ /!/

  # Block form closes the handle even when the write raises.
  File.open(cache_file,'w'){|fout| fout.write content }
  content
end
|
57
60
|
|
58
|
-
|
59
|
-
ids.compact.select{|id| ! id.strip.match(/^[ATCG]+$/i)}.empty?
|
60
|
-
end
|
61
|
+
#{{{ Guess the format of the IDS
|
61
62
|
|
63
|
+
@@formats = {}
|
62
64
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
65
|
+
# Per-column id normalizers, keyed by "<organism>_<column name>" symbols (the
# lookups build the key as (org + "_" + name.downcase).to_sym). Each proc
# adds the cluster prefix when it is missing and returns nil for a nil id.
ID_FIX = {
  # The dot is escaped so that only a literal "Mm." / "Hs." counts as an
  # existing prefix; unescaped it would accept any third character.
  :mgi_unigene   => proc{|gene| if gene then gene.match(/^Mm\./) ? gene : "Mm." + gene end},
  :human_unigene => proc{|gene| if gene then gene.match(/^Hs\./) ? gene : "Hs." + gene end},
}
|
67
69
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
if consecutive?(genes) || dna_sequence?(genes) || (numerical?(genes) && (name.nil? || !name.match(/entrez/i)))
|
72
|
-
id = nil
|
73
|
-
else
|
74
|
-
fix = ID_FIX[(org + "_" + name.downcase).to_sym] if name
|
75
|
-
if fix
|
76
|
-
genes = genes.collect{|gene| fix.call(gene)}
|
77
|
-
end
|
78
|
-
id = Organism.guessIdFormat(@@formats[org], genes)
|
70
|
+
# Id list is in sequence
|
71
|
+
# True when, after converting every id with to_i and sorting, the first
# `length` values are exactly 1..length.
#
# ids    - list of ids (anything responding to to_i; nil counts as 0).
# length - how many leading values to compare; defaults to 20, the size
#          that used to be hard-coded.
def self.consecutive?(ids, length = 20)
  ids.collect{|id| id.to_i}.sort[0...length] == (1..length).to_a
end
|
80
|
-
|
81
|
-
id
|
82
|
-
end
|
83
|
-
|
84
|
-
@@r = nil
|
85
|
-
def self.r
|
86
|
-
if @@r.nil?
|
87
74
|
|
88
|
-
|
89
|
-
|
75
|
+
# Id list is numerical
|
76
|
+
# True when the number of distinct non-nil ids that are not made only of
# digits is below one tenth of the total number of entries (nil included
# in the total).
def self.numerical?(ids)
  # \A and \z anchor the whole string; ^ and $ would accept a multi-line id
  # as soon as one of its lines is numeric.
  non_numeric = ids.compact.reject{|id| id =~ /\A\d+\z/ }
  non_numeric.uniq.length < ids.length.to_f / 10
end
|
90
79
|
|
91
|
-
|
92
|
-
|
93
|
-
|
80
|
+
# ID are DNA bases
|
81
|
+
# True when every non-nil id, once stripped, consists only of the letters
# A, T, C and G (any case). An empty or all-nil list also yields true.
def self.dna_sequence?(ids)
  # \A and \z anchor the whole string; ^ and $ would accept a multi-line id
  # as soon as one of its lines looks like a sequence.
  ids.compact.all?{|id| id.strip =~ /\A[ATCG]+\z/i }
end
|
95
|
-
@@r
|
96
|
-
end
|
97
84
|
|
85
|
+
# Guess the format of the id in the list. The name parameter can be used to
|
86
|
+
# identify some exceptions
|
87
|
+
# genes - sample of values taken from one column.
# org   - organism code; its id formats are fetched once through
#         Organism.id_formats and memoized in @@formats.
# name  - optional column header.
#
# Returns nil when the values are consecutive numbers, DNA sequences, or
# numerical under a header that does not mention "entrez"; otherwise
# whatever Organism.guessIdFormat returns for the values (after applying
# the ID_FIX entry for "<org>_<name>", when one exists).
def self.guessIds(genes,org, name = nil)
  @@formats[org] ||= Organism.id_formats(org)

  return nil if consecutive?(genes) || dna_sequence?(genes)
  return nil if numerical?(genes) && (name.nil? || !name.match(/entrez/i))

  fixer = name ? ID_FIX[(org + "_" + name.downcase).to_sym] : nil
  genes = genes.collect{|gene| fixer.call(gene)} if fixer

  Organism.guessIdFormat(@@formats[org], genes)
end
|
100
101
|
|
101
|
-
def self.get_GPL(name, prefix, id_field = nil)
|
102
|
-
r.GEO_GPL_process(name, prefix, id_field, CACHE_DIR)
|
103
|
-
end
|
104
102
|
|
105
|
-
def self.get_GDS(name, prefix, id_field = nil, id_file = nil)
|
106
|
-
r.GEO_GDS_process(name, prefix, id_field, id_file, CACHE_DIR)
|
107
|
-
end
|
108
103
|
|
109
|
-
|
110
|
-
|
111
|
-
end
|
104
|
+
# Extract series metadata from the SOFT record of a GSE accession.
#
# series - series accession, handed to get_soft.
#
# Returns a hash with:
#   :platform    - every !Series_platform_id value, joined with "_"
#   :description - every !Series_summary value, joined with newlines
#   :title       - the first !Series_title value
#   :samples     - array with every !Series_sample_id value
#
# Raises a RuntimeError ("No Platform information", "No Title information",
# "No Summary information" or "No Sample information") when the matching
# lines are absent from the record.
def self.GSE(series)
  soft = get_soft(series)

  # String#scan always returns an array, so a missing section shows up as
  # an empty result rather than a falsy one.
  platform = soft.scan(/!Series_platform_id\s*=?\s*(.*)/).flatten.collect{|p| p.strip}
  raise "No Platform information" if platform.empty?

  title_match = soft.match(/!Series_title \s*=?\s*(.*)/)
  raise "No Title information" if title_match.nil?
  title = title_match[1]

  # scan yields one [capture] array per line; flatten gives the captured
  # text itself (Array#to_s would give its inspect form on Ruby >= 1.9).
  summaries = soft.scan(/!Series_summary \s*=?\s*(.*)/).flatten.collect{|s| s.strip}
  raise "No Summary information" if summaries.empty?
  description = summaries.join("\n")

  samples = soft.scan(/!Series_sample_id \s*=?\s*(.*)/).flatten.collect{|s| s.strip}
  raise "No Sample information" if samples.empty?

  {
    :platform => platform.join("_"),
    :description =>description.strip,
    :title => title.strip,
    :samples => samples,
  }
end
|
142
140
|
|
143
|
-
|
144
|
-
|
145
|
-
:description =>description.strip,
|
146
|
-
:title => title.strip,
|
147
|
-
:samples => samples,
|
148
|
-
}
|
149
|
-
end
|
141
|
+
def self.GSM(array)
|
142
|
+
soft = get_soft(array)
|
150
143
|
|
151
|
-
|
152
|
-
|
144
|
+
if soft.match(/!Sample_title\s*=?\s*(.*)/)
|
145
|
+
title = $1
|
146
|
+
else
|
147
|
+
raise "No Title information"
|
148
|
+
end
|
153
149
|
|
154
|
-
if soft.match(/!Sample_title\s*=?\s*(.*)/)
|
155
|
-
title = $1
|
156
|
-
else
|
157
|
-
raise "No Title information"
|
158
|
-
end
|
159
150
|
|
151
|
+
if soft.match(/!Sample_description \s*=?\s*(.*)/)
|
152
|
+
description = $1
|
153
|
+
else
|
154
|
+
raise "No Description information"
|
155
|
+
end
|
160
156
|
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
157
|
+
{
|
158
|
+
|
159
|
+
:description =>description.strip,
|
160
|
+
:title => title.strip,
|
161
|
+
}
|
165
162
|
end
|
166
163
|
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
164
|
+
def self.GPL(platform)
|
165
|
+
if !File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")) &&
|
166
|
+
!File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
|
167
|
+
begin
|
168
|
+
if platform =~ /_/
|
169
|
+
organism = GPL(platform.match(/(.*?)_/)[1])[:organism]
|
173
170
|
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
def self.GPL_info(platform)
|
182
|
-
if !File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")) &&
|
183
|
-
!File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
|
184
|
-
begin
|
185
|
-
if platform =~ /_/
|
186
|
-
organism = GPL_info(platform.match(/(.*?)_/)[1])[:organism]
|
187
|
-
|
188
|
-
info = {
|
189
|
-
:organism => organism,
|
190
|
-
:title => "Merged platforms #{ platform }",
|
191
|
-
}
|
192
|
-
return info
|
193
|
-
end
|
194
|
-
soft = get_soft(platform)
|
171
|
+
info = {
|
172
|
+
:organism => organism,
|
173
|
+
:title => "Merged platforms #{ platform }",
|
174
|
+
}
|
175
|
+
return info
|
176
|
+
end
|
177
|
+
soft = get_soft(platform)
|
195
178
|
|
196
179
|
|
197
|
-
|
180
|
+
raise "SOFT file error" if soft !~ /!/
|
198
181
|
|
199
|
-
|
182
|
+
organisms = soft.scan(/!Platform_organism\s*=\s*(.*)/).collect{|v| v.first.strip}
|
200
183
|
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
184
|
+
if organisms.empty?
|
185
|
+
raise "No Organism information"
|
186
|
+
else
|
187
|
+
# This might happen actually GPL2529
|
188
|
+
organisms.delete('Schizosaccharomyces pombe') if organisms.include?('Saccharomyces cerevisiae')
|
189
|
+
org_name = organisms.first
|
190
|
+
end
|
208
191
|
|
209
192
|
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
193
|
+
title = ""
|
194
|
+
if soft.match(/!Platform_title\s*=\s*(.*)/)
|
195
|
+
title = $1
|
196
|
+
end
|
214
197
|
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
198
|
+
org = Organism.name2org(org_name)
|
199
|
+
raise "Organism not identified: #{org_name}" if org.nil?
|
200
|
+
|
201
|
+
if soft.match(/!platform_table_begin/)
|
202
|
+
data = soft.split(/!platform_table_begin/s)[1].collect{|l| l.chomp.split(/\t/)}
|
203
|
+
data.shift
|
204
|
+
names = data.shift
|
205
|
+
total = data.first.length
|
206
|
+
genes = data.sort_by{ rand }[1..1000].collect{|v| v.first}
|
207
|
+
|
208
|
+
id = guessIds(genes,org, names.first)
|
209
|
+
other = nil
|
210
|
+
other_pos = 0
|
211
|
+
other_count = 0
|
212
|
+
other_name = 0
|
213
|
+
if id.nil?
|
214
|
+
(1..total - 1).to_a.each{|num|
|
215
|
+
genes = data.collect{|v| v[num]}
|
216
|
+
other = guessIds(genes,org, name = names[num])
|
217
|
+
|
218
|
+
if other && other[1] > other_count
|
219
|
+
other_pos = num
|
220
|
+
other_count = other[1]
|
221
|
+
other_name = names[num]
|
222
|
+
end
|
223
|
+
}
|
224
|
+
end
|
225
|
+
else
|
226
|
+
raise "Soft file incomplete"
|
241
227
|
end
|
242
|
-
else
|
243
|
-
raise "Soft file incomplete"
|
244
|
-
end
|
245
228
|
|
246
|
-
|
247
|
-
|
229
|
+
info = {:organism => org, :BioMart_ID => id ? id.first : nil, :title => title }
|
230
|
+
info[:other_ID_field] = [other_pos + 1, other_name] if other_pos > 0
|
248
231
|
|
249
232
|
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
233
|
+
Open.write(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml"), info.to_yaml)
|
234
|
+
rescue Exception
|
235
|
+
puts $!.message
|
236
|
+
puts $!.backtrace
|
237
|
+
Open.write(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"), $!.message)
|
238
|
+
end
|
255
239
|
end
|
240
|
+
|
241
|
+
raise "Platform info for #{ platform } is not available and could not be automatically produced." if File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
|
242
|
+
|
243
|
+
YAML::load(File.open(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")))
|
256
244
|
end
|
257
245
|
|
258
|
-
raise "Platform info for #{ platform } is not available and could not be automatically produced." if File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
|
259
246
|
|
260
|
-
YAML::load(File.open(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")))
|
261
247
|
end
|
262
248
|
|
263
|
-
def self.GDS_info(name)
|
264
|
-
begin
|
265
|
-
title, description = Open.read(dataset_path(name) + '.description').split(/\n--\n/).values_at(0,1)
|
266
|
-
{:title => title.strip, :description => description.strip}
|
267
|
-
rescue Exception
|
268
|
-
puts $!.message
|
269
|
-
{:title => "" , :description => "" }
|
270
|
-
end
|
271
249
|
|
272
|
-
|
250
|
+
#{{{ Process
|
273
251
|
|
252
|
+
# Use R to load and process the datasets
|
253
|
+
module Process
|
274
254
|
|
275
|
-
|
255
|
+
# R library wrapper
|
256
|
+
module R
|
257
|
+
@@r = nil
|
276
258
|
|
277
|
-
|
278
|
-
|
279
|
-
|
259
|
+
# Get the R instance
|
260
|
+
def self.r
|
261
|
+
if @@r.nil?
|
280
262
|
|
281
|
-
|
282
|
-
|
283
|
-
end
|
263
|
+
# FIXME: RSRuby does not install very well, so this require is hidden here.
|
264
|
+
require 'rsruby'
|
284
265
|
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
return files.first.match(/(.*)\./)[1]
|
292
|
-
else
|
293
|
-
return ""
|
266
|
+
RSRuby.instance.source(MARQ.rootdir + '/R/MA.R')
|
267
|
+
RSRuby.instance.source(MARQ.rootdir + '/R/GEO.R')
|
268
|
+
RSRuby.instance.source(MARQ.rootdir + '/R/GEOquery_patch.R')
|
269
|
+
@@r = RSRuby.instance
|
270
|
+
end
|
271
|
+
@@r
|
294
272
|
end
|
295
|
-
end
|
296
|
-
end
|
297
273
|
|
298
|
-
|
299
|
-
|
300
|
-
|
274
|
+
# Use R to load GPL info
|
275
|
+
# Delegates to the GEO_GPL_process function of the R instance returned by r,
# passing name, prefix and id_field through unchanged and CACHE_DIR as the
# last argument.
# NOTE(review): callers in this file move "<prefix>.codes" afterwards, so the
# R function presumably writes that file -- confirm against GEO.R.
def self.GPL(name, prefix, id_field = nil)
|
276
|
+
r.GEO_GPL_process(name, prefix, id_field, CACHE_DIR)
|
277
|
+
end
|
301
278
|
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
if dataset
|
307
|
-
File.exists?(dataset_path(dataset, platform) + "_cross_platform.orders")
|
308
|
-
else
|
309
|
-
Dir.glob(File.join(platform_path(platform), '*', '*_cross_platform.orders')).any?
|
310
|
-
end
|
311
|
-
end
|
279
|
+
# Use R to load and process the dataset
|
280
|
+
# Delegates to the GEO_GDS_process function of the R instance returned by r,
# passing name, prefix, id_field and id_file through unchanged and CACHE_DIR
# as the last argument.
# NOTE(review): callers in this file test for "<prefix>.skip" afterwards, so
# the R side presumably signals failure through that file -- confirm against
# GEO.R.
def self.GDS(name, prefix, id_field = nil, id_file = nil)
|
281
|
+
r.GEO_GDS_process(name, prefix, id_field, id_file, CACHE_DIR)
|
282
|
+
end
|
312
283
|
|
284
|
+
# Use R to load and process the series
|
285
|
+
# Delegates to the GEO_GSE_process function of the R instance returned by r,
# with CACHE_DIR appended as the last argument.
# NOTE(review): prefix is passed to R before do_log, the reverse of this
# method's own parameter order -- presumably this matches the R function's
# signature; confirm against GEO.R.
def self.GSE(gsms, conditions, do_log, prefix, id_file = nil, fields= nil, title = nil, description = nil)
|
286
|
+
r.GEO_GSE_process(gsms, conditions, prefix, do_log, id_file, fields, title, description, CACHE_DIR)
|
287
|
+
end
|
288
|
+
end
|
313
289
|
|
314
|
-
|
315
|
-
|
316
|
-
|
290
|
+
# Translate a list of ids for an organism with ID.translate_DB; when that
# raises a StandardError, report it on stdout and return the result of
# ID.translate_index instead.
def self.translate(org, list)
  ID.translate_DB(org, list)
rescue
  puts "DB translation failed, resorting to index"
  ID.translate_index(org, list)
end
|
317
298
|
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
299
|
+
# Rearrange the lines of a file with the given order. The order specifies, for
|
300
|
+
# each position in the original file, where it should end up in the final file
|
301
|
+
# Rewrite `file` in place with its lines moved to new positions.
#
# order   - array where order[i] is the 0-based target position of original
#           line i; a nil (or absent) entry drops that line.
# file    - path of a tab-separated text file.
# missing - filler value for target positions that receive no line; it is
#           repeated once per column of the first original line.
#
# Returns nil. An empty file is left untouched.
def self.rearange(order, file, missing = "NA")
  # readlines opens and closes the file itself, so no handle is left open.
  orig_lines = File.readlines(file)

  return if orig_lines.empty?
  columns = orig_lines.first.split(/\t/).length

  lines = Array.new(order.length)
  orig_lines.each_with_index{|l,i|
    next if order[i].nil?
    lines[order[i]] = l.chomp
  }

  filler = ([missing] * columns).join("\t")
  lines = lines.collect{|l| l || filler}

  # Block form closes the handle even when the write raises.
  File.open(file, 'w'){|fout| fout.puts(lines.join("\n")) }
end
|
346
321
|
|
347
|
-
|
348
|
-
|
349
|
-
|
322
|
+
# Fix possible discrepancies in ids between series and platforms
|
323
|
+
# Reorder the result files of a series so that their rows follow the order
# of the platform codes.
#
# platform_codes_file - file with one platform code per line.
# prefix              - path prefix of the series files; "<prefix>.codes"
#                       lists the series codes, one per line.
#
# For each series code the position of that code in the platform file is
# looked up (nil when the platform lacks it). The .t, .logratios, .orders and
# .pvalues files are rearranged with those positions, and the positions are
# written to "<prefix>.swap", one per line (empty line for nil).
def self.fix_GSE_ids(platform_codes_file, prefix)
  # readlines closes the files; File.open(...).collect left them open.
  platform_codes = File.readlines(platform_codes_file).collect{|l| l.chomp}
  platform_order = {}
  platform_codes.each_with_index{|code, i| platform_order[code] = i }

  series_codes = File.readlines(prefix + '.codes').collect{|l| l.chomp}

  platform_positions = platform_order.values_at(*series_codes)

  # Pad with nil up to the number of platform codes, so rearange produces one
  # output row per platform code. The explicit length test also covers two
  # empty lists, where indexing position -1 would raise IndexError.
  if platform_positions.length < platform_codes.length
    platform_positions[platform_codes.length - 1] = nil
  end

  %w(t logratios orders pvalues).each{|ext|
    rearange(platform_positions, prefix + '.' + ext)
  }

  Open.write(prefix + '.swap', platform_positions.join("\n"))
end
|
351
|
-
end
|
352
344
|
|
353
|
-
# Rearange the lines of a file with the given order. The order specifies, for
|
354
|
-
# each position in the original file, where it should en in the final file
|
355
|
-
def self.rearange(order, file, missing = "NA")
|
356
|
-
orig_lines = []
|
357
|
-
File.open(file).each_line{|l| orig_lines << l}
|
358
345
|
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
346
|
+
# Process a dataset. Need to specify the platform. The field parameter can
|
347
|
+
# be used to use a different column for the field.
|
348
|
+
#
|
349
|
+
# Deprecated in favor of using the original first column and using a
|
350
|
+
# different one only for translation
|
351
|
+
def self.GDS(dataset, platform, field = nil)
|
352
|
+
puts "Processing GDS #{ dataset }. Platform #{ platform }"
|
353
|
+
platform_path = GEO.platform_path(platform)
|
363
354
|
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
}
|
355
|
+
puts "-- Original"
|
356
|
+
prefix = File.join(platform_path, 'GDS', dataset.to_s)
|
357
|
+
R.GDS(dataset, prefix, field, nil)
|
368
358
|
|
369
|
-
|
359
|
+
# Was there an error?
|
360
|
+
if File.exist?(prefix + '.skip')
|
361
|
+
FileUtils.cp(prefix + '.skip', prefix + '_cross_platform.skip')
|
362
|
+
return
|
363
|
+
end
|
370
364
|
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
365
|
+
if File.exist?(File.join(platform,'cross_platform'))
|
366
|
+
puts "-- Translated to cross_platform format"
|
367
|
+
R.GDS(dataset, prefix + '_cross_platform', field, File.join(platform_path, 'translations'))
|
368
|
+
end
|
369
|
+
end
|
375
370
|
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
platform_codes.each_with_index{|code, i|
|
382
|
-
platform_order[code] = i
|
383
|
-
}
|
371
|
+
# Process a series. The info parameter is a hash with the :arrays,
|
372
|
+
# :platform, :log2 and :fields keys
|
373
|
+
def self.GSE(series, info)
|
374
|
+
return if Dir.glob(File.join(info[:platform], 'GSE', series) + '.*').any?
|
384
375
|
|
385
|
-
series_codes = File.open(prefix + '.codes').collect{|l| l.chomp}
|
386
376
|
|
387
|
-
|
377
|
+
gsms = []
|
378
|
+
conditions = {}
|
379
|
+
info[:arrays].each{|gsm, cond|
|
380
|
+
gsms << gsm
|
381
|
+
cond.each{|condition, value|
|
382
|
+
conditions[condition] ||= []
|
383
|
+
conditions[condition] << value
|
384
|
+
}
|
385
|
+
}
|
386
|
+
platform = info[:platform]
|
387
|
+
do_log = nil
|
388
|
+
do_log = !info[:log2] if info[:log2]
|
389
|
+
fields = info[:fields]
|
390
|
+
|
391
|
+
puts "Processing GSE #{ series }. Platform #{ platform }"
|
392
|
+
|
393
|
+
platform_path = GEO::platform_path(platform)
|
394
|
+
prefix = File.join(platform_path, 'GSE', series.to_s)
|
395
|
+
puts "-- Original"
|
396
|
+
R.GSE(gsms, conditions, do_log, prefix, nil, fields, info[:title], info[:description])
|
397
|
+
|
398
|
+
# Was there an error?
|
399
|
+
if File.exist?(prefix + '.skip')
|
400
|
+
FileUtils.cp(prefix + '.skip', prefix + '_cross_platform.skip')
|
401
|
+
return
|
402
|
+
end
|
388
403
|
|
389
|
-
|
390
|
-
|
404
|
+
if platform =~ /_/
|
405
|
+
FileUtils.cp(prefix + '.codes', File.join(platform_path,'codes'))
|
406
|
+
codes = Open.read(File.join(platform_path, 'codes')).collect{|l| l.chomp}
|
407
|
+
organism = SOFT::GPL(platform.match(/(.*?)_/)[1])[:organism]
|
408
|
+
translations = translate(organism, codes)
|
409
|
+
Open.write(File.join(platform_path, 'translations'), translations.collect{|v| v || "NO MATCH"}.join("\n"))
|
410
|
+
Open.write(File.join(platform_path, 'cross_platform'), translations.compact.sort.uniq.join("\n"))
|
411
|
+
else
|
412
|
+
# Are the codes of the series equivalent to the ones in the platform?
|
413
|
+
if File.open(File.join(platform_path,'codes')).collect{|l| l.chomp} != File.open(prefix + '.codes').collect{|l| l.chomp}
|
414
|
+
fix_GSE_ids(File.join(platform_path, 'codes'),prefix);
|
415
|
+
FileUtils.cp(File.join(platform_path, 'codes'),prefix + '.codes')
|
416
|
+
end
|
417
|
+
end
|
391
418
|
|
392
|
-
%w(t logratios orders pvalues).each{|ext|
|
393
|
-
rearange(platform_positions, prefix + '.' + ext)
|
394
|
-
}
|
395
419
|
|
396
|
-
|
397
|
-
|
420
|
+
if File.exist?(File.join(platform,'translations'))
|
421
|
+
FileUtils.cp(File.join(platform,'translations'), prefix + '.translations')
|
422
|
+
if File.exist?(prefix + '.swap')
|
423
|
+
orders = Open.read(prefix + '.swap').collect{|l| l.chomp}
|
424
|
+
inverse_orders = Array.new(orders.length)
|
425
|
+
orders.each_with_index{|pos,i|
|
426
|
+
next if pos !~ /\d/
|
427
|
+
inverse_orders[pos.to_i] = i
|
428
|
+
}
|
429
|
+
rearange(inverse_orders, prefix + '.translations', "NO MATCH")
|
430
|
+
end
|
431
|
+
puts "-- Translated to cross_platform format"
|
432
|
+
R.GSE(gsms, conditions, do_log, prefix + '_cross_platform', prefix + '.translations',fields, info[:title], info[:description])
|
433
|
+
fix_GSE_ids(File.join(platform_path, 'cross_platform'),prefix + '_cross_platform');
|
434
|
+
FileUtils.cp(File.join(platform_path, 'cross_platform'),prefix + '_cross_platform.codes')
|
435
|
+
FileUtils.rm(prefix + '.translations') if File.exist?(prefix + '.translations')
|
436
|
+
end
|
437
|
+
FileUtils.rm(prefix + '.swap') if File.exist?(prefix + '.swap')
|
438
|
+
end
|
439
|
+
|
440
|
+
# Load GPL data. Translates IDS of the platform probes using AILUN and our
|
441
|
+
# system (called biomart for clarity)
|
442
|
+
def self.GPL(platform)
|
443
|
+
path = GEO::platform_path(platform)
|
444
|
+
return if File.exist? path
|
445
|
+
|
446
|
+
if platform =~ /_/
|
447
|
+
FileUtils.mkdir(path)
|
448
|
+
FileUtils.mkdir(path + '/GSE')
|
449
|
+
FileUtils.mkdir(path + '/GDS')
|
450
|
+
return
|
451
|
+
end
|
398
452
|
|
453
|
+
info = SOFT.GPL(platform)
|
454
|
+
organism = info[:organism]
|
399
455
|
|
456
|
+
field = info[:other_ID_field]
|
457
|
+
id = info[:BioMart_ID]
|
458
|
+
org = info[:organism]
|
459
|
+
field = nil if field == ""
|
460
|
+
id = nil if id == ""
|
400
461
|
|
401
|
-
def self.process_GSE(series, info)
|
402
|
-
return if Dir.glob(File.join(info[:platform], 'GSE', series) + '.*').any?
|
403
462
|
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
conditions[condition] << value
|
463
|
+
puts "Processing Platform #{ platform }"
|
464
|
+
[platform,
|
465
|
+
File.join(path, 'GDS'),
|
466
|
+
File.join(path, 'GSE'),
|
467
|
+
].each{|d|
|
468
|
+
FileUtils.mkdir d unless File.exist? d
|
411
469
|
}
|
412
|
-
}
|
413
|
-
platform = info[:platform]
|
414
|
-
do_log = nil
|
415
|
-
do_log = !info[:log2] if info[:log2]
|
416
|
-
fields = info[:fields]
|
417
470
|
|
418
|
-
|
471
|
+
R.GPL(platform, path, nil)
|
472
|
+
FileUtils.mv path + '.codes', File.join(path, 'codes')
|
419
473
|
|
420
|
-
prefix = File.join(platform_path(platform), 'GSE', series.to_s)
|
421
|
-
puts "-- Original"
|
422
|
-
GEO.get_GSE(gsms, conditions, do_log, prefix, nil, fields, info[:title], info[:description])
|
423
474
|
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
end
|
475
|
+
# AILUN translations
|
476
|
+
codes = Open.read(File.join(path, 'codes')).collect{|l| l.chomp}
|
477
|
+
ailun = ID.AILUN_translate(platform, codes)
|
478
|
+
Open.write(File.join(path, 'ailun'), ailun.collect{|v| v || "NO MATCH"}.join("\n")) if ailun.compact.length > codes.length.to_f / 10
|
429
479
|
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
fix_GSE_ids(File.join(platform_path(platform), 'codes'),prefix);
|
441
|
-
FileUtils.cp(File.join(platform_path(platform), 'codes'),prefix + '.codes')
|
480
|
+
# BioMart translations
|
481
|
+
biomart = []
|
482
|
+
if id || field
|
483
|
+
if id
|
484
|
+
codes = Open.read(File.join(path, 'codes')).collect{|l| l.chomp}
|
485
|
+
else
|
486
|
+
if field
|
487
|
+
R.GPL(platform, path, field[0])
|
488
|
+
FileUtils.mv path + '.codes', File.join(path, 'other')
|
489
|
+
end
|
442
490
|
|
491
|
+
fix = GEO::SOFT::ID_FIX[(organism + "_" + field[1].downcase).to_sym]
|
492
|
+
codes = Open.read(File.join(path, 'other')).collect{|l|
|
493
|
+
code = l.chomp
|
494
|
+
code = fix.call(code) if fix
|
495
|
+
code
|
496
|
+
}
|
497
|
+
end
|
498
|
+
|
499
|
+
biomart = translate(organism, codes)
|
500
|
+
Open.write(File.join(path, 'biomart'), biomart.collect{|v| v || "NO MATCH"}.join("\n")) if biomart.compact.length > codes.length.to_f / 10
|
443
501
|
end
|
444
|
-
end
|
445
502
|
|
503
|
+
# Select Best and save
|
504
|
+
translations = []
|
505
|
+
if ailun.compact.uniq.length > biomart.compact.uniq.length
|
506
|
+
id_type = ID::DEFAULT_FORMATS[organism] || ID::DEFAULT_FORMAT_ALL || id || field || "Entrez Gene Id"
|
507
|
+
if id_type.to_s !~ /Entrez/i
|
508
|
+
translations = translate(org,ailun.collect{|gene| gene || "NO MATCH"})
|
509
|
+
else
|
510
|
+
translations = ailun
|
511
|
+
end
|
512
|
+
else
|
513
|
+
translations = biomart
|
514
|
+
end
|
446
515
|
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
orders = Open.read(prefix + '.swap').collect{|l| l.chomp}
|
451
|
-
inverse_orders = Array.new(orders.length)
|
452
|
-
orders.each_with_index{|pos,i|
|
453
|
-
next if pos !~ /\d/
|
454
|
-
inverse_orders[pos.to_i] = i
|
455
|
-
}
|
456
|
-
rearange(inverse_orders, prefix + '.translations', "NO MATCH")
|
516
|
+
if translations.compact.length > codes.length.to_f / 10
|
517
|
+
Open.write(File.join(path, 'translations'), translations.collect{|v| v || "NO MATCH"}.join("\n"))
|
518
|
+
Open.write(File.join(path, 'cross_platform'), translations.compact.sort.uniq.join("\n"))
|
457
519
|
end
|
458
|
-
|
459
|
-
GEO.get_GSE(gsms, conditions, do_log, prefix + '_cross_platform', prefix + '.translations',fields, info[:title], info[:description])
|
460
|
-
fix_GSE_ids(File.join(platform_path(platform), 'cross_platform'),prefix + '_cross_platform');
|
461
|
-
FileUtils.cp(File.join(platform_path(platform), 'cross_platform'),prefix + '_cross_platform.codes')
|
462
|
-
FileUtils.rm(prefix + '.translations') if File.exist?(prefix + '.translations')
|
520
|
+
|
463
521
|
end
|
464
|
-
|
522
|
+
|
465
523
|
end
|
466
524
|
|
467
|
-
def self.process_platform(platform)
|
468
|
-
path = platform_path(platform)
|
469
|
-
return if File.exist? path
|
470
525
|
|
471
|
-
if platform =~ /_/
|
472
|
-
FileUtils.mkdir(path)
|
473
|
-
FileUtils.mkdir(path + '/GSE')
|
474
|
-
FileUtils.mkdir(path + '/GDS')
|
475
|
-
return
|
476
|
-
end
|
477
526
|
|
478
|
-
|
479
|
-
organism = info[:organism]
|
480
|
-
|
481
|
-
field = info[:other_ID_field]
|
482
|
-
id = info[:BioMart_ID]
|
483
|
-
org = info[:organism]
|
484
|
-
field = nil if field == ""
|
485
|
-
id = nil if id == ""
|
486
|
-
|
487
|
-
|
488
|
-
puts "Processing Platform #{ platform }"
|
489
|
-
[platform,
|
490
|
-
File.join(platform_path(platform), 'GDS'),
|
491
|
-
File.join(platform_path(platform), 'GSE'),
|
492
|
-
].each{|d|
|
493
|
-
FileUtils.mkdir d unless File.exist? d
|
494
|
-
}
|
527
|
+
#{{{ Local data store info
|
495
528
|
|
496
|
-
|
497
|
-
|
498
|
-
|
529
|
+
# Remove the first "_cross_platform" from a dataset or platform name.
# Returns nil when no name is given.
def self.clean(name)
  return nil unless name
  name.sub(/_cross_platform/,'')
end
|
499
532
|
|
500
|
-
# AILUN translations
|
501
|
-
codes = Open.read(File.join(platform_path(platform), 'codes')).collect{|l| l.chomp}
|
502
|
-
ailun = ID.AILUN_translate(platform, codes)
|
503
|
-
Open.write(File.join(platform_path(platform), 'ailun'), ailun.collect{|v| v || "NO MATCH"}.join("\n")) if ailun.compact.length > codes.length.to_f / 10
|
504
533
|
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
if id
|
509
|
-
codes = Open.read(File.join(platform_path(platform), 'codes')).collect{|l| l.chomp}
|
510
|
-
else
|
511
|
-
if field
|
512
|
-
get_GPL(platform, platform_path(platform), field[0])
|
513
|
-
FileUtils.mv platform_path(platform) + '.codes', File.join(platform_path(platform), 'other')
|
514
|
-
end
|
534
|
+
# Directory of a platform inside the local data store:
# <MARQ.datadir>/GEO/<platform name without "_cross_platform">.
def self.platform_path(platform)
  relative = "GEO/#{clean(platform)}"
  File.join(MARQ.datadir, relative)
end
|
515
537
|
|
516
|
-
fix = ID_FIX[(organism + "_" + field[1].downcase).to_sym]
|
517
|
-
codes = Open.read(File.join(platform_path(platform), 'other')).collect{|l|
|
518
|
-
code = l.chomp
|
519
|
-
code = fix.call(code) if fix
|
520
|
-
code
|
521
|
-
}
|
522
|
-
end
|
523
538
|
|
524
|
-
|
525
|
-
|
526
|
-
|
539
|
+
# Truthy when the dataset name contains "_cross_platform": returns the index
# of the match, or nil when there is none.
def self.is_cross_platform?(dataset)
  /_cross_platform/ =~ dataset
end
|
527
542
|
|
528
|
-
|
529
|
-
|
530
|
-
if
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
else
|
535
|
-
translations = ailun
|
536
|
-
end
|
543
|
+
# Tell whether cross-platform results exist.
#
# dataset  - optional dataset name. When given, the check is for the file
#            "<dataset_path>_cross_platform.orders".
# platform - optional platform name ("_cross_platform" is removed from it).
#            Without a dataset, the check is whether any
#            "*_cross_platform.orders" file exists one directory below the
#            platform directory.
#
# Raises "Dataset ... not found" / "Platform ... not found" when
# dataset_path / platform_path return nil for a given name.
def self.has_cross_platform?(dataset = nil, platform = nil)
  platform = clean(platform)
  raise "Dataset #{ dataset } not found" if dataset && dataset_path(dataset, platform).nil?
  raise "Platform #{ platform } not found" if platform && platform_path(platform).nil?
  if dataset
    # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
    File.exist?(dataset_path(dataset, platform) + "_cross_platform.orders")
  else
    Dir.glob(File.join(platform_path(platform), '*', '*_cross_platform.orders')).any?
  end
end
|
540
553
|
|
541
|
-
|
542
|
-
|
543
|
-
|
554
|
+
# Locate the files of a dataset in the local data store.
#
# dataset  - dataset name, e.g. the base name of "<dataset>.orders".
# platform - optional platform name; restricts the search to that platform's
#            directory. Without it every GEO/GPL* directory is searched.
#
# Returns the path of the first "<dataset>.<ext>" file found with its last
# extension removed, or nil when there is no such file.
def self.dataset_path(dataset, platform = nil)
  pattern = if platform
    # Same "<dataset>.*" pattern as the platform-less search: dataset files
    # always carry an extension, so the bare name would never match them.
    File.join(platform_path(clean(platform)), "*", "#{ dataset }.*")
  else
    File.join(MARQ.datadir, "GEO/GPL*/*/#{ dataset }.*")
  end

  files = Dir.glob(pattern)
  return nil if files.empty?
  return files.first.match(/(.*)\./)[1]
end
|
545
563
|
|
564
|
+
# Platforms in the local data store (directories matching GEO/GPL*) whose
# SOFT.GPL info names the given organism and that have at least one dataset
# according to platform_datasets.
def self.organism_platforms(organism)
  stored = Dir.glob(File.join(MARQ.datadir, "GEO/GPL*")).collect{|dir| File.basename(dir) }
  stored.select{|platform|
    SOFT.GPL(platform)[:organism] == organism && platform_datasets(platform).any?
  }
end
|
547
572
|
|
548
573
|
|
549
|
-
def self.process_platform_datasets(platform, force = false)
|
550
|
-
raise "Platform #{ platform } not ready" unless File.exist? platform_path(platform)
|
551
574
|
|
552
|
-
|
575
|
+
# Names of the datasets stored for a platform: the base names of the
# "*.orders" files one directory below the platform directory, without the
# extension and excluding the cross-platform variants.
def self.platform_datasets(platform)
  order_files = Dir.glob(File.join(platform_path(platform),"*/*.orders"))
  names = order_files.collect{|f| File.basename(f).sub(/.orders$/,'')}
  names.reject{|d| is_cross_platform?(d)}
end
|
553
578
|
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
process_GDS(dataset, platform, nil)
|
558
|
-
}
|
579
|
+
# Platform under which a dataset is stored locally, taken from the first
# GPL id found in its dataset_path.
#
# Merged platforms are stored in directories named "GPL<a>_GPL<b>...", so the
# whole chain of ids is returned rather than only the first one.
# Returns nil when the dataset is not found or its path holds no GPL id.
def self.dataset_platform(dataset)
  path = dataset_path(dataset)
  return nil if path.nil?
  path[/GPL\d+(?:_GPL\d+)*/]
end
|
560
583
|
|
584
|
+
# Read the title and description of a stored dataset from its
# "<dataset_path>.description" file, where the two parts are separated by a
# line containing only "--".
#
# Returns {:title => ..., :description => ...} with both values stripped.
# On any StandardError (dataset not found, unreadable or incomplete file)
# the message is printed and both values are empty strings. Interrupts and
# exit requests are no longer swallowed.
def self.GDS_info(name)
  title, description = Open.read(dataset_path(name) + '.description').split(/\n--\n/).values_at(0,1)
  {:title => title.strip, :description => description.strip}
rescue StandardError => e
  puts e.message
  {:title => "" , :description => "" }
end
|
594
|
+
|
595
|
+
|
561
596
|
end
|
562
597
|
|
563
|
-
if __FILE__ == $0
|
564
598
|
|
565
|
-
|
566
|
-
p GEO.GPL_id_fields('GPL920')
|
567
|
-
puts GEO.GSE_info('GSE962')
|
568
|
-
puts GEO.GSE_info('GSE8982')
|
569
|
-
puts GEO::Eutils.GSE_dataset?('GSE8982')
|
570
|
-
puts GEO::Eutils.GSE_dataset?('GSE962')
|
571
|
-
|
572
|
-
exit
|
573
|
-
|
574
|
-
#puts GEO::dataset_path('GDS1103').inspect
|
575
|
-
#puts GEO::dataset_platform('GDS1103').inspect
|
576
|
-
|
577
|
-
# puts GEO.dataset_path('GDS2931')
|
578
|
-
# puts GEO.platform_datasets('GPL91')
|
579
|
-
# puts GEO.platform_datasets('GPL91').select{|d| GEO.has_cross_platform?(d)}
|
580
|
-
#
|
581
|
-
# gpls = Open.read('ftp://ftp.ncbi.nih.gov/pub/geo/DATA/supplementary/PLATFORMS.txt').collect{|l|
|
582
|
-
# l.chomp.split.first
|
583
|
-
# }
|
584
|
-
#
|
585
|
-
# %w(GPL85).each{|gpl|
|
586
|
-
# puts gpl
|
587
|
-
# puts GEO::GPL_info(gpl).inspect if gpl =~ /GPL/
|
588
|
-
# }
|
589
|
-
#
|
590
|
-
#puts GEO::GSM_info('GSM70604').inspect
|
591
|
-
|
592
|
-
p GEO::Eutils.organism_platforms('human')
|
599
|
+
if __FILE__ == $0
|
593
600
|
|
594
601
|
end
|
602
|
+
|