rbbt-marq 1.1.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/R/GEO.R +6 -4
- data/R/MA.R +1 -0
- data/bin/marq_config +4 -3
- data/install_scripts/CustomDS/Rakefile +27 -215
- data/install_scripts/GEO/Rakefile +34 -275
- data/install_scripts/rake_includes.rb +236 -0
- data/lib/MARQ/CustomDS.rb +63 -32
- data/lib/MARQ/GEO.rb +99 -63
- data/lib/MARQ/MADB.rb +107 -202
- data/lib/MARQ/annotations.rb +124 -38
- data/lib/MARQ/main.rb +152 -160
- data/lib/MARQ/rankproduct.rb +20 -34
- data/tasks/install.rake +7 -2
- metadata +3 -2
data/install_scripts/rake_includes.rb
ADDED
@@ -0,0 +1,236 @@
+$expr_threshold ||= (ENV['threshold'] || 0.05).to_f
+$folds ||= (ENV['folds'] || 2.5).to_f
+$nth_genes ||= (ENV['nth_genes'] || 100).to_i
+
+$force = [$force, ENV['force'], false].compact.first.to_s == 'true'
+$tranlations = [$tranlations, ENV['translations'], false].compact.first.to_s == 'true'
+$series = [$series, ENV['series'], true].compact.first.to_s == 'true'
+$update_db = [$update_db, ENV['update_db'], false].compact.first.to_s == 'true'
+$skip_db = [$skip_db, ENV['skip_db'], false].compact.first.to_s == 'true'
+$fdr = [$fdr, ENV['fdr'], true].compact.first.to_s == 'true'
+$do_folds = [$do_folds, ENV['do_folds'], true].compact.first.to_s == 'true'
+
+
+$changes = false
+module GEO::Process::R
+  class << self
+    alias_method :GDS_old, :GDS
+    def GDS(*args)
+      $changes = true
+      GDS_old(*args)
+    end
+
+    alias_method :GSE_old, :GSE
+    def GSE(*args)
+      $changes = true
+      GSE_old(*args)
+    end
+  end
+end
+
+desc "Analyze datasets"
+task 'data' do
+
+  platforms_to_save = []
+
+  platforms = process_list
+  platforms.each{|platform, datasets|
+
+    begin
+      # Prepare the platform
+      MARQ::Platform.process(platform)
+    rescue
+      puts "Error processing platform #{platform}"
+      puts $!.message
+      puts $!.backtrace.join("\n")
+      next
+    end
+
+    next if $tranlations
+
+    $changes = false
+    # Process all datasets
+    datasets.each{|dataset|
+      begin
+        next unless $force || MARQ::Dataset.path(dataset).nil?
+        MARQ::Dataset.process(dataset, platform)
+      rescue
+        puts "Error processing dataset #{ dataset }"
+        puts $!.message
+        puts $!.backtrace.join("\n")
+      end
+    }
+
+    # Mark the platform for saving in DB
+    platforms_to_save << platform if $changes || $update_db
+  }
+
+  platforms_to_save.each{|platform|
+    begin
+      puts "Saving #{platform}"
+      MADB.save_platform(platform)
+    rescue
+      puts "Error saving platform #{ platform }"
+      puts $!.message
+      puts $!.backtrace.join("\n")
+    end
+  }
+end
+
+def annotations(name, cross_platform = false, &block)
+  platforms = process_list
+
+  platforms.each do |platform, datasets|
+    datasets.each do |dataset|
+      begin
+        next if File.exist?(File.join("annotations", name, dataset)) && ! $force
+        next if MARQ::Dataset.path(dataset).nil?
+
+        FileUtils.mkdir_p File.join("annotations", name)
+        filename = File.join("annotations", name, dataset)
+        dataset += '_cross_platform' if cross_platform && MARQ::Platform::has_cross_platform?(platform)
+        next if ! MARQ::Dataset.exists?(dataset)
+        terms = block.call(dataset)
+        Open.write(filename, terms.to_yaml)
+      rescue
+        puts "Error processing dataset #{ dataset }"
+        puts $!.message
+        puts $!.backtrace.join("\n")
+      end
+    end
+  end
+end
+
+
+task 'annotate_Words' do
+  require 'MARQ/annotations'
+  require 'rbbt/bow/bow'
+  annotations('Words') do |dataset|
+    terms = {}
+    description = Open.read(MARQ::Dataset.path(dataset) + '.description')
+    terms[:dataset] = [dataset] + description.words.uniq
+    Open.read(MARQ::Dataset.path(dataset) + '.experiments').collect{|name|
+      name = name.strip
+      terms[name] = name.sub(/.*?: /,'').sub(/\[ratio\]/,'').words.uniq
+    }
+    terms
+  end
+end
+
+
+task 'annotate_UMLS' do
+  require 'MARQ/annotations'
+  require 'rbbt/util/misc'
+  annotations('UMLS') do |dataset|
+    terms = {}
+    description = Open.read(MARQ::Dataset.path(dataset) + '.description')
+    terms[:dataset] = Annotations::UMLS::OBA(description).uniq
+    Open.read(MARQ::Dataset.path(dataset) + '.experiments').collect{|name|
+      name = name.strip
+      terms[name] = Annotations::UMLS::OBA(name.sub(/.*?: /,'').sub(/\[ratio\]/,'')).uniq
+    }
+    terms
+  end
+end
+
+
+task 'annotate_Polysearch' do
+  require 'MARQ/annotations'
+  require 'rbbt/util/misc'
+  require 'rbbt/sources/polysearch'
+  annotations('Polysearch') do |dataset|
+    terms = {}
+    description = Open.read(MARQ::Dataset.path(dataset) + '.description')
+    terms[:dataset] = Polysearch::match(description).values.flatten.sort.collect{|n| n.gsub(/\s+/,' ').downcase}.uniq
+    Open.read(MARQ::Dataset.path(dataset) + '.experiments').collect{|name|
+      name = name.strip
+      terms[name] = Polysearch::match(name.sub(/.*?: /,'').sub(/\[ratio\]/,'')).values.flatten.sort.collect{|n| n.gsub(/\s+/,' ').downcase}.uniq
+    }
+    terms
+  end
+
+end
+
+def goterms(org, list, slim, threshold)
+  return [] if list.empty?
+  results = Annotations::Genes::Genecodis::Local.analysis(org, list, slim)
+  return [] if results.nil?
+  results.
+    select{|info| info[:s].to_i > 2 }.
+    select{|info| info[:hyp_c].to_f < threshold }.
+    collect{|info| info[:items]}.collect{|id| GO::id2name(id)}
+end
+
+task 'annotate_GO' do
+  require 'MARQ/annotations'
+  require 'rbbt/sources/go'
+  options = { :cut_off => $expr_threshold, :fdr => $fdr, :folds => $folds, :do_folds => $do_folds, :nth_genes => $nth_genes}
+  annotations('GO_up', true) do |dataset|
+    org = MARQ::Dataset.organism(dataset)
+    genes = Annotations::Genes.get_genes(dataset, options)
+
+    up = {}
+    genes[:up] ||= []
+    genes[:up].collect{|experiment,list|
+      up[experiment] = goterms(org, list, false, $expr_threshold)
+    }
+    up
+  end
+
+  annotations('GO_down', true) do |dataset|
+    org = MARQ::Dataset.organism(dataset)
+    genes = Annotations::Genes.get_genes(dataset, options)
+
+    down = {}
+    genes[:down] ||= []
+    genes[:down].collect{|experiment,list|
+      down[experiment] = goterms(org, list, false, $expr_threshold)
+    }
+    down
+  end
+
+  annotations('GOSlim_up', true) do |dataset|
+    org = MARQ::Dataset.organism(dataset)
+    genes = Annotations::Genes.get_genes(dataset, options)
+
+    up = {}
+    genes[:up] ||= []
+    genes[:up].collect{|experiment,list|
+      up[experiment] = goterms(org, list, true, $expr_threshold)
+    }
+    up
+  end
+
+  annotations('GOSlim_down', true) do |dataset|
+    org = MARQ::Dataset.organism(dataset)
+    genes = Annotations::Genes.get_genes(dataset, options)
+
+    down = {}
+    genes[:down] ||= []
+    genes[:down].collect{|experiment,list|
+      down[experiment] = goterms(org, list, true, $expr_threshold)
+    }
+    down
+  end
+end
+
+task 'annotate_SENT' do
+  require 'MARQ/annotations'
+  options = { :cut_off => $expr_threshold, :fdr => $fdr, :folds => $folds, :do_folds => $do_folds, :nth_genes => $nth_genes}
+  annotations('SENT') do |dataset|
+    org = MARQ::Dataset.organism(dataset)
+    genes = Annotations::Genes.get_genes(dataset, options)
+    terms = Annotations::Genes::SENT.terms(org, genes)
+    terms
+  end
+
+
+end
+
+task 'default' do
+  Rake::Task['data'].invoke
+  Rake::Task['annotate_Words'].invoke
+  Rake::Task['annotate_UMLS'].invoke
+  Rake::Task['annotate_Polysearch'].invoke
+  Rake::Task['annotate_GO'].invoke
+end
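Note on the option block at the top of rake_includes.rb: every boolean flag is resolved the same way, with an already-set global winning over the corresponding environment variable, which in turn wins over the hard-coded default, and the winner coerced through to_s == 'true'. A minimal sketch of that chain in isolation (the $verbose / verbose names are invented for illustration and are not part of the gem):

# Hypothetical flag, showing the [global, ENV, default] chain used above
# for $force, $series, $update_db, $fdr and friends.
$verbose = [$verbose, ENV['verbose'], false].compact.first.to_s == 'true'

# ENV values arrive as strings, so only the literal string 'true' enables a flag:
#   rake data verbose=true   -> ENV['verbose'] == 'true' -> $verbose is true
#   rake data                -> default false            -> $verbose is false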
data/lib/MARQ/CustomDS.rb
CHANGED
@@ -5,9 +5,8 @@ require 'MARQ/ID'
 module CustomDS
   @@r = nil
 
-
-
-end
+  DATA_DIR = File.join(MARQ.datadir,'CustomDS')
+
 
   def self.r
     require 'rsruby'
@@ -35,33 +34,12 @@ module CustomDS
     end
   end
 
-  def self.path(dataset)
-    files = Dir.glob(customdir + "/*/#{ dataset }.orders")
-    if files.length == 1
-      files.first.sub(/.orders/,'')
-    else
-      Dir.glob(customdir + "/*/#{ dataset }").first
-    end
-  end
-
   def self.organism(dataset)
-    path(dataset).match(/#{
-  end
-
-  def self.is_cross_platform?(dataset)
-    dataset.match(/_cross_platform/)
-  end
-
-  def self.clean(dataset)
-    dataset.sub(/_cross_platform/,'')
-  end
-
-  def self.has_cross_platform?(dataset)
-    Dir.glob(path(clean(dataset)) + '_cross_platform.orders').any?
+    path(dataset).match(/#{ DATA_DIR }\/(.*?)\//)[1]
   end
 
   def self.datasets(org)
-    Dir.glob(File.join(
+    Dir.glob(File.join(DATA_DIR, org) + '/*.orders').collect{|f| clean(File.basename(f.sub(/.orders/,'')))}.uniq
   end
 
   def self.process_matrix(prefix, org)
@@ -82,9 +60,62 @@ module CustomDS
   end
 
   def self.process(name)
-
-
-
+  end
+
+  def self.organisms
+    Dir.glob(File.join(DATA_DIR, '*')).
+      select {|path| File.directory? path}.
+      collect {|path| File.basename path}
+  end
+
+  def self.dataset_path(dataset)
+    organisms.each do |organism|
+      case
+      when File.exists?(File.join(DATA_DIR, organism, dataset + '.orders'))
+        return File.join(DATA_DIR, organism, dataset)
+      when File.exists?(File.join(DATA_DIR, organism, dataset + '.skip'))
+        return nil
+      end
+    end
+    return nil
+  end
+
+  def self.platform_path(platform)
+    dataset_path(platform)
+  end
+
+  def self.platform_datasets(platform)
+    MARQ::Dataset.clean(platform)
+  end
+
+  def self.platform_organism(platform)
+    path = platform_path(platform)
+    return nil if path.nil?
+    path.match(/#{DATA_DIR}\/(.*)\/#{ platform }$/)
+    return $1
+  end
+
+  def self.dataset_organism(dataset)
+    platform_organism(dataset)
+  end
+
+  def self.dataset_platform(dataset)
+    dataset
+  end
+
+  def self.organism_platforms(organism)
+    Dir.glob(File.join(DATA_DIR,organism,'*.orders')).
+      collect {|path| File.basename(path).sub(/\.orders$/,'').sub(/_cross_platform/,'')}.
+      uniq
+  end
+
+  def self.process_platform(platform)
+  end
+
+  def self.process_dataset(dataset, platform = nil)
+    puts "Processing #{ dataset }"
+    org = dataset_organism(dataset)
+    prefix = File.join(DATA_DIR, org, dataset)
 
     CustomDS::process_matrix(prefix, org)
   end
@@ -94,13 +125,13 @@ end
 
 if __FILE__ == $0
   p CustomDS::datasets('sgd')
-  p CustomDS::
-  p CustomDS::
+  p CustomDS::dataset_path('HaploidData')
+  p CustomDS::dataset_path('HaploidData_cross_platform')
 
   exit
 
   org = 'sgd'
-  process = Dir.glob(File.join(CustomDS::
+  process = Dir.glob(File.join(CustomDS::DATA_DIR, org) + '/*').select{|f| File.directory? f}.collect{|f| File.basename(f)} - CustomDS.datasets('sgd')
   p process
   process.each{|d| CustomDS::process(d)}
 
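The new CustomDS helpers above assume a CustomDS/<organism>/<dataset>.orders layout, with a <dataset>.skip file acting as an explicit "do not process" marker that makes dataset_path return nil. A self-contained sketch of that lookup against an invented temporary layout (directory and dataset names below are made up for illustration):

require 'tmpdir'
require 'fileutils'

# Invented layout, mirroring what CustomDS.dataset_path expects:
#   <data_dir>/<organism>/<dataset>.orders  -> dataset is available
#   <data_dir>/<organism>/<dataset>.skip    -> dataset is explicitly skipped
data_dir = File.join(Dir.mktmpdir, 'CustomDS')
FileUtils.mkdir_p File.join(data_dir, 'sgd')
FileUtils.touch File.join(data_dir, 'sgd', 'HaploidData.orders')
FileUtils.touch File.join(data_dir, 'sgd', 'BrokenData.skip')

def dataset_path(data_dir, dataset)
  # Scan each organism directory; .orders wins, .skip short-circuits to nil
  Dir.glob(File.join(data_dir, '*')).select {|p| File.directory? p }.each do |org_dir|
    return File.join(org_dir, dataset) if File.exist?(File.join(org_dir, "#{dataset}.orders"))
    return nil if File.exist?(File.join(org_dir, "#{dataset}.skip"))
  end
  nil
end

p dataset_path(data_dir, 'HaploidData')  # => ".../CustomDS/sgd/HaploidData"
p dataset_path(data_dir, 'BrokenData')   # => nil (skip marker found)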
data/lib/MARQ/GEO.rb
CHANGED
@@ -1,45 +1,56 @@
 require 'MARQ'
+require 'MARQ/main'
 require 'rbbt/sources/organism'
 
 # Work with GEO datasets
 module GEO
 
+  CACHE_DIR = File.join(MARQ.cachedir,'GEO')
+  FileUtils.mkdir_p CACHE_DIR unless File.exists? CACHE_DIR
+
+  DATA_DIR = File.join(MARQ.datadir, 'GEO')
+
   # Get information from Entrez
   module Remote
 
+    @@nice = 1
     def self.organism_platforms(org)
       name = Organism.name(org)
-      Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=\"#{name}\"[Organism:exp]+AND+%22gpl%22[Filter]&retmax=10000").
+      Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=\"#{name}\"[Organism:exp]+AND+%22gpl%22[Filter]&retmax=10000", :nice => @@nice).
        scan(/<Id>(\d+?)<\/Id>/).collect{|id| id.first}.collect{|id| "GPL#{id.sub(/^100*/,'')}"}
     end
 
     def self.platform_datasets(platform)
-      Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=#{platform}[Accession]&retmax=2000").
+      Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=#{platform}[Accession]&retmax=2000", :nice => @@nice).
        scan(/<Id>(\d+?)<\/Id>/).collect{|id| id.first}.select{|id| !id.match(/^(1|2)000/) }.collect{|id| "GDS#{id}"}
     end
 
     def self.dataset_platform(dataset)
       if dataset =~ /GSE/
-        Open.read("http://www.ncbi.nlm.nih.gov/projects/geo/query/acc.cgi?acc=#{dataset}").scan(/GPL\d+/).uniq.sort.join("_")
+        Open.read("http://www.ncbi.nlm.nih.gov/projects/geo/query/acc.cgi?acc=#{dataset}", :nice => @@nice).scan(/GPL\d+/).uniq.sort.join("_")
       else
-        Open.read("http://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=#{dataset}").scan(/GPL\d+/).uniq.sort.join("_")
+        Open.read("http://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=#{dataset}", :nice => @@nice).scan(/GPL\d+/).uniq.sort.join("_")
       end
     end
 
     def self.series_dataset?(gse)
-      Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=geo&term=#{gse}[Accession]&retmax=2000").
+      Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=geo&term=#{gse}[Accession]&retmax=2000", :nice => @@nice).
        match(/<Id>(\d+?)<\/Id>/) != nil
     end
 
+    def self.platform_organism(platform)
+      Open.read("http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=#{platform}", :nice => @@nice).
+        match(%r#<td><a href="http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi\?mode=Info&id=\d+" onmouseout="onLinkOut\('HelpMessage' , geo_empty_help\)" onmouseover="onLinkOver\('HelpMessage' , geoaxema_organismus\)">(.*)</a></td>#)[1]
+    end
+
   end
 
-  CACHE_DIR = File.join(MARQ.cachedir,'GEO')
-  FileUtils.mkdir_p CACHE_DIR unless File.exists? CACHE_DIR
 
 
   # Parse information in .soft files
   module SOFT
 
+    @@nice = 1
    GEO_SOFT="http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&view=full&form=text&acc="
 
    # Download a soft file. Uses cache
@@ -49,7 +60,7 @@ module GEO
       if File.exist?( cache_file )
         File.open(cache_file).read
       else
-        content = Open.read(GEO_SOFT + item, :nocache => true)
+        content = Open.read(GEO_SOFT + item, :nocache => true, :nice => @@nice)
         raise "SOFT file error" if content !~ /!/
         fout = File.open(cache_file,'w')
         fout.write content
@@ -105,7 +116,7 @@ module GEO
       soft = get_soft(series)
 
       if match = soft.scan(/!Series_platform_id\s*=?\s*(.*)/)
-        platform = match.flatten.collect{|p| p.strip}
+        platform = match.flatten.collect{|p| p.strip}.join("_")
       else
         raise "No Platform information"
       end
@@ -131,7 +142,7 @@ module GEO
       end
 
       {
-        :platform => platform
+        :platform => platform,
        :description =>description.strip,
        :title => title.strip,
        :samples => samples,
@@ -162,8 +173,8 @@ module GEO
     end
 
     def self.GPL(platform)
-      if !File.exist?(File.join(
-
+      if !File.exist?(File.join(DATA_DIR, 'platforms',"#{platform}.yaml")) &&
+         !File.exist?(File.join(DATA_DIR, 'platforms',"#{platform}.skip"))
         begin
           if platform =~ /_/
             organism = GPL(platform.match(/(.*?)_/)[1])[:organism]
@@ -230,17 +241,17 @@ module GEO
           info[:other_ID_field] = [other_pos + 1, other_name] if other_pos > 0
 
 
-          Open.write(File.join(
+          Open.write(File.join(DATA_DIR, 'platforms',"#{platform}.yaml"), info.to_yaml)
         rescue Exception
           puts $!.message
           puts $!.backtrace
-          Open.write(File.join(
+          Open.write(File.join(DATA_DIR, 'platforms',"#{platform}.skip"), $!.message)
         end
       end
 
       raise "Platform info for #{ platform } is not available and could not be automatically produced." if File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
 
-      YAML::load(File.open(File.join(
+      YAML::load(File.open(File.join(DATA_DIR, 'platforms',"#{platform}.yaml")))
     end
 
 
@@ -365,6 +376,8 @@ module GEO
       if File.exist?(File.join(platform,'cross_platform'))
         puts "-- Translated to cross_platform format"
         R.GDS(dataset, prefix + '_cross_platform', field, File.join(platform_path, 'translations'))
+      else
+        puts "No cross_platform probe ids for platform"
       end
     end
 
@@ -388,10 +401,11 @@ module GEO
       do_log = !info[:log2] if info[:log2]
       fields = info[:fields]
 
-      puts "Processing GSE #{ series }. Platform #{ platform }"
-
       platform_path = GEO::platform_path(platform)
+      return if platform_path.nil?
       prefix = File.join(platform_path, 'GSE', series.to_s)
+
+      puts "Processing GSE #{ series }. Platform #{ platform }"
       puts "-- Original"
       R.GSE(gsms, conditions, do_log, prefix, nil, fields, info[:title], info[:description])
 
@@ -433,6 +447,8 @@ module GEO
         fix_GSE_ids(File.join(platform_path, 'cross_platform'),prefix + '_cross_platform');
         FileUtils.cp(File.join(platform_path, 'cross_platform'),prefix + '_cross_platform.codes')
         FileUtils.rm(prefix + '.translations') if File.exist?(prefix + '.translations')
+      else
+        puts "No cross_platform probe ids for platform"
       end
       FileUtils.rm(prefix + '.swap') if File.exist?(prefix + '.swap')
     end
@@ -441,7 +457,7 @@ module GEO
     # system (called biomart for clarity)
     def self.GPL(platform)
       path = GEO::platform_path(platform)
-      return if File.exist?
+      return if path.nil? || File.exist?(path)
 
       if platform =~ /_/
         FileUtils.mkdir(path)
@@ -522,81 +538,101 @@ module GEO
 
   end
 
+  def self.platforms
+    Dir.glob(File.join(DATA_DIR, "GPL*")).collect {|path| File.basename(path) }
+  end
 
 
-
-
-
-
+  def self.dataset_type(dataset)
+    case
+    when dataset =~ /^GDS/
+      :GDS
+    when dataset =~ /^GSE/
+      :GSE
+    end
   end
 
-
   def self.platform_path(platform)
-    File.join(
+    path = File.join(DATA_DIR, platform)
+    path = nil unless File.exists? path
+    path
   end
 
+  def self.dataset_path(dataset, platform = nil)
+    if platform
+      platforms = [platform]
+    else
+      platforms = self.platforms
+    end
 
-
-
-
+    platforms.each do |platform|
+      platform_path = platform_path(platform)
+      next if platform_path.nil?
 
-
-
-
-
-
-
-
-    Dir.glob(File.join(platform_path(platform), '*', '*_cross_platform.orders')).any?
+      prefix = File.join(platform_path, dataset_type(dataset).to_s, dataset)
+      case
+      when File.exists?(prefix + '.orders')
+        return File.join(platform_path, dataset_type(dataset).to_s, dataset)
+      when File.exists?(prefix + '.skip')
+        return nil
+      end
    end
+
+    return nil
  end

-  def self.
-
-
+  def self.platform_datasets(platform)
+    cross_platform = MARQ::Platform.is_cross_platform? platform
+
+    path = platform_path(MARQ::Platform.clean(platform))
+    return [] if path.nil?
+
+    datasets = Dir.glob(File.join(path, '*', '*.orders')).
+      collect {|path| File.basename(path).sub(/\.orders$/,'')}
+
+    if cross_platform
+      datasets.select {|dataset| MARQ::Dataset.is_cross_platform? dataset }.
+        collect {|dataset| MARQ::Dataset.clean(dataset) }
    else
-
+      datasets.select {|dataset| ! MARQ::Dataset.is_cross_platform? dataset }
    end
-    return nil if files.empty?
-    return files.first.match(/(.*)\./)[1]
  end

-  def self.
-
-
-
-
-    platform_datasets(platform).any?
-    }
+  def self.dataset_platform(dataset)
+    path = dataset_path(dataset)
+    return nil if path.nil?
+    path.match(/(GPL\d+)/)
+    return $1
  end

+  def self.platform_organism(platform)
+    GEO::SOFT.GPL(platform)[:organism]
+  end
 
-
-
-    Dir.glob(File.join(platform_path(platform),"*/*.orders")).collect{|f| File.basename(f).sub(/.orders$/,'')}.select{|d| !is_cross_platform?(d)}
+  def self.dataset_organism(dataset)
+    platform_organism(dataset_platform(dataset))
  end

-  def self.
-
-    $1
+  def self.process_platform(platform)
+    GEO::Process.GPL(platform)
  end

-  def self.
-
-
-
-
-
-    {:
+  def self.process_dataset(dataset, platform)
+    case dataset_type(dataset)
+    when :GDS
+      GEO::Process.GDS(dataset, platform)
+    when :GSE
+      info = YAML::load(File.open("series/#{ dataset }.yaml"))
+      FileUtils.rm("platforms/#{ info[:platform] }.skip") if File.exist? "platforms/#{ info[:platform] }.skip"
+      GEO::Process.GSE(dataset, info)
    end
-
  end

-
 end
 
 
 if __FILE__ == $0
+  p GEO.dataset_path 'GDS2791_cross_platform', 'GPL96'
 
 end
 
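GEO's rewritten lookup follows a different convention than CustomDS: datasets live under a platform directory, inside a GDS or GSE subdirectory chosen by dataset_type, again with .orders marking availability and .skip marking exclusion. A hedged sketch of that lookup against an invented layout (paths and accession numbers below are illustrative only):

require 'tmpdir'
require 'fileutils'

# Invented layout, mirroring what the rewritten GEO.dataset_path expects:
#   <data_dir>/GPL96/GDS/GDS2791.orders  -> dataset available under platform GPL96
#   <data_dir>/GPL96/GSE/GSE1234.skip    -> dataset explicitly skipped
data_dir = File.join(Dir.mktmpdir, 'GEO')
FileUtils.mkdir_p File.join(data_dir, 'GPL96', 'GDS')
FileUtils.mkdir_p File.join(data_dir, 'GPL96', 'GSE')
FileUtils.touch File.join(data_dir, 'GPL96', 'GDS', 'GDS2791.orders')
FileUtils.touch File.join(data_dir, 'GPL96', 'GSE', 'GSE1234.skip')

def dataset_type(dataset)
  case dataset
  when /^GDS/ then :GDS
  when /^GSE/ then :GSE
  end
end

def dataset_path(data_dir, dataset)
  # Scan every platform directory; the GDS/GSE subdirectory is picked by accession type
  Dir.glob(File.join(data_dir, 'GPL*')).each do |platform_dir|
    prefix = File.join(platform_dir, dataset_type(dataset).to_s, dataset)
    return prefix if File.exist?(prefix + '.orders')
    return nil    if File.exist?(prefix + '.skip')  # explicit "do not process" marker
  end
  nil
end

p dataset_path(data_dir, 'GDS2791')  # => ".../GEO/GPL96/GDS/GDS2791"
p dataset_path(data_dir, 'GSE1234')  # => nil (skip marker found)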