rbbt-marq 1.1.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,236 @@
1
# Numeric settings. A value already assigned to the global wins; otherwise the
# environment variable is used, and finally the hard-coded default.
$expr_threshold ||= ENV.fetch('threshold', 0.05).to_f
$folds          ||= ENV.fetch('folds', 2.5).to_f
$nth_genes      ||= ENV.fetch('nth_genes', 100).to_i

# Boolean settings. The first non-nil value among the current global, the
# environment variable and the default is chosen, and the flag is true only
# when that value prints as the string 'true'.
resolve_flag = lambda do |current, env_name, default|
  [current, ENV[env_name], default].compact.first.to_s == 'true'
end

$force       = resolve_flag.call($force,       'force',        false)
# NOTE: the global is spelled `$tranlations` (sic) while the environment
# variable is `translations`; task 'data' reads the global under this
# spelling, so it is kept as is.
$tranlations = resolve_flag.call($tranlations, 'translations', false)
$series      = resolve_flag.call($series,      'series',       true)
$update_db   = resolve_flag.call($update_db,   'update_db',    false)
$skip_db     = resolve_flag.call($skip_db,     'skip_db',      false)
$fdr         = resolve_flag.call($fdr,         'fdr',          true)
$do_folds    = resolve_flag.call($do_folds,    'do_folds',     true)


# Set to true by the GEO::Process::R wrappers whenever GDS or GSE is called.
$changes = false
15
# Wrap the GDS and GSE entry points of GEO::Process::R so that every call
# records, through the $changes global, that something was processed. The
# original implementations stay reachable as GDS_old and GSE_old.
module GEO::Process::R
  class << self
    [:GDS, :GSE].each do |entry_point|
      original = :"#{entry_point}_old"
      alias_method original, entry_point

      define_method(entry_point) do |*args|
        $changes = true
        send(original, *args)
      end
    end
  end
end
30
+
31
desc "Analyze datasets"
task 'data' do
  # Platforms whose data has to be written to the database at the end.
  platforms_to_save = []

  process_list.each do |platform, datasets|
    # Prepare the platform; on failure report it and skip its datasets.
    begin
      MARQ::Platform.process(platform)
    rescue => error
      puts "Error processing platform #{platform}"
      puts error.message
      puts error.backtrace.join("\n")
      next
    end

    # With the translations flag set, only the platform step is performed.
    next if $tranlations

    # Reset the marker; the GEO::Process::R wrappers flip it to true when
    # a dataset is actually processed.
    $changes = false

    # Process all datasets; one failing dataset does not stop the others.
    datasets.each do |dataset|
      begin
        # Skip datasets that already have a path, unless forced.
        next if !$force && !MARQ::Dataset.path(dataset).nil?
        MARQ::Dataset.process(dataset, platform)
      rescue => error
        puts "Error processing dataset #{ dataset }"
        puts error.message
        puts error.backtrace.join("\n")
      end
    end

    # Mark the platform for saving in DB
    platforms_to_save << platform if $changes || $update_db
  end

  platforms_to_save.each do |platform|
    begin
      puts "Saving #{platform}"
      MADB.save_platform(platform)
    rescue => error
      puts "Error saving platform #{ platform }"
      puts error.message
      puts error.backtrace.join("\n")
    end
  end
end
79
+
80
# Run an annotation block over every dataset returned by process_list and
# store each result as YAML in annotations/<name>/<dataset>.
#
# name           - annotation type; used as the output directory name.
# cross_platform - when true, and the platform has a cross-platform version,
#                  the block receives "<dataset>_cross_platform" instead.
#                  The output file keeps the plain dataset name.
# block          - called with the dataset name; its result is serialised
#                  with to_yaml.
#
# Datasets are skipped when the output file already exists (unless $force),
# when MARQ::Dataset.path returns nil, or when MARQ::Dataset.exists? is
# false. Errors are printed per dataset and do not abort the run.
def annotations(name, cross_platform = false, &block)
  process_list.each do |platform, datasets|
    datasets.each do |dataset|
      begin
        output_dir = File.join("annotations", name)
        filename   = File.join(output_dir, dataset)

        next if !$force && File.exist?(filename)
        next if MARQ::Dataset.path(dataset).nil?

        FileUtils.mkdir_p output_dir

        # From here on `dataset` may name the cross-platform variant.
        if cross_platform && MARQ::Platform::has_cross_platform?(platform)
          dataset += '_cross_platform'
        end
        next unless MARQ::Dataset.exists?(dataset)

        Open.write(filename, block.call(dataset).to_yaml)
      rescue => error
        puts "Error processing dataset #{ dataset }"
        puts error.message
        puts error.backtrace.join("\n")
      end
    end
  end
end
103
+
104
+
105
# Annotate every dataset with the words of its description and of each of
# its experiment names. The result maps :dataset to the dataset name plus
# the unique description words, and each experiment name to its own unique
# words.
task 'annotate_Words' do
  require 'MARQ/annotations'
  require 'rbbt/bow/bow' # presumably what provides String#words -- confirm
  annotations('Words') do |dataset|
    path  = MARQ::Dataset.path(dataset)
    terms = {}

    description = Open.read(path + '.description')
    terms[:dataset] = [dataset] + description.words.uniq

    # One experiment name per line. each_line is used instead of calling
    # collect on the String, which is only available on Ruby 1.8.
    Open.read(path + '.experiments').each_line do |line|
      name = line.strip
      # Drop the leading "<prefix>: " and any "[ratio]" marker.
      terms[name] = name.sub(/.*?: /,'').sub(/\[ratio\]/,'').words.uniq
    end

    terms
  end
end
119
+
120
+
121
# Annotate every dataset with the terms that Annotations::UMLS::OBA returns
# for its description (stored under :dataset) and for each experiment name
# (stored under that name).
task 'annotate_UMLS' do
  require 'MARQ/annotations'
  require 'rbbt/util/misc'
  annotations('UMLS') do |dataset|
    path  = MARQ::Dataset.path(dataset)
    terms = {}

    description = Open.read(path + '.description')
    terms[:dataset] = Annotations::UMLS::OBA(description).uniq

    # One experiment name per line. each_line is used instead of calling
    # collect on the String, which is only available on Ruby 1.8.
    Open.read(path + '.experiments').each_line do |line|
      name = line.strip
      # Drop the leading "<prefix>: " and any "[ratio]" marker first.
      terms[name] = Annotations::UMLS::OBA(name.sub(/.*?: /,'').sub(/\[ratio\]/,'')).uniq
    end

    terms
  end
end
135
+
136
+
137
# Annotate every dataset with the Polysearch matches found in its
# description (stored under :dataset) and in each experiment name (stored
# under that name).
task 'annotate_Polysearch' do
  require 'MARQ/annotations'
  require 'rbbt/util/misc'
  require 'rbbt/sources/polysearch'

  # Flatten and sort the values of a Polysearch match, collapse whitespace
  # runs, lower-case each entry and remove duplicates.
  matched_terms = lambda do |text|
    Polysearch::match(text).values.flatten.sort.collect { |n| n.gsub(/\s+/,' ').downcase }.uniq
  end

  annotations('Polysearch') do |dataset|
    path  = MARQ::Dataset.path(dataset)
    terms = {}

    description = Open.read(path + '.description')
    terms[:dataset] = matched_terms.call(description)

    # One experiment name per line. each_line is used instead of calling
    # collect on the String, which is only available on Ruby 1.8.
    Open.read(path + '.experiments').each_line do |line|
      name = line.strip
      # Drop the leading "<prefix>: " and any "[ratio]" marker first.
      terms[name] = matched_terms.call(name.sub(/.*?: /,'').sub(/\[ratio\]/,''))
    end

    terms
  end

end
153
+
154
# Names of the GO terms reported by the local Genecodis analysis for a gene
# list.
#
# org       - organism passed through to the analysis.
# list      - genes to analyse; an empty list yields [] without any analysis.
# slim      - passed through to the analysis (the callers use it to choose
#             between the 'GO' and 'GOSlim' annotations).
# threshold - a result is kept only if its :hyp_c value is below this.
#
# A result is also required to have an :s value greater than 2 (presumably
# the number of supporting genes -- confirm). Returns [] when the analysis
# returns nil.
def goterms(org, list, slim, threshold)
  return [] if list.empty?

  results = Annotations::Genes::Genecodis::Local.analysis(org, list, slim)
  return [] if results.nil?

  significant = results.select do |info|
    info[:s].to_i > 2 && info[:hyp_c].to_f < threshold
  end

  significant.map { |info| GO::id2name(info[:items]) }
end
163
+
164
# Produce the four GO annotation sets: full GO and GOSlim, each for the up
# and the down regulated genes of every experiment. All of them use the
# cross-platform version of a dataset when the platform has one.
task 'annotate_GO' do
  require 'MARQ/annotations'
  require 'rbbt/sources/go'
  options = { :cut_off => $expr_threshold, :fdr => $fdr, :folds => $folds, :do_folds => $do_folds, :nth_genes => $nth_genes}

  # annotation name, key into the gene lists, slim flag for goterms
  [
    ['GO_up',       :up,   false],
    ['GO_down',     :down, false],
    ['GOSlim_up',   :up,   true],
    ['GOSlim_down', :down, true]
  ].each do |annotation, direction, slim|
    annotations(annotation, true) do |dataset|
      org   = MARQ::Dataset.organism(dataset)
      genes = Annotations::Genes.get_genes(dataset, options)

      # A missing direction is treated as having no experiments.
      genes[direction] ||= []

      enriched = {}
      genes[direction].each do |experiment, list|
        enriched[experiment] = goterms(org, list, slim, $expr_threshold)
      end
      enriched
    end
  end
end
216
+
217
# Annotate every dataset with the terms that Annotations::Genes::SENT.terms
# returns for its organism and selected genes.
task 'annotate_SENT' do
  require 'MARQ/annotations'
  options = { :cut_off => $expr_threshold, :fdr => $fdr, :folds => $folds, :do_folds => $do_folds, :nth_genes => $nth_genes}

  annotations('SENT') do |dataset|
    Annotations::Genes::SENT.terms(
      MARQ::Dataset.organism(dataset),
      Annotations::Genes.get_genes(dataset, options)
    )
  end


end
229
+
230
# Process the data and then run the Words, UMLS, Polysearch and GO
# annotation tasks, in that order. annotate_SENT is not part of the default
# run.
task 'default' do
  %w[data annotate_Words annotate_UMLS annotate_Polysearch annotate_GO].each do |task_name|
    Rake::Task[task_name].invoke
  end
end
data/lib/MARQ/CustomDS.rb CHANGED
@@ -5,9 +5,8 @@ require 'MARQ/ID'
5
5
  module CustomDS
6
6
  @@r = nil
7
7
 
8
- def self.customdir
9
- File.join(MARQ.datadir,'CustomDS')
10
- end
8
+ DATA_DIR = File.join(MARQ.datadir,'CustomDS')
9
+
11
10
 
12
11
  def self.r
13
12
  require 'rsruby'
@@ -35,33 +34,12 @@ module CustomDS
35
34
  end
36
35
  end
37
36
 
38
- def self.path(dataset)
39
- files = Dir.glob(customdir + "/*/#{ dataset }.orders")
40
- if files.length == 1
41
- files.first.sub(/.orders/,'')
42
- else
43
- Dir.glob(customdir + "/*/#{ dataset }").first
44
- end
45
- end
46
-
47
37
  def self.organism(dataset)
48
- path(dataset).match(/#{ customdir }\/(.*?)\//)[1]
49
- end
50
-
51
- def self.is_cross_platform?(dataset)
52
- dataset.match(/_cross_platform/)
53
- end
54
-
55
- def self.clean(dataset)
56
- dataset.sub(/_cross_platform/,'')
57
- end
58
-
59
- def self.has_cross_platform?(dataset)
60
- Dir.glob(path(clean(dataset)) + '_cross_platform.orders').any?
38
+ path(dataset).match(/#{ DATA_DIR }\/(.*?)\//)[1]
61
39
  end
62
40
 
63
41
  def self.datasets(org)
64
- Dir.glob(File.join(customdir, org) + '/*.orders').collect{|f| clean(File.basename(f.sub(/.orders/,'')))}.uniq
42
+ Dir.glob(File.join(DATA_DIR, org) + '/*.orders').collect{|f| clean(File.basename(f.sub(/.orders/,'')))}.uniq
65
43
  end
66
44
 
67
45
  def self.process_matrix(prefix, org)
@@ -82,9 +60,62 @@ module CustomDS
82
60
  end
83
61
 
84
62
  def self.process(name)
85
- puts "Processing #{ name }"
86
- org = organism(name)
87
- prefix = File.join(customdir, org, name)
63
+ end
64
+
65
# Names of the organisms with custom data: the base name of every directory
# found directly under DATA_DIR.
def self.organisms
  entries     = Dir.glob(File.join(DATA_DIR, '*'))
  directories = entries.select { |entry| File.directory?(entry) }
  directories.map { |directory| File.basename(directory) }
end
70
+
71
# Path prefix (DATA_DIR/<organism>/<dataset>, without extension) of a custom
# dataset, or nil when it is not available.
#
# Organisms are examined in the order given by `organisms`. The search ends
# at the first organism that has either a '<dataset>.orders' file (the
# prefix is returned) or a '<dataset>.skip' file (nil is returned). nil is
# also returned when no organism has either file.
def self.dataset_path(dataset)
  organisms.each do |organism|
    prefix = File.join(DATA_DIR, organism, dataset)

    # File.exist? rather than the deprecated File.exists?, which no longer
    # exists in current Ruby versions.
    return prefix if File.exist?(prefix + '.orders')
    return nil    if File.exist?(prefix + '.skip')
  end

  nil
end
82
+
83
# Path of a custom platform. It is resolved exactly like a dataset of the
# same name, so the result is whatever dataset_path returns for it.
def self.platform_path(platform)
  self.dataset_path(platform)
end
86
+
87
# Datasets of a custom platform: the platform name as cleaned by
# MARQ::Dataset.clean.
# NOTE(review): this returns that single value rather than a list of names --
# confirm that callers expect this.
def self.platform_datasets(platform)
  cleaned = MARQ::Dataset.clean(platform)
  cleaned
end
90
+
91
# Organism of a custom platform, taken from the directory that sits between
# DATA_DIR and the platform name in the platform path.
#
# Returns nil when the platform has no path, or when the path does not have
# the form DATA_DIR/<organism>/<platform>.
def self.platform_organism(platform)
  path = platform_path(platform)
  return nil if path.nil?

  # Both interpolated pieces are escaped so that regular expression
  # metacharacters in the data directory or in the platform name are matched
  # literally.
  layout = /#{ Regexp.escape(DATA_DIR) }\/(.*)\/#{ Regexp.escape(platform) }$/
  found  = path.match(layout)
  found && found[1]
end
97
+
98
# Organism of a custom dataset. The lookup is the same as for a platform of
# that name, so it is delegated to platform_organism.
def self.dataset_organism(dataset)
  self.platform_organism(dataset)
end
101
+
102
# Platform of a custom dataset: the dataset name itself is returned
# unchanged.
def self.dataset_platform(dataset)
  platform = dataset
  platform
end
105
+
106
# Custom platforms available for an organism: the unique base names of the
# '*.orders' files in DATA_DIR/<organism>, with the '.orders' extension and
# any '_cross_platform' marker removed.
def self.organism_platforms(organism)
  order_files = Dir.glob(File.join(DATA_DIR,organism,'*.orders'))

  names = order_files.map do |file|
    File.basename(file).sub(/\.orders$/,'').sub(/_cross_platform/,'')
  end

  names.uniq
end
111
+
112
# Platform preparation step for custom datasets. Nothing is done here; the
# method always returns nil.
def self.process_platform(platform)
  nil
end
114
+
115
+ def self.process_dataset(dataset, platform = nil)
116
+ puts "Processing #{ dataset }"
117
+ org = dataset_organism(dataset)
118
+ prefix = File.join(DATA_DIR, org, dataset)
88
119
 
89
120
  CustomDS::process_matrix(prefix, org)
90
121
  end
@@ -94,13 +125,13 @@ end
94
125
 
95
126
  if __FILE__ == $0
96
127
  p CustomDS::datasets('sgd')
97
- p CustomDS::path('HaploidData')
98
- p CustomDS::path('HaploidData_cross_platform')
128
+ p CustomDS::dataset_path('HaploidData')
129
+ p CustomDS::dataset_path('HaploidData_cross_platform')
99
130
 
100
131
  exit
101
132
 
102
133
  org = 'sgd'
103
- process = Dir.glob(File.join(CustomDS::customdir, org) + '/*').select{|f| File.directory? f}.collect{|f| File.basename(f)} - CustomDS.datasets('sgd')
134
+ process = Dir.glob(File.join(CustomDS::DATA_DIR, org) + '/*').select{|f| File.directory? f}.collect{|f| File.basename(f)} - CustomDS.datasets('sgd')
104
135
  p process
105
136
  process.each{|d| CustomDS::process(d)}
106
137
 
data/lib/MARQ/GEO.rb CHANGED
@@ -1,45 +1,56 @@
1
1
  require 'MARQ'
2
+ require 'MARQ/main'
2
3
  require 'rbbt/sources/organism'
3
4
 
4
5
  # Work with GEO datasets
5
6
  module GEO
6
7
 
8
+ CACHE_DIR = File.join(MARQ.cachedir,'GEO')
9
+ FileUtils.mkdir_p CACHE_DIR unless File.exists? CACHE_DIR
10
+
11
+ DATA_DIR = File.join(MARQ.datadir, 'GEO')
12
+
7
13
  # Get information from Entrez
8
14
  module Remote
9
15
 
16
+ @@nice = 1
10
17
  def self.organism_platforms(org)
11
18
  name = Organism.name(org)
12
- Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=\"#{name}\"[Organism:exp]+AND+%22gpl%22[Filter]&retmax=10000").
19
+ Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=\"#{name}\"[Organism:exp]+AND+%22gpl%22[Filter]&retmax=10000", :nice => @@nice).
13
20
  scan(/<Id>(\d+?)<\/Id>/).collect{|id| id.first}.collect{|id| "GPL#{id.sub(/^100*/,'')}"}
14
21
  end
15
22
 
16
23
  def self.platform_datasets(platform)
17
- Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=#{platform}[Accession]&retmax=2000").
24
+ Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=#{platform}[Accession]&retmax=2000", :nice => @@nice).
18
25
  scan(/<Id>(\d+?)<\/Id>/).collect{|id| id.first}.select{|id| !id.match(/^(1|2)000/) }.collect{|id| "GDS#{id}"}
19
26
  end
20
27
 
21
28
  def self.dataset_platform(dataset)
22
29
  if dataset =~ /GSE/
23
- Open.read("http://www.ncbi.nlm.nih.gov/projects/geo/query/acc.cgi?acc=#{dataset}").scan(/GPL\d+/).uniq.sort.join("_")
30
+ Open.read("http://www.ncbi.nlm.nih.gov/projects/geo/query/acc.cgi?acc=#{dataset}", :nice => @@nice).scan(/GPL\d+/).uniq.sort.join("_")
24
31
  else
25
- Open.read("http://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=#{dataset}").scan(/GPL\d+/).uniq.sort.join("_")
32
+ Open.read("http://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=#{dataset}", :nice => @@nice).scan(/GPL\d+/).uniq.sort.join("_")
26
33
  end
27
34
  end
28
35
 
29
36
  def self.series_dataset?(gse)
30
- Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=geo&term=#{gse}[Accession]&retmax=2000").
37
+ Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=geo&term=#{gse}[Accession]&retmax=2000", :nice => @@nice).
31
38
  match(/<Id>(\d+?)<\/Id>/) != nil
32
39
  end
33
40
 
41
# Organism of a GEO platform, scraped from the platform's accession page:
# the text of the taxonomy link cell is returned.
#
# NOTE(review): when the page does not contain that cell, `match` returns nil
# and the `[1]` raises NoMethodError -- confirm callers are prepared for it.
def self.platform_organism(platform)
  page = Open.read("http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=#{platform}", :nice => @@nice)

  organism_cell = %r#<td><a href="http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi\?mode=Info&amp;id=\d+" onmouseout="onLinkOut\('HelpMessage' , geo_empty_help\)" onmouseover="onLinkOver\('HelpMessage' , geoaxema_organismus\)">(.*)</a></td>#

  page.match(organism_cell)[1]
end
45
+
34
46
  end
35
47
 
36
- CACHE_DIR = File.join(MARQ.cachedir,'GEO')
37
- FileUtils.mkdir_p CACHE_DIR unless File.exists? CACHE_DIR
38
48
 
39
49
 
40
50
  # Parse information in .soft files
41
51
  module SOFT
42
52
 
53
+ @@nice = 1
43
54
  GEO_SOFT="http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&view=full&form=text&acc="
44
55
 
45
56
  # Download a soft file. Uses cache
@@ -49,7 +60,7 @@ module GEO
49
60
  if File.exist?( cache_file )
50
61
  File.open(cache_file).read
51
62
  else
52
- content = Open.read(GEO_SOFT + item, :nocache => true)
63
+ content = Open.read(GEO_SOFT + item, :nocache => true, :nice => @@nice)
53
64
  raise "SOFT file error" if content !~ /!/
54
65
  fout = File.open(cache_file,'w')
55
66
  fout.write content
@@ -105,7 +116,7 @@ module GEO
105
116
  soft = get_soft(series)
106
117
 
107
118
  if match = soft.scan(/!Series_platform_id\s*=?\s*(.*)/)
108
- platform = match.flatten.collect{|p| p.strip}
119
+ platform = match.flatten.collect{|p| p.strip}.join("_")
109
120
  else
110
121
  raise "No Platform information"
111
122
  end
@@ -131,7 +142,7 @@ module GEO
131
142
  end
132
143
 
133
144
  {
134
- :platform => platform.join("_"),
145
+ :platform => platform,
135
146
  :description =>description.strip,
136
147
  :title => title.strip,
137
148
  :samples => samples,
@@ -162,8 +173,8 @@ module GEO
162
173
  end
163
174
 
164
175
  def self.GPL(platform)
165
- if !File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")) &&
166
- !File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
176
+ if !File.exist?(File.join(DATA_DIR, 'platforms',"#{platform}.yaml")) &&
177
+ !File.exist?(File.join(DATA_DIR, 'platforms',"#{platform}.skip"))
167
178
  begin
168
179
  if platform =~ /_/
169
180
  organism = GPL(platform.match(/(.*?)_/)[1])[:organism]
@@ -230,17 +241,17 @@ module GEO
230
241
  info[:other_ID_field] = [other_pos + 1, other_name] if other_pos > 0
231
242
 
232
243
 
233
- Open.write(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml"), info.to_yaml)
244
+ Open.write(File.join(DATA_DIR, 'platforms',"#{platform}.yaml"), info.to_yaml)
234
245
  rescue Exception
235
246
  puts $!.message
236
247
  puts $!.backtrace
237
- Open.write(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"), $!.message)
248
+ Open.write(File.join(DATA_DIR, 'platforms',"#{platform}.skip"), $!.message)
238
249
  end
239
250
  end
240
251
 
241
252
  raise "Platform info for #{ platform } is not available and could not be automatically produced." if File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
242
253
 
243
- YAML::load(File.open(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")))
254
+ YAML::load(File.open(File.join(DATA_DIR, 'platforms',"#{platform}.yaml")))
244
255
  end
245
256
 
246
257
 
@@ -365,6 +376,8 @@ module GEO
365
376
  if File.exist?(File.join(platform,'cross_platform'))
366
377
  puts "-- Translated to cross_platform format"
367
378
  R.GDS(dataset, prefix + '_cross_platform', field, File.join(platform_path, 'translations'))
379
+ else
380
+ puts "No cross_platform probe ids for platform"
368
381
  end
369
382
  end
370
383
 
@@ -388,10 +401,11 @@ module GEO
388
401
  do_log = !info[:log2] if info[:log2]
389
402
  fields = info[:fields]
390
403
 
391
- puts "Processing GSE #{ series }. Platform #{ platform }"
392
-
393
404
  platform_path = GEO::platform_path(platform)
405
+ return if platform_path.nil?
394
406
  prefix = File.join(platform_path, 'GSE', series.to_s)
407
+
408
+ puts "Processing GSE #{ series }. Platform #{ platform }"
395
409
  puts "-- Original"
396
410
  R.GSE(gsms, conditions, do_log, prefix, nil, fields, info[:title], info[:description])
397
411
 
@@ -433,6 +447,8 @@ module GEO
433
447
  fix_GSE_ids(File.join(platform_path, 'cross_platform'),prefix + '_cross_platform');
434
448
  FileUtils.cp(File.join(platform_path, 'cross_platform'),prefix + '_cross_platform.codes')
435
449
  FileUtils.rm(prefix + '.translations') if File.exist?(prefix + '.translations')
450
+ else
451
+ puts "No cross_platform probe ids for platform"
436
452
  end
437
453
  FileUtils.rm(prefix + '.swap') if File.exist?(prefix + '.swap')
438
454
  end
@@ -441,7 +457,7 @@ module GEO
441
457
  # system (called biomart for clarity)
442
458
  def self.GPL(platform)
443
459
  path = GEO::platform_path(platform)
444
- return if File.exist? path
460
+ return if path.nil? || File.exist?(path)
445
461
 
446
462
  if platform =~ /_/
447
463
  FileUtils.mkdir(path)
@@ -522,81 +538,101 @@ module GEO
522
538
 
523
539
  end
524
540
 
541
# Names of the locally stored GEO platforms: the base name of every entry
# under DATA_DIR whose name starts with 'GPL'.
def self.platforms
  platform_dirs = Dir.glob(File.join(DATA_DIR, "GPL*"))
  platform_dirs.map { |dir| File.basename(dir) }
end
525
544
 
526
545
 
527
- #{{{ Local data store info
528
-
529
- def self.clean(name)
530
- name.sub(/_cross_platform/,'') if name
546
+ def self.dataset_type(dataset)
547
+ case
548
+ when dataset =~ /^GDS/
549
+ :GDS
550
+ when dataset =~ /^GSE/
551
+ :GSE
552
+ end
531
553
  end
532
554
 
533
-
534
555
  def self.platform_path(platform)
535
- File.join(MARQ.datadir, "GEO/#{clean(platform)}")
556
+ path = File.join(DATA_DIR, platform)
557
+ path = nil unless File.exists? path
558
+ path
536
559
  end
537
560
 
561
+ def self.dataset_path(dataset, platform = nil)
562
+ if platform
563
+ platforms = [platform]
564
+ else
565
+ platforms = self.platforms
566
+ end
538
567
 
539
- def self.is_cross_platform?(dataset)
540
- dataset =~ /_cross_platform/
541
- end
568
+ platforms.each do |platform|
569
+ platform_path = platform_path(platform)
570
+ next if platform_path.nil?
542
571
 
543
- def self.has_cross_platform?(dataset = nil, platform = nil)
544
- platform = clean(platform)
545
- raise "Dataset #{ dataset } not found" if dataset && dataset_path(dataset, platform).nil?
546
- raise "Platform #{ platform } not found" if platform && platform_path(platform).nil?
547
- if dataset
548
- File.exists?(dataset_path(dataset, platform) + "_cross_platform.orders")
549
- else
550
- Dir.glob(File.join(platform_path(platform), '*', '*_cross_platform.orders')).any?
572
+ prefix = File.join(platform_path, dataset_type(dataset).to_s, dataset)
573
+ case
574
+ when File.exists?(prefix + '.orders')
575
+ return File.join(platform_path, dataset_type(dataset).to_s, dataset)
576
+ when File.exists?(prefix + '.skip')
577
+ return nil
578
+ end
551
579
  end
580
+
581
+ return nil
552
582
  end
553
583
 
554
- def self.dataset_path(dataset, platform = nil)
555
- if platform
556
- files = Dir.glob(File.join(platform_path(clean(platform)),"/*/#{ dataset }"))
584
+ def self.platform_datasets(platform)
585
+ cross_platform = MARQ::Platform.is_cross_platform? platform
586
+
587
+ path = platform_path(MARQ::Platform.clean(platform))
588
+ return [] if path.nil?
589
+
590
+ datasets = Dir.glob(File.join(path, '*', '*.orders')).
591
+ collect {|path| File.basename(path).sub(/\.orders$/,'')}
592
+
593
+ if cross_platform
594
+ datasets.select {|dataset| MARQ::Dataset.is_cross_platform? dataset }.
595
+ collect {|dataset| MARQ::Dataset.clean(dataset) }
557
596
  else
558
- files = Dir.glob(File.join(MARQ.datadir, "GEO/GPL*/*/#{ dataset }.*"))
597
+ datasets.select {|dataset| ! MARQ::Dataset.is_cross_platform? dataset }
559
598
  end
560
- return nil if files.empty?
561
- return files.first.match(/(.*)\./)[1]
562
599
  end
563
600
 
564
- def self.organism_platforms(organism)
565
- Dir.glob(File.join(MARQ.datadir, "GEO/GPL*")).collect{|f|
566
- File.basename(f)
567
- }.select{|platform|
568
- SOFT.GPL(platform)[:organism] == organism &&
569
- platform_datasets(platform).any?
570
- }
601
+ def self.dataset_platform(dataset)
602
+ path = dataset_path(dataset)
603
+ return nil if path.nil?
604
+ path.match(/(GPL\d+)/)
605
+ return $1
571
606
  end
572
607
 
608
# Organism of a GEO platform, read from the :organism entry of the platform
# information returned by GEO::SOFT.GPL.
def self.platform_organism(platform)
  platform_info = GEO::SOFT.GPL(platform)
  platform_info[:organism]
end
573
611
 
574
-
575
- def self.platform_datasets(platform)
576
- Dir.glob(File.join(platform_path(platform),"*/*.orders")).collect{|f| File.basename(f).sub(/.orders$/,'')}.select{|d| !is_cross_platform?(d)}
612
+ def self.dataset_organism(dataset)
613
+ platform_organism(dataset_platform(dataset))
577
614
  end
578
615
 
579
- def self.dataset_platform(dataset)
580
- dataset_path(dataset).match(/(GPL\d+)/)
581
- $1
616
+ def self.process_platform(platform)
617
+ GEO::Process.GPL(platform)
582
618
  end
583
619
 
584
- def self.GDS_info(name)
585
- begin
586
- title, description = Open.read(dataset_path(name) + '.description').split(/\n--\n/).values_at(0,1)
587
- {:title => title.strip, :description => description.strip}
588
- rescue Exception
589
- puts $!.message
590
- {:title => "" , :description => "" }
620
+ def self.process_dataset(dataset, platform)
621
+ case dataset_type(dataset)
622
+ when :GDS
623
+ GEO::Process.GDS(dataset, platform)
624
+ when :GSE
625
+ info = YAML::load(File.open("series/#{ dataset }.yaml"))
626
+ FileUtils.rm("platforms/#{ info[:platform] }.skip") if File.exist? "platforms/#{ info[:platform] }.skip"
627
+ GEO::Process.GSE(dataset, info)
591
628
  end
592
-
593
629
  end
594
630
 
595
-
596
631
  end
597
632
 
598
633
 
599
634
  if __FILE__ == $0
635
+ p GEO.dataset_path 'GDS2791_cross_platform', 'GPL96'
600
636
 
601
637
  end
602
638