rbbt-marq 1.1.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,236 @@
1
# Numeric analysis parameters. A value that is already set is kept; otherwise
# the environment variable is used, and failing that the literal default.
$expr_threshold ||= (ENV['threshold'] || 0.05).to_f
$folds          ||= (ENV['folds'] || 2.5).to_f
$nth_genes      ||= (ENV['nth_genes'] || 100).to_i

# Resolves a boolean switch from, in order of precedence, its current value,
# the named environment variable and the default. Only the text 'true'
# turns the switch on.
switch = lambda do |current, env_name, default|
  [current, ENV[env_name], default].compact.first.to_s == 'true'
end

$force       = switch.call($force,       'force',        false)
$tranlations = switch.call($tranlations, 'translations', false)
$series      = switch.call($series,      'series',       true)
$update_db   = switch.call($update_db,   'update_db',    false)
$skip_db     = switch.call($skip_db,     'skip_db',      false)
$fdr         = switch.call($fdr,         'fdr',          true)
$do_folds    = switch.call($do_folds,    'do_folds',     true)

# Set to true by the GDS/GSE wrappers whenever a dataset gets processed.
$changes = false
15
# Wraps GEO::Process::R.GDS and GEO::Process::R.GSE so that every call first
# sets the $changes global to true and then runs the original implementation,
# which stays reachable as GDS_old / GSE_old.
module GEO::Process::R
  class << self
    [:GDS, :GSE].each do |processor|
      original = "#{processor}_old".to_sym
      alias_method original, processor

      define_method(processor) do |*args|
        $changes = true
        send(original, *args)
      end
    end
  end
end
30
+
31
desc "Analyze datasets"
# Processes every platform returned by process_list together with its
# datasets, then saves to the database the platforms that changed (or all
# processed ones when $update_db is set). Errors are printed and the run
# continues with the next item.
task 'data' do
  pending_save = []

  process_list.each do |platform, datasets|
    begin
      # Prepare the platform
      MARQ::Platform.process(platform)
    rescue => error
      puts "Error processing platform #{platform}"
      puts error.message
      puts error.backtrace.join("\n")
      next
    end

    # In translations-only mode the datasets are left untouched.
    next if $tranlations

    # Reset per platform; the GDS/GSE wrappers set it back to true.
    $changes = false

    # Process all datasets
    datasets.each do |dataset|
      begin
        # Datasets that already have a path are redone only when forced.
        next if !$force && !MARQ::Dataset.path(dataset).nil?
        MARQ::Dataset.process(dataset, platform)
      rescue => error
        puts "Error processing dataset #{ dataset }"
        puts error.message
        puts error.backtrace.join("\n")
      end
    end

    # Mark the platform for saving in DB
    pending_save << platform if $changes || $update_db
  end

  pending_save.each do |platform|
    begin
      puts "Saving #{platform}"
      MADB.save_platform(platform)
    rescue => error
      puts "Error saving platform #{ platform }"
      puts error.message
      puts error.backtrace.join("\n")
    end
  end
end
79
+
80
# Runs the given block for each dataset from process_list and writes the
# block's result, as YAML, to annotations/<name>/<dataset>.
#
# name           - annotation type; used as the output sub-directory.
# cross_platform - when true, the '_cross_platform' variant of the dataset is
#                  handed to the block if the platform has one.
# block          - receives the dataset name and returns the terms to store.
#
# Existing files are kept unless $force is set. Errors are printed per
# dataset and do not stop the run.
def annotations(name, cross_platform = false, &block)
  process_list.each do |platform, datasets|
    datasets.each do |dataset|
      begin
        target_dir = File.join("annotations", name)
        # The output name is fixed before any cross-platform suffix is added.
        filename = File.join(target_dir, dataset)

        next if File.exist?(filename) && !$force
        next if MARQ::Dataset.path(dataset).nil?

        FileUtils.mkdir_p target_dir
        if cross_platform && MARQ::Platform::has_cross_platform?(platform)
          dataset += '_cross_platform'
        end
        next unless MARQ::Dataset.exists?(dataset)

        Open.write(filename, block.call(dataset).to_yaml)
      rescue => error
        puts "Error processing dataset #{ dataset }"
        puts error.message
        puts error.backtrace.join("\n")
      end
    end
  end
end
103
+
104
+
105
# Annotates each dataset with the distinct words of its description, and each
# experiment (one per line of the '.experiments' file) with the distinct
# words of its name.
task 'annotate_Words' do
  require 'MARQ/annotations'
  require 'rbbt/bow/bow'
  annotations('Words') do |dataset|
    prefix = MARQ::Dataset.path(dataset)
    terms = {}

    description = Open.read(prefix + '.description')
    terms[:dataset] = [dataset] + description.words.uniq

    # each_line instead of String#collect, which no longer exists since
    # Ruby 1.9.
    Open.read(prefix + '.experiments').each_line do |line|
      name = line.strip
      # Drop the leading "<label>: " part and the "[ratio]" marker.
      terms[name] = name.sub(/.*?: /,'').sub(/\[ratio\]/,'').words.uniq
    end

    terms
  end
end
119
+
120
+
121
# Annotates each dataset, and each of its experiments (one per line of the
# '.experiments' file), with the distinct results of Annotations::UMLS::OBA
# for the description and the experiment name respectively.
task 'annotate_UMLS' do
  require 'MARQ/annotations'
  require 'rbbt/util/misc'
  annotations('UMLS') do |dataset|
    prefix = MARQ::Dataset.path(dataset)
    terms = {}

    description = Open.read(prefix + '.description')
    terms[:dataset] = Annotations::UMLS::OBA(description).uniq

    # each_line instead of String#collect, which no longer exists since
    # Ruby 1.9.
    Open.read(prefix + '.experiments').each_line do |line|
      name = line.strip
      # Drop the leading "<label>: " part and the "[ratio]" marker.
      terms[name] = Annotations::UMLS::OBA(name.sub(/.*?: /,'').sub(/\[ratio\]/,'')).uniq
    end

    terms
  end
end
135
+
136
+
137
# Annotates each dataset, and each of its experiments (one per line of the
# '.experiments' file), with the Polysearch matches found in the description
# and the experiment name respectively.
task 'annotate_Polysearch' do
  require 'MARQ/annotations'
  require 'rbbt/util/misc'
  require 'rbbt/sources/polysearch'

  # All matched names for a text: sorted, whitespace-collapsed, lower-cased
  # and de-duplicated.
  matches = lambda do |text|
    Polysearch::match(text).values.flatten.sort.
      collect { |n| n.gsub(/\s+/,' ').downcase }.uniq
  end

  annotations('Polysearch') do |dataset|
    prefix = MARQ::Dataset.path(dataset)
    terms = {}

    terms[:dataset] = matches.call(Open.read(prefix + '.description'))

    # each_line instead of String#collect, which no longer exists since
    # Ruby 1.9.
    Open.read(prefix + '.experiments').each_line do |line|
      name = line.strip
      # Drop the leading "<label>: " part and the "[ratio]" marker.
      terms[name] = matches.call(name.sub(/.*?: /,'').sub(/\[ratio\]/,''))
    end

    terms
  end
end
153
+
154
# Returns the GO term names enriched in a gene list.
#
# org       - organism passed through to the Genecodis analysis.
# list      - gene identifiers; nil or empty yields [].
# slim      - passed through to the Genecodis analysis.
# threshold - upper bound (exclusive) for a result's :hyp_c value.
#
# Only results with :s greater than 2 and :hyp_c below the threshold are
# kept; their :items value is translated with GO::id2name. Returns [] when
# the analysis gives nil.
def goterms(org, list, slim, threshold)
  return [] if list.nil? || list.empty?

  results = Annotations::Genes::Genecodis::Local.analysis(org, list, slim)
  return [] if results.nil?

  results.
    select { |info| info[:s].to_i > 2 && info[:hyp_c].to_f < threshold }.
    collect { |info| GO::id2name(info[:items]) }
end
163
+
164
# Writes four annotation sets per dataset (GO_up, GO_down, GOSlim_up and
# GOSlim_down), each mapping an experiment to the GO terms of its up- or
# down-regulated genes.
task 'annotate_GO' do
  require 'MARQ/annotations'
  require 'rbbt/sources/go'
  options = { :cut_off => $expr_threshold, :fdr => $fdr, :folds => $folds, :do_folds => $do_folds, :nth_genes => $nth_genes}

  # annotation name, key into the gene lists, slim flag given to goterms
  variants = [
    ['GO_up',       :up,   false],
    ['GO_down',     :down, false],
    ['GOSlim_up',   :up,   true],
    ['GOSlim_down', :down, true]
  ]

  variants.each do |annotation, direction, slim|
    annotations(annotation, true) do |dataset|
      org   = MARQ::Dataset.organism(dataset)
      genes = Annotations::Genes.get_genes(dataset, options)

      terms = {}
      genes[direction] ||= []
      genes[direction].each do |experiment, list|
        terms[experiment] = goterms(org, list, slim, $expr_threshold)
      end
      terms
    end
  end
end
216
+
217
# Annotates each dataset with the result of Annotations::Genes::SENT.terms
# for the dataset's organism and its selected genes.
task 'annotate_SENT' do
  require 'MARQ/annotations'
  options = { :cut_off => $expr_threshold, :fdr => $fdr, :folds => $folds, :do_folds => $do_folds, :nth_genes => $nth_genes}

  annotations('SENT') do |dataset|
    Annotations::Genes::SENT.terms(
      MARQ::Dataset.organism(dataset),
      Annotations::Genes.get_genes(dataset, options)
    )
  end
end
229
+
230
# Runs the data task followed by the Words, UMLS, Polysearch and GO
# annotation tasks, in that order.
task 'default' do
  %w[data annotate_Words annotate_UMLS annotate_Polysearch annotate_GO].each do |name|
    Rake::Task[name].invoke
  end
end
data/lib/MARQ/CustomDS.rb CHANGED
@@ -5,9 +5,8 @@ require 'MARQ/ID'
5
5
  module CustomDS
6
6
  @@r = nil
7
7
 
8
- def self.customdir
9
- File.join(MARQ.datadir,'CustomDS')
10
- end
8
+ DATA_DIR = File.join(MARQ.datadir,'CustomDS')
9
+
11
10
 
12
11
  def self.r
13
12
  require 'rsruby'
@@ -35,33 +34,12 @@ module CustomDS
35
34
  end
36
35
  end
37
36
 
38
- def self.path(dataset)
39
- files = Dir.glob(customdir + "/*/#{ dataset }.orders")
40
- if files.length == 1
41
- files.first.sub(/.orders/,'')
42
- else
43
- Dir.glob(customdir + "/*/#{ dataset }").first
44
- end
45
- end
46
-
47
37
  def self.organism(dataset)
48
- path(dataset).match(/#{ customdir }\/(.*?)\//)[1]
49
- end
50
-
51
- def self.is_cross_platform?(dataset)
52
- dataset.match(/_cross_platform/)
53
- end
54
-
55
- def self.clean(dataset)
56
- dataset.sub(/_cross_platform/,'')
57
- end
58
-
59
- def self.has_cross_platform?(dataset)
60
- Dir.glob(path(clean(dataset)) + '_cross_platform.orders').any?
38
+ path(dataset).match(/#{ DATA_DIR }\/(.*?)\//)[1]
61
39
  end
62
40
 
63
41
  def self.datasets(org)
64
- Dir.glob(File.join(customdir, org) + '/*.orders').collect{|f| clean(File.basename(f.sub(/.orders/,'')))}.uniq
42
+ Dir.glob(File.join(DATA_DIR, org) + '/*.orders').collect{|f| clean(File.basename(f.sub(/.orders/,'')))}.uniq
65
43
  end
66
44
 
67
45
  def self.process_matrix(prefix, org)
@@ -82,9 +60,62 @@ module CustomDS
82
60
  end
83
61
 
84
62
  def self.process(name)
85
- puts "Processing #{ name }"
86
- org = organism(name)
87
- prefix = File.join(customdir, org, name)
63
+ end
64
+
65
# Lists the names of the sub-directories of DATA_DIR; each one is treated as
# an organism.
def self.organisms
  entries = Dir.glob(File.join(DATA_DIR, '*'))
  directories = entries.select { |entry| File.directory?(entry) }
  directories.collect { |directory| File.basename(directory) }
end
70
+
71
# Returns the path prefix (<DATA_DIR>/<organism>/<dataset>, no extension) of
# a custom dataset, or nil when it is unknown.
#
# Organisms are searched in the order given by organisms. The first one with
# a '<dataset>.orders' file wins; a '<dataset>.skip' file ends the search
# with nil.
def self.dataset_path(dataset)
  organisms.each do |organism|
    prefix = File.join(DATA_DIR, organism, dataset)
    # File.exist? rather than File.exists?, which Ruby 3.2 removed.
    return prefix if File.exist?(prefix + '.orders')
    return nil if File.exist?(prefix + '.skip')
  end

  nil
end
82
+
83
# Returns the path of a platform. The lookup is delegated unchanged to
# dataset_path, i.e. a platform is located exactly like a dataset.
def self.platform_path(platform)
  return dataset_path(platform)
end
86
+
87
# Returns the platform name as cleaned by MARQ::Dataset.clean.
# NOTE(review): this returns a single value, not a list of datasets -
# confirm that callers expect that.
def self.platform_datasets(platform)
  cleaned = MARQ::Dataset.clean(platform)
  cleaned
end
90
+
91
# Returns the organism directory that holds the platform, or nil when the
# platform cannot be located.
#
# platform_path gives <DATA_DIR>/<organism>/<platform>, so the organism is
# the name of the parent directory. Reading it from the path structure avoids
# interpolating the data directory and platform name into a regular
# expression, where characters such as '.' or '+' would be interpreted as
# pattern syntax.
def self.platform_organism(platform)
  path = platform_path(platform)
  return nil if path.nil?

  File.basename(File.dirname(path))
end
97
+
98
# Returns the organism of a dataset. The dataset name is passed unchanged to
# platform_organism.
def self.dataset_organism(dataset)
  return platform_organism(dataset)
end
101
+
102
# Returns the platform of a dataset, which here is the dataset name itself.
def self.dataset_platform(dataset)
  return dataset
end
105
+
106
# Lists the platform names of an organism: one per '.orders' file in the
# organism directory, with the extension and any '_cross_platform' marker
# removed and duplicates dropped.
def self.organism_platforms(organism)
  order_files = Dir.glob(File.join(DATA_DIR,organism,'*.orders'))
  names = order_files.collect do |file|
    File.basename(file).sub(/\.orders$/,'').sub(/_cross_platform/,'')
  end
  names.uniq
end
111
+
112
# Intentionally empty: nothing is done here for a platform. Returns nil.
def self.process_platform(platform)
  nil
end
114
+
115
# Processes a custom dataset: prints a progress line, looks up the dataset's
# organism and runs process_matrix on <DATA_DIR>/<organism>/<dataset>.
#
# dataset  - name of the dataset.
# platform - accepted but not used by this method.
def self.process_dataset(dataset, platform = nil)
  puts "Processing #{ dataset }"

  org = dataset_organism(dataset)
  CustomDS::process_matrix(File.join(DATA_DIR, org, dataset), org)
end
@@ -94,13 +125,13 @@ end
94
125
 
95
126
  if __FILE__ == $0
96
127
  p CustomDS::datasets('sgd')
97
- p CustomDS::path('HaploidData')
98
- p CustomDS::path('HaploidData_cross_platform')
128
+ p CustomDS::dataset_path('HaploidData')
129
+ p CustomDS::dataset_path('HaploidData_cross_platform')
99
130
 
100
131
  exit
101
132
 
102
133
  org = 'sgd'
103
- process = Dir.glob(File.join(CustomDS::customdir, org) + '/*').select{|f| File.directory? f}.collect{|f| File.basename(f)} - CustomDS.datasets('sgd')
134
+ process = Dir.glob(File.join(CustomDS::DATA_DIR, org) + '/*').select{|f| File.directory? f}.collect{|f| File.basename(f)} - CustomDS.datasets('sgd')
104
135
  p process
105
136
  process.each{|d| CustomDS::process(d)}
106
137
 
data/lib/MARQ/GEO.rb CHANGED
@@ -1,45 +1,56 @@
1
1
  require 'MARQ'
2
+ require 'MARQ/main'
2
3
  require 'rbbt/sources/organism'
3
4
 
4
5
  # Work with GEO datasets
5
6
  module GEO
6
7
 
8
# Cache directory for GEO, created when the file is loaded if it is missing.
CACHE_DIR = File.join(MARQ.cachedir,'GEO')
# File.exist? rather than File.exists?, which Ruby 3.2 removed.
FileUtils.mkdir_p CACHE_DIR unless File.exist? CACHE_DIR

# Root directory of the GEO data (platform directories live below it).
DATA_DIR = File.join(MARQ.datadir, 'GEO')
12
+
7
13
  # Get information from Entrez
8
14
  module Remote
9
15
 
16
+ @@nice = 1
10
17
  def self.organism_platforms(org)
11
18
  name = Organism.name(org)
12
- Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=\"#{name}\"[Organism:exp]+AND+%22gpl%22[Filter]&retmax=10000").
19
+ Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=\"#{name}\"[Organism:exp]+AND+%22gpl%22[Filter]&retmax=10000", :nice => @@nice).
13
20
  scan(/<Id>(\d+?)<\/Id>/).collect{|id| id.first}.collect{|id| "GPL#{id.sub(/^100*/,'')}"}
14
21
  end
15
22
 
16
23
  def self.platform_datasets(platform)
17
- Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=#{platform}[Accession]&retmax=2000").
24
+ Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gds&term=#{platform}[Accession]&retmax=2000", :nice => @@nice).
18
25
  scan(/<Id>(\d+?)<\/Id>/).collect{|id| id.first}.select{|id| !id.match(/^(1|2)000/) }.collect{|id| "GDS#{id}"}
19
26
  end
20
27
 
21
28
  def self.dataset_platform(dataset)
22
29
  if dataset =~ /GSE/
23
- Open.read("http://www.ncbi.nlm.nih.gov/projects/geo/query/acc.cgi?acc=#{dataset}").scan(/GPL\d+/).uniq.sort.join("_")
30
+ Open.read("http://www.ncbi.nlm.nih.gov/projects/geo/query/acc.cgi?acc=#{dataset}", :nice => @@nice).scan(/GPL\d+/).uniq.sort.join("_")
24
31
  else
25
- Open.read("http://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=#{dataset}").scan(/GPL\d+/).uniq.sort.join("_")
32
+ Open.read("http://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=#{dataset}", :nice => @@nice).scan(/GPL\d+/).uniq.sort.join("_")
26
33
  end
27
34
  end
28
35
 
29
36
  def self.series_dataset?(gse)
30
- Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=geo&term=#{gse}[Accession]&retmax=2000").
37
+ Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=geo&term=#{gse}[Accession]&retmax=2000", :nice => @@nice).
31
38
  match(/<Id>(\d+?)<\/Id>/) != nil
32
39
  end
33
40
 
41
+ def self.platform_organism(platform)
42
+ Open.read("http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=#{platform}", :nice => @@nice).
43
+ match(%r#<td><a href="http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi\?mode=Info&amp;id=\d+" onmouseout="onLinkOut\('HelpMessage' , geo_empty_help\)" onmouseover="onLinkOver\('HelpMessage' , geoaxema_organismus\)">(.*)</a></td>#)[1]
44
+ end
45
+
34
46
  end
35
47
 
36
- CACHE_DIR = File.join(MARQ.cachedir,'GEO')
37
- FileUtils.mkdir_p CACHE_DIR unless File.exists? CACHE_DIR
38
48
 
39
49
 
40
50
  # Parse information in .soft files
41
51
  module SOFT
42
52
 
53
+ @@nice = 1
43
54
  GEO_SOFT="http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=self&view=full&form=text&acc="
44
55
 
45
56
  # Download a soft file. Uses cache
@@ -49,7 +60,7 @@ module GEO
49
60
  if File.exist?( cache_file )
50
61
  File.open(cache_file).read
51
62
  else
52
- content = Open.read(GEO_SOFT + item, :nocache => true)
63
+ content = Open.read(GEO_SOFT + item, :nocache => true, :nice => @@nice)
53
64
  raise "SOFT file error" if content !~ /!/
54
65
  fout = File.open(cache_file,'w')
55
66
  fout.write content
@@ -105,7 +116,7 @@ module GEO
105
116
  soft = get_soft(series)
106
117
 
107
118
  if match = soft.scan(/!Series_platform_id\s*=?\s*(.*)/)
108
- platform = match.flatten.collect{|p| p.strip}
119
+ platform = match.flatten.collect{|p| p.strip}.join("_")
109
120
  else
110
121
  raise "No Platform information"
111
122
  end
@@ -131,7 +142,7 @@ module GEO
131
142
  end
132
143
 
133
144
  {
134
- :platform => platform.join("_"),
145
+ :platform => platform,
135
146
  :description =>description.strip,
136
147
  :title => title.strip,
137
148
  :samples => samples,
@@ -162,8 +173,8 @@ module GEO
162
173
  end
163
174
 
164
175
  def self.GPL(platform)
165
- if !File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")) &&
166
- !File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
176
+ if !File.exist?(File.join(DATA_DIR, 'platforms',"#{platform}.yaml")) &&
177
+ !File.exist?(File.join(DATA_DIR, 'platforms',"#{platform}.skip"))
167
178
  begin
168
179
  if platform =~ /_/
169
180
  organism = GPL(platform.match(/(.*?)_/)[1])[:organism]
@@ -230,17 +241,17 @@ module GEO
230
241
  info[:other_ID_field] = [other_pos + 1, other_name] if other_pos > 0
231
242
 
232
243
 
233
- Open.write(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml"), info.to_yaml)
244
+ Open.write(File.join(DATA_DIR, 'platforms',"#{platform}.yaml"), info.to_yaml)
234
245
  rescue Exception
235
246
  puts $!.message
236
247
  puts $!.backtrace
237
- Open.write(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"), $!.message)
248
+ Open.write(File.join(DATA_DIR, 'platforms',"#{platform}.skip"), $!.message)
238
249
  end
239
250
  end
240
251
 
241
252
  raise "Platform info for #{ platform } is not available and could not be automatically produced." if File.exist?(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.skip"))
242
253
 
243
- YAML::load(File.open(File.join(MARQ.datadir, 'GEO', 'platforms',"#{platform}.yaml")))
254
+ YAML::load(File.open(File.join(DATA_DIR, 'platforms',"#{platform}.yaml")))
244
255
  end
245
256
 
246
257
 
@@ -365,6 +376,8 @@ module GEO
365
376
  if File.exist?(File.join(platform,'cross_platform'))
366
377
  puts "-- Translated to cross_platform format"
367
378
  R.GDS(dataset, prefix + '_cross_platform', field, File.join(platform_path, 'translations'))
379
+ else
380
+ puts "No cross_platform probe ids for platform"
368
381
  end
369
382
  end
370
383
 
@@ -388,10 +401,11 @@ module GEO
388
401
  do_log = !info[:log2] if info[:log2]
389
402
  fields = info[:fields]
390
403
 
391
- puts "Processing GSE #{ series }. Platform #{ platform }"
392
-
393
404
  platform_path = GEO::platform_path(platform)
405
+ return if platform_path.nil?
394
406
  prefix = File.join(platform_path, 'GSE', series.to_s)
407
+
408
+ puts "Processing GSE #{ series }. Platform #{ platform }"
395
409
  puts "-- Original"
396
410
  R.GSE(gsms, conditions, do_log, prefix, nil, fields, info[:title], info[:description])
397
411
 
@@ -433,6 +447,8 @@ module GEO
433
447
  fix_GSE_ids(File.join(platform_path, 'cross_platform'),prefix + '_cross_platform');
434
448
  FileUtils.cp(File.join(platform_path, 'cross_platform'),prefix + '_cross_platform.codes')
435
449
  FileUtils.rm(prefix + '.translations') if File.exist?(prefix + '.translations')
450
+ else
451
+ puts "No cross_platform probe ids for platform"
436
452
  end
437
453
  FileUtils.rm(prefix + '.swap') if File.exist?(prefix + '.swap')
438
454
  end
@@ -441,7 +457,7 @@ module GEO
441
457
  # system (called biomart for clarity)
442
458
  def self.GPL(platform)
443
459
  path = GEO::platform_path(platform)
444
- return if File.exist? path
460
+ return if path.nil? || File.exist?(path)
445
461
 
446
462
  if platform =~ /_/
447
463
  FileUtils.mkdir(path)
@@ -522,81 +538,101 @@ module GEO
522
538
 
523
539
  end
524
540
 
541
# Lists the names of the entries under DATA_DIR that start with 'GPL'.
def self.platforms
  platform_dirs = Dir.glob(File.join(DATA_DIR, "GPL*"))
  platform_dirs.collect { |dir| File.basename(dir) }
end
525
544
 
526
545
 
527
- #{{{ Local data store info
528
-
529
- def self.clean(name)
530
- name.sub(/_cross_platform/,'') if name
546
# Classifies a dataset by the prefix of its name: :GDS, :GSE, or nil when it
# starts with neither.
def self.dataset_type(dataset)
  return :GDS if dataset =~ /^GDS/
  return :GSE if dataset =~ /^GSE/

  nil
end
532
554
 
533
-
534
555
  def self.platform_path(platform)
535
- File.join(MARQ.datadir, "GEO/#{clean(platform)}")
556
+ path = File.join(DATA_DIR, platform)
557
+ path = nil unless File.exists? path
558
+ path
536
559
  end
537
560
 
561
# Returns the path prefix (<platform dir>/<GDS|GSE>/<dataset>, no extension)
# of a dataset, or nil when it cannot be found.
#
# dataset  - dataset name; its type sub-directory comes from dataset_type.
# platform - restricts the search to this platform; when omitted every
#            platform returned by platforms is tried in turn.
#
# The first platform holding '<dataset>.orders' wins; a '<dataset>.skip'
# file ends the search with nil. Platforms without a directory are ignored.
def self.dataset_path(dataset, platform = nil)
  candidates = platform ? [platform] : platforms
  type_dir = dataset_type(dataset).to_s

  candidates.each do |candidate|
    base = platform_path(candidate)
    next if base.nil?

    prefix = File.join(base, type_dir, dataset)
    # File.exist? rather than File.exists?, which Ruby 3.2 removed.
    return prefix if File.exist?(prefix + '.orders')
    return nil if File.exist?(prefix + '.skip')
  end

  nil
end
553
583
 
554
- def self.dataset_path(dataset, platform = nil)
555
- if platform
556
- files = Dir.glob(File.join(platform_path(clean(platform)),"/*/#{ dataset }"))
584
# Lists the datasets of a platform that have an '.orders' file.
#
# For a cross-platform platform name only the '_cross_platform' datasets are
# returned, with that marker cleaned off; otherwise only the datasets without
# the marker. Returns [] when the platform has no directory.
def self.platform_datasets(platform)
  wants_cross_platform = MARQ::Platform.is_cross_platform? platform

  base = platform_path(MARQ::Platform.clean(platform))
  return [] if base.nil?

  names = Dir.glob(File.join(base, '*', '*.orders')).
    collect { |file| File.basename(file).sub(/\.orders$/,'') }

  unless wants_cross_platform
    return names.reject { |name| MARQ::Dataset.is_cross_platform? name }
  end

  names.
    select { |name| MARQ::Dataset.is_cross_platform? name }.
    collect { |name| MARQ::Dataset.clean(name) }
end
563
600
 
564
- def self.organism_platforms(organism)
565
- Dir.glob(File.join(MARQ.datadir, "GEO/GPL*")).collect{|f|
566
- File.basename(f)
567
- }.select{|platform|
568
- SOFT.GPL(platform)[:organism] == organism &&
569
- platform_datasets(platform).any?
570
- }
601
# Returns the platform a dataset is stored under, or nil when the dataset
# cannot be located.
#
# The name is read from the dataset path. Merged platforms are named by
# joining their ids with '_' (for example GPL96_GPL97), so the whole run of
# ids is captured rather than just the first one.
def self.dataset_platform(dataset)
  path = dataset_path(dataset)
  return nil if path.nil?

  path[/GPL\d+(?:_GPL\d+)*/]
end
572
607
 
608
# Returns the :organism entry of the platform information given by
# GEO::SOFT.GPL.
def self.platform_organism(platform)
  platform_info = GEO::SOFT.GPL(platform)
  platform_info[:organism]
end
573
611
 
574
-
575
- def self.platform_datasets(platform)
576
- Dir.glob(File.join(platform_path(platform),"*/*.orders")).collect{|f| File.basename(f).sub(/.orders$/,'')}.select{|d| !is_cross_platform?(d)}
612
# Returns the organism of a dataset: the organism of the platform that
# dataset_platform reports for it.
def self.dataset_organism(dataset)
  platform = dataset_platform(dataset)
  platform_organism(platform)
end
578
615
 
579
- def self.dataset_platform(dataset)
580
- dataset_path(dataset).match(/(GPL\d+)/)
581
- $1
616
# Processes a platform by handing it to GEO::Process.GPL and returns that
# call's result.
def self.process_platform(platform)
  return GEO::Process.GPL(platform)
end
583
619
 
584
- def self.GDS_info(name)
585
- begin
586
- title, description = Open.read(dataset_path(name) + '.description').split(/\n--\n/).values_at(0,1)
587
- {:title => title.strip, :description => description.strip}
588
- rescue Exception
589
- puts $!.message
590
- {:title => "" , :description => "" }
620
# Processes a dataset according to its type.
#
# GDS datasets are handed to GEO::Process.GDS with their platform. For GSE
# series the description is read from series/<dataset>.yaml, a leftover
# platforms/<platform>.skip marker is removed, and the series is handed to
# GEO::Process.GSE. Both files are looked up relative to the current working
# directory. Any other dataset type does nothing and returns nil.
def self.process_dataset(dataset, platform)
  case dataset_type(dataset)
  when :GDS
    GEO::Process.GDS(dataset, platform)
  when :GSE
    # The block form closes the file once the YAML has been read.
    info = File.open("series/#{ dataset }.yaml") { |file| YAML::load(file) }

    skip_marker = "platforms/#{ info[:platform] }.skip"
    FileUtils.rm(skip_marker) if File.exist? skip_marker

    GEO::Process.GSE(dataset, info)
  end
end
594
630
 
595
-
596
631
  end
597
632
 
598
633
 
599
634
  if __FILE__ == $0
635
+ p GEO.dataset_path 'GDS2791_cross_platform', 'GPL96'
600
636
 
601
637
  end
602
638