rbbt-marq 1.0.9 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/R/GEO.R CHANGED
@@ -1,106 +1,8 @@
1
1
  library(Biobase);
2
2
  library(GEOquery);
3
3
 
4
-
5
- GEO.path <- function(dataset, cross_platform = FALSE, datadir = NULL){
6
- if (is.null(datadir) && exists('MARQ.config')){
7
- datadir= paste(MARQ.config$datadir, 'GEO', sep="/");
8
- }
9
-
10
- if (is.null(datadir)){
11
- print("No datadir specified and no default found (MARQ.config$datadir");
12
- exit(-1);
13
- }
14
-
15
- if ( length(grep('_cross_platform', dataset)) == 0 && cross_platform){
16
- dataset = paste(dataset, '_cross_platform', sep = "");
17
-
18
- }
19
-
20
-
21
- files = Sys.glob(paste(datadir,'*', '*', paste(dataset, 'orders', sep="."), sep="/"));
22
-
23
- if (length(files) == 0){
24
- return(NULL);
25
- }
26
- else{
27
- return(sub('.orders','', files[1]));
28
- }
29
- }
30
-
31
- GEO.platform <- function(dataset, datadir = NULL){
32
-
33
- path = GEO.path(dataset, datadir = datadir);
34
-
35
- if (is.null(path)){ return(NULL);}
36
-
37
- return(sub(".*(GPL\\d+).*","\\1", path, perl = TRUE));
38
- }
39
-
40
- GEO.platform.path <- function(platform, datadir = NULL){
41
- if (is.null(datadir) && exists('MARQ.config')){
42
- datadir= paste(MARQ.config$datadir, 'GEO', sep="/");
43
- }
44
-
45
- if (is.null(datadir)){
46
- print("No datadir specified and no default found (MARQ.config$datadir");
47
- exit(-1);
48
- }
49
-
50
- return(paste(datadir, platform, sep="/"));
51
- }
52
-
53
- GEO.platform.datasets <- function(platform, cross_platform = TRUE, series = TRUE, datadir = NULL){
54
- if (cross_platform){
55
- cp.suffix = '_cross_platform'
56
- }
57
- else{
58
- cp.suffix = ''
59
- }
60
-
61
- if (series){
62
- pattern = '*'
63
- }
64
- else{
65
- pattern = 'GDS'
66
- }
67
- files = Sys.glob(paste(GEO.platform.path(platform, datadir), pattern, paste('*',cp.suffix,'.orders',sep=""),sep="/"))
68
-
69
- return(sapply(files, function(path){ sub(".*((?:GDS|GSE)\\d+).*", '\\1', path, perl=TRUE)}, USE.NAMES = FALSE));
70
- }
71
-
72
- GEO.values <- function(data){
73
- values <- MA.process(data$m, data$conditions, data$two.channel)
74
-
75
- if (length(values$ratios) == 0){
76
- return(NULL);
77
- }else{
78
- ratios = as.data.frame(values$ratios);
79
- t = as.data.frame(values$t);
80
- p.values = as.data.frame(values$p.values);
81
-
82
-
83
- # Calculate orders from best information
84
- best = vector();
85
- names = vector();
86
- for (name in colnames(ratios)){
87
- if (sum(colnames(t) == name) > 0){
88
- best = cbind(best, t[,name]);
89
- names = c(names, name);
90
- }else{
91
- best = cbind(best, ratios[,name]);
92
- names = c(names, paste(name,'[ratio]', sep=" "));
93
- }
94
- }
95
- rownames(best) <- rownames(ratios)
96
- orders = as.data.frame(MA.get_order(best));
97
- colnames(orders) <- names
98
-
99
- return(list(ratios = ratios, t = t, p.values = p.values, orders = orders));
100
- }
101
- }
102
-
103
-
4
+ ####################################################
5
+ # Data retrieval functions
104
6
 
105
7
  GEO.get <- function(name, cachedir = NULL){
106
8
  if (is.null(cachedir) && exists('MARQ.config')){
@@ -161,7 +63,10 @@ GEO.GDS.data <- function(name, id.field = NULL, translation.file = NULL, cachedi
161
63
  if (!is.null(id.field)){
162
64
  trans = featureData(eSet)[[id.field]];
163
65
  }
164
- if (!is.null(translation.file)){
66
+ if (!is.null(translation.file) && translation.file != FALSE){
67
+ if (translation.file == TRUE && exists(MARQ.platform.path)){
68
+ translation.file = paste(MARQ.platform.path(gpl_name), 'translations', sep="/");
69
+ }
165
70
  trans = scan(file=translation.file,what=character(),sep="\n",quiet=T);
166
71
  }
167
72
 
@@ -169,33 +74,12 @@ GEO.GDS.data <- function(name, id.field = NULL, translation.file = NULL, cachedi
169
74
  m <- MA.translate(m, trans);
170
75
  }
171
76
 
172
- return (list(conditions = conditions, m = m, two.channel = two.channel, description = description))
173
- }
174
77
 
175
78
 
176
- GEO.GDS.process <- function(name, prefix, id.field = NULL, translation.file = NULL,cachedir=NULL){
177
- tryCatch(
178
- {
179
- gds.data = GEO.GDS.data(name, id.field, translation.file, cachedir)
180
- values = GEO.values(gds.data)
181
- if (is.null(values)){
182
- write(file=paste(prefix,'skip',sep="."), "No suitable samples for analysis" );
183
- }else{
184
- MA.save(prefix, values$orders, values$ratios, values$t, values$p.values, colnames(values$orders), gds.data$description);
185
- }
186
- }
187
- ,
188
- error=function(x){
189
- print("Exception caught");
190
- print(x);
191
- write(file=paste(prefix,'skip',sep="."), paste("An exception was caught during the analysis.",x,sep="\n") );
192
- }
193
- )
79
+ return (list(conditions = conditions, m = m, two.channel = two.channel, description = description))
194
80
  }
195
81
 
196
82
 
197
-
198
-
199
83
  GEO.GSE.data <- function(gsms, conditions, do.log2 = NULL, translation.file = NULL, use.fields = NULL, cachedir = NULL){
200
84
 
201
85
  c = sapply(conditions,function(x){x});
@@ -261,7 +145,10 @@ GEO.GSE.data <- function(gsms, conditions, do.log2 = NULL, translation.file = NU
261
145
  }
262
146
 
263
147
  trans = NULL
264
- if (!is.null(translation.file)){
148
+ if (!is.null(translation.file) && translation.file != FALSE){
149
+ if (translation.file == TRUE && exists(MARQ.platform.path)){
150
+ translation.file = paste(MARQ.platform.path(gpl_name), 'translations', sep="/");
151
+ }
265
152
  trans = read.table(file=translation.file, sep="\t",header=F)[,1];
266
153
  }
267
154
  if (!is.null(trans)){
@@ -282,6 +169,63 @@ GEO.GSE.data <- function(gsms, conditions, do.log2 = NULL, translation.file = NU
282
169
  return (list(conditions = conditions, m = m, two.channel = two.channel))
283
170
  }
284
171
 
172
+
173
+ ####################################################
174
+ # Processing Functions
175
+
176
# Build the ratio, t, p-value and ordering tables for one dataset.
#
# data: list with the elements `m`, `conditions` and `two.channel`; they are
#       handed unchanged to MA.process().
#
# Returns NULL when MA.process() produces no ratios.  Otherwise returns a list
# with the data frames `ratios`, `t`, `p.values` and `orders`.  `orders` has
# one column per column of `ratios`: it is computed from the t column of the
# same name when one exists, and from the ratio column otherwise (in which
# case the column name gets a " [ratio]" suffix).
GEO.values <- function(data){
    values <- MA.process(data$m, data$conditions, data$two.channel)

    if (length(values$ratios) == 0){
        return(NULL)
    }

    ratios   <- as.data.frame(values$ratios)
    t.values <- as.data.frame(values$t)
    p.values <- as.data.frame(values$p.values)

    # Calculate orders from best information: prefer the t column over the
    # plain ratio whenever both are available for a comparison.
    comparisons <- colnames(ratios)
    has.t <- comparisons %in% colnames(t.values)

    # Collect all columns first and bind them once, instead of re-copying the
    # matrix with cbind() on every iteration.
    best <- do.call(cbind, lapply(seq_along(comparisons), function(i){
        if (has.t[i]) t.values[, comparisons[i]] else ratios[, comparisons[i]]
    }))
    rownames(best) <- rownames(ratios)

    orders <- as.data.frame(MA.get_order(best))
    colnames(orders) <- ifelse(has.t, comparisons, paste(comparisons, '[ratio]', sep = " "))

    return(list(ratios = ratios, t = t.values, p.values = p.values, orders = orders))
}
206
+
207
+
208
+
209
# Process one GDS dataset and save the results under `prefix`.
#
# name:             dataset name, passed to GEO.GDS.data().
# prefix:           path prefix for the output; "<prefix>.skip" is written
#                   instead of results when the dataset cannot be analysed.
# id.field, translation.file, cachedir: passed unchanged to GEO.GDS.data().
#
# Any error raised while loading or processing is caught, printed, and
# recorded in "<prefix>.skip" rather than propagated to the caller.
GEO.GDS.process <- function(name, prefix, id.field = NULL, translation.file = NULL,cachedir=NULL){
    skip.file <- paste(prefix, 'skip', sep = ".")

    tryCatch(
        {
            gds.data <- GEO.GDS.data(name, id.field, translation.file, cachedir)
            values <- GEO.values(gds.data)
            if (is.null(values)){
                write(file = skip.file, "No suitable samples for analysis")
            }else{
                MA.save(prefix, values$orders, values$ratios, values$t, values$p.values, colnames(values$orders), gds.data$description)
            }
        },
        error = function(x){
            print("Exception caught")
            print(x)
            # Pasting the condition object itself coerces it to a
            # (message, call) vector and writes the header line twice;
            # use only its message text.
            write(file = skip.file, paste("An exception was caught during the analysis.", conditionMessage(x), sep = "\n"))
        }
    )
}
228
+
285
229
  GEO.GSE.process <- function(gsms, conditions, prefix, do.log2 = NULL, translation.file = NULL, use.field = NULL, title = NULL, description = NULL,cachedir=NULL){
286
230
  tryCatch(
287
231
  {
@@ -0,0 +1,44 @@
1
+ library(GEOquery);
2
+
3
# Convert a GDS object into an ExpressionSet.
#
# The original version of the function failed if the dataset had extra probe
# ids not in the platform.  This version drops those probes instead.
#
# GDS:      the GDS object to convert.
# do.log2:  if TRUE, the expression values are log2-transformed.
# GPL:      platform object; when NULL it is fetched with getGEO() using the
#           platform named in the GDS metadata.
# AnnotGPL: passed to getGEO() when the platform has to be fetched.
#
# Returns an ExpressionSet restricted to the probes found in the platform.
"GDS2eSet" <-
function(GDS,do.log2=FALSE,GPL=NULL,AnnotGPL=TRUE) {
    library(Biobase)

    if(is.null(GPL)) {
        GPL <- getGEO(Meta(GDS)$platform,AnnotGPL=AnnotGPL)
    }

    gds.table <- Table(GDS)
    gpl.table <- Table(GPL)

    # Position of every GDS probe in the platform table (NA when absent)
    ord.table <- match(gds.table[,1], gpl.table[,1])

    # exclude non-numeric columns; drop = FALSE keeps a data frame even when
    # the dataset has a single GSM column
    inc.columns <- grep('GSM', colnames(gds.table))
    mat <- suppressWarnings(as.matrix(apply(gds.table[, inc.columns, drop = FALSE], 2,
                                            function(x) {as.numeric(as.character(x))})))
    if(do.log2) {
        expr <- log2(mat)
    } else {
        expr <- mat
    }
    rownames(expr) <- as.character(gds.table$ID_REF)

    tmp <- Columns(GDS)
    rownames(tmp) <- as.character(tmp$sample)
    pheno <- new("AnnotatedDataFrame",data=tmp)

    mabstract=ifelse(is.null(Meta(GDS)$description),"",Meta(GDS)$description)
    mpubmedids=ifelse(is.null(Meta(GDS)$pubmed_id),"",Meta(GDS)$pubmed_id)
    mtitle=ifelse(is.null(Meta(GDS)$title),"",Meta(GDS)$title)

    dt <- gpl.table
    rownames(dt) <- as.character(dt$ID)
    featuredata <- new('AnnotatedDataFrame',data=dt[ord.table,],
                       varMetadata=data.frame(Column=Columns(GPL)[,1],
                                              labelDescription=Columns(GPL)[,2]))

    # use !is.na(ord.table) to remove extra probe ids in GDS and not in GPL;
    # drop = FALSE keeps the expression matrix two-dimensional when only one
    # probe or one sample remains
    in.platform <- !is.na(ord.table)
    eset <- new('ExpressionSet',exprs=expr[in.platform, , drop = FALSE],phenoData=pheno,
                featureData=featuredata[in.platform,],
                experimentData=new("MIAME",
                                   abstract=mabstract,
                                   title=mtitle,
                                   pubMedIds=mpubmedids,
                                   other=Meta(GDS)))
    return(eset)
}
44
+
data/R/MARQ.R CHANGED
@@ -1,3 +1,79 @@
1
1
  library('yaml');
2
2
 
3
3
  MARQ.config = yaml.load_file('~/.MARQ');
4
+
5
+
6
+ ####################################################
7
+ # GEO platforms and datasets helper functions
8
+
9
# Locate the processed files of a dataset and return their common path prefix.
#
# dataset:        dataset name (e.g. a GDS or GSE identifier).
# cross_platform: if TRUE, look for the "_cross_platform" variant; the suffix
#                 is appended unless `dataset` already contains it.
# datadir:        GEO data directory; defaults to "<MARQ.config$datadir>/GEO"
#                 when MARQ.config exists.
#
# Searches "<datadir>/*/*/<dataset>.orders" and returns the first match with
# the trailing ".orders" removed, or NULL when nothing matches.  Stops with an
# error when no data directory can be determined.
MARQ.GEO.path <- function(dataset, cross_platform = FALSE, datadir = NULL){
    if (is.null(datadir) && exists('MARQ.config')){
        datadir <- paste(MARQ.config$datadir, 'GEO', sep = "/")
    }

    if (is.null(datadir)){
        stop("No datadir specified and no default found (MARQ.config$datadir)", call. = FALSE)
    }

    if (cross_platform && length(grep('_cross_platform', dataset)) == 0){
        dataset <- paste(dataset, '_cross_platform', sep = "")
    }

    files <- Sys.glob(paste(datadir, '*', '*', paste(dataset, 'orders', sep = "."), sep = "/"))

    if (length(files) == 0){
        return(NULL)
    }

    # Anchor and escape the pattern so that only the file extension is
    # removed, never an "?orders" fragment earlier in the path.
    return(sub('\\.orders$', '', files[1]))
}
34
+
35
# Return the platform identifier (the "GPL<digits>" part of the path) of a
# dataset, or NULL when MARQ.GEO.path() cannot find the dataset.
#
# dataset: dataset name, passed to MARQ.GEO.path().
# datadir: GEO data directory, passed to MARQ.GEO.path().
MARQ.GEO.platform <- function(dataset, datadir = NULL){
    dataset.path <- MARQ.GEO.path(dataset, datadir = datadir)

    if (!is.null(dataset.path)){
        return(sub(".*(GPL\\d+).*", "\\1", dataset.path, perl = TRUE))
    }

    return(NULL)
}
43
+
44
# Return the directory of a platform inside the GEO data directory.
#
# platform: platform identifier, used as the directory name.
# datadir:  GEO data directory; defaults to "<MARQ.config$datadir>/GEO" when
#           MARQ.config exists.
#
# Returns "<datadir>/<platform>".  Stops with an error when no data directory
# can be determined.
MARQ.GEO.platform.path <- function(platform, datadir = NULL){
    if (is.null(datadir) && exists('MARQ.config')){
        datadir <- paste(MARQ.config$datadir, 'GEO', sep = "/")
    }

    if (is.null(datadir)){
        stop("No datadir specified and no default found (MARQ.config$datadir)", call. = FALSE)
    }

    return(paste(datadir, platform, sep = "/"))
}
56
+
57
# List the dataset identifiers that have an ".orders" file under a platform.
#
# platform:       platform identifier, resolved with MARQ.GEO.platform.path().
# cross_platform: if TRUE, only "*_cross_platform.orders" files are matched;
#                 otherwise every "*.orders" file is matched.
# series:         if TRUE, every subdirectory of the platform is searched;
#                 otherwise only the "GDS" subdirectory.
# datadir:        GEO data directory, passed to MARQ.GEO.platform.path().
#
# Returns a character vector with the "GDS<digits>" / "GSE<digits>" part of
# each matching path; character(0) when nothing matches.
MARQ.GEO.platform.datasets <- function(platform, cross_platform = TRUE, series = TRUE, datadir = NULL){
    cp.suffix <- if (cross_platform) '_cross_platform' else ''
    subdir <- if (series) '*' else 'GDS'

    files <- Sys.glob(paste(MARQ.GEO.platform.path(platform, datadir),
                            subdir,
                            paste('*', cp.suffix, '.orders', sep = ""),
                            sep = "/"))

    # sub() is vectorised, so the result is always a character vector, even
    # when no file matched (sapply() would return an empty list there).
    return(sub(".*((?:GDS|GSE)\\d+).*", '\\1', files, perl = TRUE))
}
75
+
76
# Load the saved results of a dataset.
#
# Resolves the dataset's path prefix with MARQ.GEO.path() and hands it to
# MA.load() together with the four flags, in the order
# orders, logratios, t, p.values.
MARQ.GEO.load <- function(dataset, cross_platform = FALSE, orders = TRUE, logratios = TRUE, t = TRUE, p.values = TRUE){
    MA.load(
        MARQ.GEO.path(dataset, cross_platform),
        orders,
        logratios,
        t,
        p.values
    )
}
79
+
@@ -5,71 +5,144 @@ require 'MARQ/ID'
5
5
  require 'yaml'
6
6
  require 'progress-monitor'
7
7
  require 'MARQ/MADB'
8
+ require 'rbbt/sources/organism'
8
9
 
9
10
  $platform ||= ENV['platform']
10
- $org ||= [$organism, ENV['organism'], nil].reject{|e| e.nil?}.first
11
- $series ||= ENV['series']
11
+ $organism ||= [$organism, ENV['organism'], nil].compact.first
12
+ $dataset ||= ENV['dataset']
12
13
 
13
14
  $expr_threshold ||= (ENV['threshold'] || 0.05).to_f
14
15
  $folds ||= (ENV['folds'] || 2.5).to_f
15
16
  $nth_genes ||= (ENV['nth_genes'] || 100).to_i
16
17
 
17
- $force = [$force, ENV['force'], false]. reject{|e| e.nil?}.first
18
- $update_db = [$update_db, ENV['update_db'], true]. reject{|e| e.nil?}.first
19
- $fdr = [$fdr, ENV['fdr'], true]. reject{|e| e.nil?}.first
20
- $do_folds = [$do_folds, ENV['do_folds'], true]. reject{|e| e.nil?}.first
18
+ $force = [$force, ENV['force'], false].compact.first.to_s == 'true'
19
+ $tranlations = [$tranlations, ENV['translations'], false].compact.first.to_s == 'true'
20
+ $series = [$series, ENV['series'], true].compact.first.to_s == 'true'
21
+ $update_db = [$update_db, ENV['update_db'], false].compact.first.to_s == 'true'
22
+ $skip_db = [$skip_db, ENV['skip_db'], false].compact.first.to_s == 'true'
23
+ $fdr = [$fdr, ENV['fdr'], true].compact.first.to_s == 'true'
24
+ $do_folds = [$do_folds, ENV['do_folds'], true].compact.first.to_s == 'true'
21
25
 
22
26
 
23
27
  # Record changes in order to update DB
24
28
  $changes = false
25
- module GEO
29
+ module GEO::Process::R
26
30
  class << self
27
- alias_method :get_GDS_old, :get_GDS
28
- def get_GDS(*args)
31
+ alias_method :GDS_old, :GDS
32
+ def GDS(*args)
29
33
  $changes = true
30
- get_GDS_old(*args)
34
+ GDS_old(*args)
31
35
  end
32
36
 
33
- alias_method :get_GSE_old, :get_GSE
34
- def get_GSE(*args)
37
+ alias_method :GSE_old, :GSE
38
+ def GSE(*args)
35
39
  $changes = true
36
- get_GSE_old(*args)
40
+ GSE_old(*args)
37
41
  end
38
42
  end
39
43
  end
40
44
 
41
45
 
42
- def process_platform(platform, update_db = false)
43
- begin
44
- $changes = false
45
- GEO.process_platform(platform)
46
- GEO.process_platform_datasets(platform, $force && !update_db)
47
- if update_db && ($changes || $force)
48
- puts "Saving #{platform}"
49
- MADB::GEO.saveGPL(platform) if update_db && ($changes || $force)
50
- end
51
- rescue
52
- puts $!.message
46
# Build the work list for the 'data' task: a hash mapping each platform to the
# datasets (GDS) and series (GSE) that have to be processed on it.
#
# Selection, in order of precedence:
# * $dataset  - only that dataset, on its own platform.
# * $platform - every dataset of that platform.
# * otherwise - every platform of $organism (or of all organisms), plus the
#   series described in series/*.yaml when $series is set.
#
# With $tranlations or $update_db set, platforms are listed with an empty
# dataset list, so they are visited without reprocessing datasets.
def process_list
  return {GEO::Remote::dataset_platform($dataset) => [$dataset]} if $dataset
  return {$platform => GEO::Remote::platform_datasets($platform)} if $platform

  organisms = $organism ? [$organism] : Organism.all(true)

  list = {}
  organisms.each{|organism|
    GEO::Remote::organism_platforms(organism).each{|platform|
      datasets = GEO::Remote::platform_datasets(platform)

      # Platforms with no datasets are skipped, although if they have series
      # they may be considered later
      next if datasets.empty?

      if $tranlations || $update_db
        list[platform] = []
      else
        if ! $force
          datasets.reject!{|dataset| Dir.glob(File.join(GEO.dataset_path(dataset, platform) || "MISSING", '.*')).any? }
        end
        list[platform] = datasets
      end
    }
  }

  # The flag is spelled $tranlations where it is assigned and everywhere else
  # it is read; the guard here used $translations, which is never set, so it
  # was always true.
  if $series && ! $tranlations
    series = Dir.glob('series/*.yaml').collect{|f| File.basename(f, '.yaml')}
    series.each{|serie|
      platform = GEO::SOFT.GSE(serie)[:platform]
      begin
        if organisms.include? GEO::SOFT::GPL(platform)[:organism].to_s
          list[platform] ||= []
          list[platform] << serie unless $tranlations || $update_db
        end
      rescue
        puts "Error process series #{serie} platform #{platform}"
        puts $!.message
        puts $!.backtrace.join("\n")
      end
    }
  end

  return list
end
55
95
 
56
- def process_serie(serie, update_db = false)
57
- begin
96
+ desc "Analyze datasets"
97
+ task 'data' do
98
+
99
+ platforms_to_save = []
100
+
101
+ process_list.each{|platform, datasets|
102
+
103
+ begin
104
+ # Prepare the platform
105
+ GEO::Process.GPL(platform)
106
+ rescue
107
+ puts "Error processing platform #{platform}"
108
+ puts $!.message
109
+ puts $!.backtrace.join("\n")
110
+ next
111
+ end
112
+
58
113
  $changes = false
59
- info = YAML::load(File.open("series/#{ serie }.yaml"))
114
+ # Process all datasets
115
+ datasets.each{|dataset|
116
+ begin
117
+ if dataset =~ /GDS/
118
+ GEO::Process.GDS(dataset, platform)
119
+ else
120
+ info = YAML::load(File.open("series/#{ dataset }.yaml"))
121
+ FileUtils.rm("platforms/#{ info[:platform] }.skip") if File.exist? "platforms/#{ info[:platform] }.skip"
122
+ GEO::Process.GSE(dataset, info)
123
+ end
124
+
125
+ # Mark the platform for saving in DB
126
+ rescue
127
+ puts "Error processing dataset #{ dataset }"
128
+ puts $!.message
129
+ puts $!.backtrace.join("\n")
130
+ end
131
+ }
60
132
 
61
- FileUtils.rm("platforms/#{ info[:platform] }.skip") if File.exist? "platforms/#{ info[:platform] }.skip"
62
- GEO.process_platform(info[:platform]) if !File.exist? info[:platform]
133
+ platforms_to_save << platform if $changes || $update_db
134
+ }
63
135
 
64
- GEO.process_GSE(serie, info)
65
- if info[:platform] && update_db
66
- puts "Saving #{info[:platform]}"
67
- MADB::GEO.saveGPL(info[:platform]) if info[:platform] && update_db
136
+ platforms_to_save.each{|platform|
137
+ begin
138
+ puts "Saving #{platform}"
139
+ MADB::GEO.saveGPL(platform)
140
+ rescue
141
+ puts "Error saving platform #{ platform }"
142
+ puts $!.message
143
+ puts $!.backtrace.join("\n")
68
144
  end
69
- info[:platform]
70
- rescue
71
- puts $!.message
72
- end
145
+ }
73
146
  end
74
147
 
75
148
  def annotations(name, cross_platform = false, &block)
@@ -98,7 +171,7 @@ def annotations(name, cross_platform = false, &block)
98
171
  end
99
172
  }
100
173
  return if $platform
101
-
174
+
102
175
  Progress.monitor("Annotating with #{ name }, series")
103
176
  series = Dir.glob('series/*.yaml').collect{|f| File.basename(f).sub(/.yaml/,'')}
104
177
  series.each{|serie|
@@ -125,42 +198,6 @@ def goterms(org, list, slim, threshold)
125
198
  end
126
199
 
127
200
 
128
- task 'data' do
129
- if $platform
130
- process_platform($platform, $update_db)
131
- elsif $series
132
- process_serie($series, $update_db)
133
- elsif $org
134
- org = $org
135
- platforms = GEO::Eutils.organism_platforms(org).select{|platform| GEO::Eutils.GPL_datasets(platform).any?}
136
- Progress.monitor("Platforms for #{Organism.name(org)}")
137
- platforms.each{|platform|
138
- puts "Platform #{ platform }"
139
- process_platform(platform,$update_db)
140
- }
141
- else
142
- # Series
143
- series = Dir.glob('series/*.yaml').collect{|f| File.basename(f).sub(/.yaml/,'')}
144
- platform2save = []
145
- series.each{|serie|
146
- platform = process_serie(serie, false)
147
- platform2save << platform if $update_db && ($changes || $force)
148
- }
149
-
150
- Progress.monitor("Saving #{platform2save.uniq.length} platforms: ")
151
- platform2save.uniq.each{|platform| MADB::GEO.saveGPL(platform) }
152
-
153
- # Platforms
154
- for org in Organism.all
155
- platforms = GEO::Eutils.organism_platforms(org).select{|platform| GEO::Eutils.GPL_datasets(platform).any?}
156
- Progress.monitor("Platforms for #{Organism.name(org)}")
157
- platforms.each{|platform|
158
- puts "Platform #{ platform }"
159
- process_platform(platform,$update_db)
160
- }
161
- end
162
- end
163
- end
164
201
 
165
202
  task 'annotate_Words' do
166
203
  require 'MARQ/annotations'