rbbt-GE 0.2.0 → 1.0.0
This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
- data/lib/rbbt/GE/GEO.rb +28 -14
- data/lib/rbbt/GE.rb +9 -6
- data/share/install/GEO/Rakefile +20 -0
- data/share/lib/R/MA.R +20 -1
- metadata +34 -49
data/lib/rbbt/GE/GEO.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
require 'rbbt-util'
|
2
3
|
require 'rbbt/GE'
|
3
4
|
require 'rbbt/sources/organism'
|
@@ -9,7 +10,7 @@ module GEO
|
|
9
10
|
self.pkgdir = "geo"
|
10
11
|
self.subdir = "arrays"
|
11
12
|
|
12
|
-
GEO.claim GEO.root
|
13
|
+
GEO.claim GEO.root, :rake, Rbbt.share.install.GEO.Rakefile.find(:lib)
|
13
14
|
|
14
15
|
def self.comparison_name(field, condition, control)
|
15
16
|
condition = condition * " AND " if Array === condition
|
@@ -74,11 +75,12 @@ module GEO
|
|
74
75
|
}
|
75
76
|
|
76
77
|
GDS_INFO = {
|
77
|
-
:DELIMITER => "\\^SUBSET
|
78
|
-
:
|
79
|
-
:
|
80
|
-
:
|
81
|
-
:
|
78
|
+
:DELIMITER => "\\^SUBSET",
|
79
|
+
:value_type => "!dataset_value_type",
|
80
|
+
:channel_count => "!dataset_channel_count",
|
81
|
+
:platform => "!dataset_platform",
|
82
|
+
:reference_series => "!dataset_reference_series",
|
83
|
+
:description => "!dataset_description"
|
82
84
|
}
|
83
85
|
|
84
86
|
GDS_SUBSET_INFO = {
|
@@ -149,7 +151,7 @@ module GEO
|
|
149
151
|
end
|
150
152
|
|
151
153
|
def self.guess_id(organism, codes)
|
152
|
-
num_codes = codes.
|
154
|
+
num_codes = codes.length
|
153
155
|
best = nil
|
154
156
|
best_count = 0
|
155
157
|
new_fields = []
|
@@ -160,9 +162,15 @@ module GEO
|
|
160
162
|
values = CMD.cmd("cat #{ codefile }|cut -f #{ i + 1 }| tr '|' '\\n'|grep [[:alpha:]]|sort -u").read.split("\n").reject{|code| code.empty?}
|
161
163
|
|
162
164
|
new_field, count = Organism.guess_id(organism, values)
|
165
|
+
new_field ||= field
|
166
|
+
count ||= 0
|
167
|
+
new_field = "UNKNOWN(#{new_field})" unless count > (num_codes > 20000 ? 20000 : num_codes).to_f * 0.2 and count > values.uniq.length * 0.5
|
168
|
+
|
169
|
+
Log.debug "Original field: #{ field }. New: #{new_field}. Count: #{ count }/#{num_codes}/#{values.uniq.length}"
|
170
|
+
new_fields << new_field
|
171
|
+
|
163
172
|
field_counts[new_field] = count
|
164
|
-
|
165
|
-
new_fields << (count > (num_codes > 20000 ? 20000 : num_codes).to_f * 0.5 ? new_field : "UNKNOWN(#{ field })")
|
173
|
+
|
166
174
|
if count > best_count
|
167
175
|
best = new_field
|
168
176
|
best_count = count
|
@@ -184,7 +192,8 @@ module GEO
|
|
184
192
|
code_file = File.join(directory, 'codes')
|
185
193
|
info_file = File.join(directory, 'info.yaml')
|
186
194
|
|
187
|
-
|
195
|
+
# Fix platforms with the '.\d' extension (eg. NM_020527.1)
|
196
|
+
stream = CMD.cmd('sed \'s/\.[[:digit:]]\+\(\t\|$\)/\1/g;s/ *\/\/[^\t]*//g\'', :in => Open.open(GPL_URL.gsub('#PLATFORM#', platform), :nocache => true), :pipe => true)
|
188
197
|
|
189
198
|
info = parse_header(stream, GPL_INFO)
|
190
199
|
info[:code_file] = code_file
|
@@ -278,7 +287,9 @@ module GEO
|
|
278
287
|
|
279
288
|
samples << sample
|
280
289
|
|
281
|
-
|
290
|
+
chunk = chunk.encode "UTF-8"
|
291
|
+
chunk = Misc.fixutf8 chunk
|
292
|
+
sample_values = TSV.open(StringIO.new(chunk.match(/!sample_table_begin\n(.*)\n!sample_table_end/mi)[1].strip), :type => :list, :header_hash => '')
|
282
293
|
sample_values.fields = [sample]
|
283
294
|
|
284
295
|
if values.nil?
|
@@ -310,12 +321,14 @@ module GEO
|
|
310
321
|
key_field = TSV.parse_header(GEO[info[:platform]]['codes'].open).key_field
|
311
322
|
values.key_field = key_field
|
312
323
|
|
313
|
-
|
314
|
-
Open.write(info_file, info.to_yaml)
|
315
|
-
|
324
|
+
info[:sample_info] ||= sample_info
|
316
325
|
info[:channel_count] ||= sample_info.values.first[:channel_count]
|
317
326
|
info[:value_type] ||= sample_info.values.first[:value_type]
|
318
327
|
|
328
|
+
|
329
|
+
Open.write(value_file, values.to_s)
|
330
|
+
Open.write(info_file, info.to_yaml)
|
331
|
+
|
319
332
|
info
|
320
333
|
end
|
321
334
|
end
|
@@ -346,3 +359,4 @@ module GEO
|
|
346
359
|
GE.analyze(value_file, condition_samples, control_samples, log2, path, format)
|
347
360
|
end
|
348
361
|
end
|
362
|
+
|
data/lib/rbbt/GE.rb
CHANGED
@@ -9,15 +9,18 @@ module GE
|
|
9
9
|
R.run(cmd)
|
10
10
|
end
|
11
11
|
|
12
|
-
def self.r_format(list)
|
12
|
+
def self.r_format(list, options = {})
|
13
|
+
strings = options[:strings]
|
13
14
|
case
|
14
15
|
when list.nil?
|
15
16
|
"NULL"
|
16
17
|
when Array === list
|
17
|
-
"c(#{list.collect{|e| r_format e} * ", "})"
|
18
|
-
when (
|
18
|
+
"c(#{list.collect{|e| r_format e, options} * ", "})"
|
19
|
+
when (Fixnum === list or Float === list)
|
20
|
+
list.to_s
|
21
|
+
when (not strings and String === list and list === list.to_i.to_s)
|
19
22
|
list.to_i
|
20
|
-
when (String === list and list === list.to_f.to_s)
|
23
|
+
when (not strings and String === list and list === list.to_f.to_s)
|
21
24
|
list.to_f
|
22
25
|
when TrueClass === list
|
23
26
|
"TRUE"
|
@@ -28,9 +31,9 @@ module GE
|
|
28
31
|
end
|
29
32
|
end
|
30
33
|
|
31
|
-
def self.analyze(datafile, main, contrast = nil, log2 = false, outfile = nil, key_field = nil)
|
34
|
+
def self.analyze(datafile, main, contrast = nil, log2 = false, outfile = nil, key_field = nil, two_channel = nil)
|
32
35
|
FileUtils.mkdir_p File.dirname(outfile) unless outfile.nil? or File.exists? File.dirname(outfile)
|
33
|
-
GE.run_R("rbbt.GE.process(#{ r_format datafile }, main = #{r_format(main)}, contrast = #{r_format(contrast)}, log2=#{ r_format log2 }, outfile = #{r_format outfile}, key.field = #{r_format key_field})")
|
36
|
+
GE.run_R("rbbt.GE.process(#{ r_format datafile }, main = #{r_format(main, :strings => true)}, contrast = #{r_format(contrast, :strings => true)}, log2=#{ r_format log2 }, outfile = #{r_format outfile}, key.field = #{r_format key_field}, two.channel = #{r_format two_channel})")
|
34
37
|
end
|
35
38
|
end
|
36
39
|
|
data/share/install/GEO/Rakefile
CHANGED
@@ -6,6 +6,26 @@ rule /^(GPL\d+)\/?(codes|info\.yaml)?$/ do |t|
|
|
6
6
|
GEO::SOFT.GPL(platform, file.nil? ? t.name : File.dirname(t.name))
|
7
7
|
end
|
8
8
|
|
9
|
+
rule /^(GDS\d+)\/samples$/ => [proc{|t| t.sub('samples', 'info.yaml')}, proc{|t| t.sub('samples', 'values')} ] do |t|
|
10
|
+
info = YAML.load(Open.open(t.prerequisites.first))
|
11
|
+
|
12
|
+
subsets = info[:subsets]
|
13
|
+
fields = subsets.keys
|
14
|
+
|
15
|
+
all_samples = TSV::Parser.new(Open.open(t.prerequisites.last)).fields
|
16
|
+
samples = TSV.setup(all_samples, :key_field => "Sample", :fields => [], :type => :list)
|
17
|
+
|
18
|
+
fields.each do |field|
|
19
|
+
assignments = subsets[field]
|
20
|
+
|
21
|
+
samples = samples.add_field field do |sample, v|
|
22
|
+
assignments.select{|value, list| list.include? sample}.first.first
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
Open.write(t.name, samples.to_s)
|
27
|
+
end
|
28
|
+
|
9
29
|
rule /^(GDS\d+)\/?(values|info\.yaml)?$/ do |t|
|
10
30
|
t.name =~ /^(GDS\d+)\/?(values|info\.yaml)?/
|
11
31
|
dataset = $1
|
data/share/lib/R/MA.R
CHANGED
@@ -55,13 +55,31 @@ rbbt.GE.process.limma.twoside <- function(expr, subset.main, subset.contrast){
|
|
55
55
|
}
|
56
56
|
|
57
57
|
|
58
|
+
rbbt.GE.guess.log2 <- function(m, two.channel){
|
59
|
+
if (two.channel){
|
60
|
+
return (sum(m < 0, na.rm = TRUE) == 0);
|
61
|
+
}else{
|
62
|
+
return (max(m, na.rm = TRUE) > 100);
|
63
|
+
}
|
64
|
+
}
|
65
|
+
|
66
|
+
|
58
67
|
|
59
|
-
rbbt.GE.process <- function(file, main, contrast = NULL, log2 = FALSE, outfile = NULL, key.field = NULL){
|
68
|
+
rbbt.GE.process <- function(file, main, contrast = NULL, log2 = FALSE, outfile = NULL, key.field = NULL, two.channel = NULL){
|
60
69
|
data = rbbt.tsv(file);
|
61
70
|
ids = rownames(data);
|
62
71
|
|
72
|
+
if (is.null(log2)){
|
73
|
+
print(str(data));
|
74
|
+
log2 = rbbt.GE.guess.log2(data, two.channel)
|
75
|
+
}
|
76
|
+
|
63
77
|
if (log2){
|
64
78
|
data = log2(data);
|
79
|
+
min = min(data[data != -Inf])
|
80
|
+
data[data == -Inf] = min
|
81
|
+
print(summary(data));
|
82
|
+
return
|
65
83
|
}
|
66
84
|
|
67
85
|
if (is.null(contrast)){
|
@@ -103,6 +121,7 @@ rbbt.GE.process <- function(file, main, contrast = NULL, log2 = FALSE, outfile =
|
|
103
121
|
|
104
122
|
if (! is.null(limma)){
|
105
123
|
result = data.frame(ratio = ratio[ids], t.values = limma$t[ids], p.values = limma$p.values[ids])
|
124
|
+
result["adjusted.p.values"] = p.adjust(result$p.values, "fdr")
|
106
125
|
}else{
|
107
126
|
result = data.frame(ratio = ratio)
|
108
127
|
}
|
metadata
CHANGED
@@ -1,45 +1,39 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-GE
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 2
|
9
|
-
- 0
|
10
|
-
version: 0.2.0
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Miguel Vazquez
|
14
9
|
autorequire:
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2012-12-21 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
21
15
|
name: rbbt-util
|
22
|
-
|
23
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
24
17
|
none: false
|
25
|
-
requirements:
|
26
|
-
- -
|
27
|
-
- !ruby/object:Gem::Version
|
28
|
-
|
29
|
-
segments:
|
30
|
-
- 0
|
31
|
-
version: "0"
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
32
22
|
type: :runtime
|
33
|
-
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
34
30
|
description: Gene Expression in RBBT
|
35
31
|
email: miguel.vazquez@cnio.es
|
36
32
|
executables: []
|
37
|
-
|
38
33
|
extensions: []
|
39
|
-
|
40
|
-
extra_rdoc_files:
|
34
|
+
extra_rdoc_files:
|
41
35
|
- LICENSE
|
42
|
-
files:
|
36
|
+
files:
|
43
37
|
- LICENSE
|
44
38
|
- lib/rbbt/GE.rb
|
45
39
|
- lib/rbbt/GE/GEO.rb
|
@@ -49,37 +43,28 @@ files:
|
|
49
43
|
- test/rbbt/GE/test_GEO.rb
|
50
44
|
homepage: http://github.com/mikisvaz/rbbt-GE
|
51
45
|
licenses: []
|
52
|
-
|
53
46
|
post_install_message:
|
54
47
|
rdoc_options: []
|
55
|
-
|
56
|
-
require_paths:
|
48
|
+
require_paths:
|
57
49
|
- lib
|
58
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
59
51
|
none: false
|
60
|
-
requirements:
|
61
|
-
- -
|
62
|
-
- !ruby/object:Gem::Version
|
63
|
-
|
64
|
-
|
65
|
-
- 0
|
66
|
-
version: "0"
|
67
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ! '>='
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
57
|
none: false
|
69
|
-
requirements:
|
70
|
-
- -
|
71
|
-
- !ruby/object:Gem::Version
|
72
|
-
|
73
|
-
segments:
|
74
|
-
- 0
|
75
|
-
version: "0"
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
76
62
|
requirements: []
|
77
|
-
|
78
63
|
rubyforge_project:
|
79
|
-
rubygems_version: 1.8.
|
64
|
+
rubygems_version: 1.8.24
|
80
65
|
signing_key:
|
81
66
|
specification_version: 3
|
82
67
|
summary: Gene Expression in RBBT
|
83
|
-
test_files:
|
68
|
+
test_files:
|
84
69
|
- test/test_helper.rb
|
85
70
|
- test/rbbt/GE/test_GEO.rb
|