rbbt-GE 0.2.0 → 1.0.0

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
data/lib/rbbt/GE/GEO.rb CHANGED
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
  require 'rbbt-util'
2
3
  require 'rbbt/GE'
3
4
  require 'rbbt/sources/organism'
@@ -9,7 +10,7 @@ module GEO
9
10
  self.pkgdir = "geo"
10
11
  self.subdir = "arrays"
11
12
 
12
- GEO.claim GEO.root.find(:user), :rake, Rbbt.share.install.GEO.Rakefile.find(:lib)
13
+ GEO.claim GEO.root, :rake, Rbbt.share.install.GEO.Rakefile.find(:lib)
13
14
 
14
15
  def self.comparison_name(field, condition, control)
15
16
  condition = condition * " AND " if Array === condition
@@ -74,11 +75,12 @@ module GEO
74
75
  }
75
76
 
76
77
  GDS_INFO = {
77
- :DELIMITER => "\\^SUBSET|!sample_table_begin",
78
- :title => "!Sample_title",
79
- :accession => "!Sample_geo_accession",
80
- :channel_count => "!Sample_channel_count",
81
- :platform => "!Sample_platform_id",
78
+ :DELIMITER => "\\^SUBSET",
79
+ :value_type => "!dataset_value_type",
80
+ :channel_count => "!dataset_channel_count",
81
+ :platform => "!dataset_platform",
82
+ :reference_series => "!dataset_reference_series",
83
+ :description => "!dataset_description"
82
84
  }
83
85
 
84
86
  GDS_SUBSET_INFO = {
@@ -149,7 +151,7 @@ module GEO
149
151
  end
150
152
 
151
153
  def self.guess_id(organism, codes)
152
- num_codes = codes.size
154
+ num_codes = codes.length
153
155
  best = nil
154
156
  best_count = 0
155
157
  new_fields = []
@@ -160,9 +162,15 @@ module GEO
160
162
  values = CMD.cmd("cat #{ codefile }|cut -f #{ i + 1 }| tr '|' '\\n'|grep [[:alpha:]]|sort -u").read.split("\n").reject{|code| code.empty?}
161
163
 
162
164
  new_field, count = Organism.guess_id(organism, values)
165
+ new_field ||= field
166
+ count ||= 0
167
+ new_field = "UNKNOWN(#{new_field})" unless count > (num_codes > 20000 ? 20000 : num_codes).to_f * 0.2 and count > values.uniq.length * 0.5
168
+
169
+ Log.debug "Original field: #{ field }. New: #{new_field}. Count: #{ count }/#{num_codes}/#{values.uniq.length}"
170
+ new_fields << new_field
171
+
163
172
  field_counts[new_field] = count
164
- Log.debug "Original field: #{ field }. New: #{new_field}. Count: #{ count }/#{num_codes}"
165
- new_fields << (count > (num_codes > 20000 ? 20000 : num_codes).to_f * 0.5 ? new_field : "UNKNOWN(#{ field })")
173
+
166
174
  if count > best_count
167
175
  best = new_field
168
176
  best_count = count
@@ -184,7 +192,8 @@ module GEO
184
192
  code_file = File.join(directory, 'codes')
185
193
  info_file = File.join(directory, 'info.yaml')
186
194
 
187
- stream = Open.open(GPL_URL.gsub('#PLATFORM#', platform), :nocache => true, :pipe => true)
195
+ # Fix platforms with the '.\d' extension (eg. NM_020527.1)
196
+ stream = CMD.cmd('sed \'s/\.[[:digit:]]\+\(\t\|$\)/\1/g;s/ *\/\/[^\t]*//g\'', :in => Open.open(GPL_URL.gsub('#PLATFORM#', platform), :nocache => true), :pipe => true)
188
197
 
189
198
  info = parse_header(stream, GPL_INFO)
190
199
  info[:code_file] = code_file
@@ -278,7 +287,9 @@ module GEO
278
287
 
279
288
  samples << sample
280
289
 
281
- sample_values = TSV.open(StringIO.new(chunk.match(/!sample_table_begin(.*)!sample_table_end/msi)[0].strip), :type => :list, :header_hash => '')
290
+ chunk = chunk.encode "UTF-8"
291
+ chunk = Misc.fixutf8 chunk
292
+ sample_values = TSV.open(StringIO.new(chunk.match(/!sample_table_begin\n(.*)\n!sample_table_end/mi)[1].strip), :type => :list, :header_hash => '')
282
293
  sample_values.fields = [sample]
283
294
 
284
295
  if values.nil?
@@ -310,12 +321,14 @@ module GEO
310
321
  key_field = TSV.parse_header(GEO[info[:platform]]['codes'].open).key_field
311
322
  values.key_field = key_field
312
323
 
313
- Open.write(value_file, values.to_s)
314
- Open.write(info_file, info.to_yaml)
315
-
324
+ info[:sample_info] ||= sample_info
316
325
  info[:channel_count] ||= sample_info.values.first[:channel_count]
317
326
  info[:value_type] ||= sample_info.values.first[:value_type]
318
327
 
328
+
329
+ Open.write(value_file, values.to_s)
330
+ Open.write(info_file, info.to_yaml)
331
+
319
332
  info
320
333
  end
321
334
  end
@@ -346,3 +359,4 @@ module GEO
346
359
  GE.analyze(value_file, condition_samples, control_samples, log2, path, format)
347
360
  end
348
361
  end
362
+
data/lib/rbbt/GE.rb CHANGED
@@ -9,15 +9,18 @@ module GE
9
9
  R.run(cmd)
10
10
  end
11
11
 
12
- def self.r_format(list)
12
+ def self.r_format(list, options = {})
13
+ strings = options[:strings]
13
14
  case
14
15
  when list.nil?
15
16
  "NULL"
16
17
  when Array === list
17
- "c(#{list.collect{|e| r_format e} * ", "})"
18
- when (String === list and list === list.to_i.to_s)
18
+ "c(#{list.collect{|e| r_format e, options} * ", "})"
19
+ when (Fixnum === list or Float === list)
20
+ list.to_s
21
+ when (not strings and String === list and list === list.to_i.to_s)
19
22
  list.to_i
20
- when (String === list and list === list.to_f.to_s)
23
+ when (not strings and String === list and list === list.to_f.to_s)
21
24
  list.to_f
22
25
  when TrueClass === list
23
26
  "TRUE"
@@ -28,9 +31,9 @@ module GE
28
31
  end
29
32
  end
30
33
 
31
- def self.analyze(datafile, main, contrast = nil, log2 = false, outfile = nil, key_field = nil)
34
+ def self.analyze(datafile, main, contrast = nil, log2 = false, outfile = nil, key_field = nil, two_channel = nil)
32
35
  FileUtils.mkdir_p File.dirname(outfile) unless outfile.nil? or File.exists? File.dirname(outfile)
33
- GE.run_R("rbbt.GE.process(#{ r_format datafile }, main = #{r_format(main)}, contrast = #{r_format(contrast)}, log2=#{ r_format log2 }, outfile = #{r_format outfile}, key.field = #{r_format key_field})")
36
+ GE.run_R("rbbt.GE.process(#{ r_format datafile }, main = #{r_format(main, :strings => true)}, contrast = #{r_format(contrast, :strings => true)}, log2=#{ r_format log2 }, outfile = #{r_format outfile}, key.field = #{r_format key_field}, two.channel = #{r_format two_channel})")
34
37
  end
35
38
  end
36
39
 
@@ -6,6 +6,26 @@ rule /^(GPL\d+)\/?(codes|info\.yaml)?$/ do |t|
6
6
  GEO::SOFT.GPL(platform, file.nil? ? t.name : File.dirname(t.name))
7
7
  end
8
8
 
9
+ rule /^(GDS\d+)\/samples$/ => [proc{|t| t.sub('samples', 'info.yaml')}, proc{|t| t.sub('samples', 'values')} ] do |t|
10
+ info = YAML.load(Open.open(t.prerequisites.first))
11
+
12
+ subsets = info[:subsets]
13
+ fields = subsets.keys
14
+
15
+ all_samples = TSV::Parser.new(Open.open(t.prerequisites.last)).fields
16
+ samples = TSV.setup(all_samples, :key_field => "Sample", :fields => [], :type => :list)
17
+
18
+ fields.each do |field|
19
+ assignments = subsets[field]
20
+
21
+ samples = samples.add_field field do |sample, v|
22
+ assignments.select{|value, list| list.include? sample}.first.first
23
+ end
24
+ end
25
+
26
+ Open.write(t.name, samples.to_s)
27
+ end
28
+
9
29
  rule /^(GDS\d+)\/?(values|info\.yaml)?$/ do |t|
10
30
  t.name =~ /^(GDS\d+)\/?(values|info\.yaml)?/
11
31
  dataset = $1
data/share/lib/R/MA.R CHANGED
@@ -55,13 +55,31 @@ rbbt.GE.process.limma.twoside <- function(expr, subset.main, subset.contrast){
55
55
  }
56
56
 
57
57
 
58
+ rbbt.GE.guess.log2 <- function(m, two.channel){
59
+ if (two.channel){
60
+ return (sum(m < 0, na.rm = TRUE) == 0);
61
+ }else{
62
+ return (max(m, na.rm = TRUE) > 100);
63
+ }
64
+ }
65
+
66
+
58
67
 
59
- rbbt.GE.process <- function(file, main, contrast = NULL, log2 = FALSE, outfile = NULL, key.field = NULL){
68
+ rbbt.GE.process <- function(file, main, contrast = NULL, log2 = FALSE, outfile = NULL, key.field = NULL, two.channel = NULL){
60
69
  data = rbbt.tsv(file);
61
70
  ids = rownames(data);
62
71
 
72
+ if (is.null(log2)){
73
+ print(str(data));
74
+ log2 = rbbt.GE.guess.log2(data, two.channel)
75
+ }
76
+
63
77
  if (log2){
64
78
  data = log2(data);
79
+ min = min(data[data != -Inf])
80
+ data[data == -Inf] = min
81
+ print(summary(data));
82
+ return
65
83
  }
66
84
 
67
85
  if (is.null(contrast)){
@@ -103,6 +121,7 @@ rbbt.GE.process <- function(file, main, contrast = NULL, log2 = FALSE, outfile =
103
121
 
104
122
  if (! is.null(limma)){
105
123
  result = data.frame(ratio = ratio[ids], t.values = limma$t[ids], p.values = limma$p.values[ids])
124
+ result["adjusted.p.values"] = p.adjust(result$p.values, "fdr")
106
125
  }else{
107
126
  result = data.frame(ratio = ratio)
108
127
  }
metadata CHANGED
@@ -1,45 +1,39 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: rbbt-GE
3
- version: !ruby/object:Gem::Version
4
- hash: 23
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 2
9
- - 0
10
- version: 0.2.0
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Miguel Vazquez
14
9
  autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
-
18
- date: 2012-01-31 00:00:00 Z
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
12
+ date: 2012-12-21 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
21
15
  name: rbbt-util
22
- prerelease: false
23
- requirement: &id001 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
24
17
  none: false
25
- requirements:
26
- - - ">="
27
- - !ruby/object:Gem::Version
28
- hash: 3
29
- segments:
30
- - 0
31
- version: "0"
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
32
22
  type: :runtime
33
- version_requirements: *id001
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
34
30
  description: Gene Expression in RBBT
35
31
  email: miguel.vazquez@cnio.es
36
32
  executables: []
37
-
38
33
  extensions: []
39
-
40
- extra_rdoc_files:
34
+ extra_rdoc_files:
41
35
  - LICENSE
42
- files:
36
+ files:
43
37
  - LICENSE
44
38
  - lib/rbbt/GE.rb
45
39
  - lib/rbbt/GE/GEO.rb
@@ -49,37 +43,28 @@ files:
49
43
  - test/rbbt/GE/test_GEO.rb
50
44
  homepage: http://github.com/mikisvaz/rbbt-GE
51
45
  licenses: []
52
-
53
46
  post_install_message:
54
47
  rdoc_options: []
55
-
56
- require_paths:
48
+ require_paths:
57
49
  - lib
58
- required_ruby_version: !ruby/object:Gem::Requirement
50
+ required_ruby_version: !ruby/object:Gem::Requirement
59
51
  none: false
60
- requirements:
61
- - - ">="
62
- - !ruby/object:Gem::Version
63
- hash: 3
64
- segments:
65
- - 0
66
- version: "0"
67
- required_rubygems_version: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
57
  none: false
69
- requirements:
70
- - - ">="
71
- - !ruby/object:Gem::Version
72
- hash: 3
73
- segments:
74
- - 0
75
- version: "0"
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
76
62
  requirements: []
77
-
78
63
  rubyforge_project:
79
- rubygems_version: 1.8.10
64
+ rubygems_version: 1.8.24
80
65
  signing_key:
81
66
  specification_version: 3
82
67
  summary: Gene Expression in RBBT
83
- test_files:
68
+ test_files:
84
69
  - test/test_helper.rb
85
70
  - test/rbbt/GE/test_GEO.rb