lederhosen 2.0.6 → 2.0.7

Sign up to get free protection for your applications and to get access to all the features.
data/lederhosen.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "lederhosen"
8
- s.version = "2.0.6"
8
+ s.version = "2.0.7"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Austin G. Davis-Richardson"]
12
- s.date = "2013-02-27"
12
+ s.date = "2013-03-11"
13
13
  s.description = "Various tools for OTU clustering"
14
14
  s.email = "harekrishna@gmail.com"
15
15
  s.executables = ["lederhosen"]
@@ -18,57 +18,71 @@ module Lederhosen
18
18
 
19
19
  ohai "filtering otu file #{input} (reads = #{reads}, samples = #{min_samples})"
20
20
 
21
- cluster_sample_count = Hash.new { |h, k| h[k] = Hash.new }
21
+ # make one pass finding which OTUs to keep
22
+ # create mask that maps which columns correspond to good OTUs
23
+ # pass over table again printing only those columns
22
24
 
23
- ohai "loading csv file #{input}"
25
+ seen = Hash.new { |h, k| h[k] = 0 }
24
26
 
25
- # slurp up CSV file
26
- File.open input do |handle|
27
+ otu_order = []
28
+
29
+ pbar = ProgressBar.new 'counting', File.size(input)
30
+ total_reads = 0
31
+
32
+ File.open(input) do |handle|
27
33
  header = handle.gets.strip.split(',')
28
- cluster_ids = header[0..-1]
34
+ header.each { |x| otu_order << x }
35
+
29
36
  handle.each do |line|
37
+ pbar.set handle.pos
30
38
  line = line.strip.split(',')
31
- sample_id = line[0].to_sym
32
- counts = line[1..-1].map(&:to_i)
33
- cluster_ids.zip(counts).each do |cluster, count|
34
- cluster_sample_count[cluster][sample_id] = count
39
+ sample_name = line[0]
40
+ abunds = line[1..-1].map &:to_i
41
+ otu_order.zip(abunds) do |o, a|
42
+ total_reads += a
43
+ seen[o] += 1 if a >= reads
35
44
  end
36
45
  end
37
46
  end
38
47
 
39
- ohai "filtering"
48
+ pbar.finish
40
49
 
41
- # filter sample_cluster_count
42
- # todo: move filtered reads to 'unclassified_reads' classification
43
- filtered = cluster_sample_count.reject { |k, v| v.reject { |k, v| v < reads }.size < min_samples }
50
+ mask = otu_order.map { |x| seen[x] >= min_samples }
44
51
 
45
- # use functional programming they said
46
- # it will make your better they said
47
- noise = cluster_sample_count.keys - filtered.keys
52
+ ohai "found #{otu_order.size} otus, keeping #{mask.count(true)}"
48
53
 
49
- ohai "saving to #{output}"
54
+ output = File.open(output, 'w')
50
55
 
51
- # save the table
52
- out = File.open(output, 'w')
53
- samples = filtered.values.map(&:keys).flatten.uniq
54
- clusters = filtered.keys
55
- out.puts "-,#{clusters.join(',')},noise"
56
+ pbar = ProgressBar.new 'writing', File.size(input)
57
+ filtered_reads = 0
58
+ File.open(input) do |handle|
59
+ header = handle.gets.strip.split(',')
60
+ header = header.zip(mask).map { |k, m| k if m }.compact
61
+ output.print header.join(',')
62
+ output.print ",noise\n" # need a "noise" column
63
+
64
+ handle.each do |line|
65
+ pbar.set handle.pos
66
+ line = line.strip.split(',')
67
+
68
+ sample_name = line[0]
69
+ counts = line[1..-1].map &:to_i
70
+
71
+ kept_counts = counts.zip(mask).map { |c, m| c if m }.compact
72
+ noise = counts.zip(mask).map { |c, m| c unless m }.compact.inject(:+)
73
+ filtered_reads += noise
74
+
75
+ output.puts "#{sample_name},#{kept_counts.join(',')},#{noise}"
56
76
 
57
- samples.each do |sample|
58
- out.print "#{sample}"
59
- clusters.each do |cluster|
60
- out.print ",#{filtered[cluster][sample]}"
61
77
  end
62
- noise_sum = noise.map { |n| cluster_sample_count[n][sample] }.inject(:+)
63
- out.print ",#{noise_sum || 0}"
64
- out.print "\n"
65
78
  end
66
- out.close
67
79
 
68
- ohai "kept #{filtered.keys.size} clusters (#{filtered.keys.size/cluster_sample_count.size.to_f})."
69
- kept_reads = filtered.values.map { |x| x.values.inject(:+) }.inject(:+)
70
- total_reads = cluster_sample_count.values.map { |x| x.values.inject(:+) }.inject(:+)
71
- ohai "kept #{kept_reads}/#{total_reads} reads (#{kept_reads/total_reads.to_f})."
80
+ pbar.finish
81
+
82
+ ohai "kept #{total_reads - filtered_reads}/#{total_reads} reads (#{100*(total_reads - filtered_reads)/total_reads.to_f}%)"
83
+
84
+ output.close
85
+
72
86
  end
73
87
 
74
88
  end
@@ -3,7 +3,7 @@ module Lederhosen
3
3
  MAJOR = 2
4
4
  MINOR = 0
5
5
  CODENAME = 'Schnittlauchbrot' # changes for minor versions
6
- PATCH = 6
6
+ PATCH = 7
7
7
 
8
8
  STRING = [MAJOR, MINOR, PATCH].join('.')
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lederhosen
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.6
4
+ version: 2.0.7
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-02-27 00:00:00.000000000 Z
12
+ date: 2013-03-11 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: dna
@@ -149,7 +149,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
149
149
  version: '0'
150
150
  segments:
151
151
  - 0
152
- hash: -902984509893215788
152
+ hash: -4462113867490056177
153
153
  required_rubygems_version: !ruby/object:Gem::Requirement
154
154
  none: false
155
155
  requirements: