lederhosen 2.0.6 → 2.0.7

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
data/lederhosen.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "lederhosen"
8
- s.version = "2.0.6"
8
+ s.version = "2.0.7"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Austin G. Davis-Richardson"]
12
- s.date = "2013-02-27"
12
+ s.date = "2013-03-11"
13
13
  s.description = "Various tools for OTU clustering"
14
14
  s.email = "harekrishna@gmail.com"
15
15
  s.executables = ["lederhosen"]
@@ -18,57 +18,71 @@ module Lederhosen
18
18
 
19
19
  ohai "filtering otu file #{input} (reads = #{reads}, samples = #{min_samples})"
20
20
 
21
- cluster_sample_count = Hash.new { |h, k| h[k] = Hash.new }
21
+ # make one pass finding which OTUs to keep
22
+ # create mask that maps which columns correspond to good OTUs
23
+ # pass over table again printing only those columns
22
24
 
23
- ohai "loading csv file #{input}"
25
+ seen = Hash.new { |h, k| h[k] = 0 }
24
26
 
25
- # slurp up CSV file
26
- File.open input do |handle|
27
+ otu_order = []
28
+
29
+ pbar = ProgressBar.new 'counting', File.size(input)
30
+ total_reads = 0
31
+
32
+ File.open(input) do |handle|
27
33
  header = handle.gets.strip.split(',')
28
- cluster_ids = header[0..-1]
34
+ header.each { |x| otu_order << x }
35
+
29
36
  handle.each do |line|
37
+ pbar.set handle.pos
30
38
  line = line.strip.split(',')
31
- sample_id = line[0].to_sym
32
- counts = line[1..-1].map(&:to_i)
33
- cluster_ids.zip(counts).each do |cluster, count|
34
- cluster_sample_count[cluster][sample_id] = count
39
+ sample_name = line[0]
40
+ abunds = line[1..-1].map &:to_i
41
+ otu_order.zip(abunds) do |o, a|
42
+ total_reads += a
43
+ seen[o] += 1 if a >= reads
35
44
  end
36
45
  end
37
46
  end
38
47
 
39
- ohai "filtering"
48
+ pbar.finish
40
49
 
41
- # filter sample_cluster_count
42
- # todo: move filtered reads to 'unclassified_reads' classification
43
- filtered = cluster_sample_count.reject { |k, v| v.reject { |k, v| v < reads }.size < min_samples }
50
+ mask = otu_order.map { |x| seen[x] >= min_samples }
44
51
 
45
- # use functional programming they said
46
- # it will make your better they said
47
- noise = cluster_sample_count.keys - filtered.keys
52
+ ohai "found #{otu_order.size} otus, keeping #{mask.count(true)}"
48
53
 
49
- ohai "saving to #{output}"
54
+ output = File.open(output, 'w')
50
55
 
51
- # save the table
52
- out = File.open(output, 'w')
53
- samples = filtered.values.map(&:keys).flatten.uniq
54
- clusters = filtered.keys
55
- out.puts "-,#{clusters.join(',')},noise"
56
+ pbar = ProgressBar.new 'writing', File.size(input)
57
+ filtered_reads = 0
58
+ File.open(input) do |handle|
59
+ header = handle.gets.strip.split(',')
60
+ header = header.zip(mask).map { |k, m| k if m }.compact
61
+ output.print header.join(',')
62
+ output.print ",noise\n" # need a "noise" column
63
+
64
+ handle.each do |line|
65
+ pbar.set handle.pos
66
+ line = line.strip.split(',')
67
+
68
+ sample_name = line[0]
69
+ counts = line[1..-1].map &:to_i
70
+
71
+ kept_counts = counts.zip(mask).map { |c, m| c if m }.compact
72
+ noise = counts.zip(mask).map { |c, m| c unless m }.compact.inject(:+)
73
+ filtered_reads += noise
74
+
75
+ output.puts "#{sample_name},#{kept_counts.join(',')},#{noise}"
56
76
 
57
- samples.each do |sample|
58
- out.print "#{sample}"
59
- clusters.each do |cluster|
60
- out.print ",#{filtered[cluster][sample]}"
61
77
  end
62
- noise_sum = noise.map { |n| cluster_sample_count[n][sample] }.inject(:+)
63
- out.print ",#{noise_sum || 0}"
64
- out.print "\n"
65
78
  end
66
- out.close
67
79
 
68
- ohai "kept #{filtered.keys.size} clusters (#{filtered.keys.size/cluster_sample_count.size.to_f})."
69
- kept_reads = filtered.values.map { |x| x.values.inject(:+) }.inject(:+)
70
- total_reads = cluster_sample_count.values.map { |x| x.values.inject(:+) }.inject(:+)
71
- ohai "kept #{kept_reads}/#{total_reads} reads (#{kept_reads/total_reads.to_f})."
80
+ pbar.finish
81
+
82
+ ohai "kept #{total_reads - filtered_reads}/#{total_reads} reads (#{100*(total_reads - filtered_reads)/total_reads.to_f}%)"
83
+
84
+ output.close
85
+
72
86
  end
73
87
 
74
88
  end
@@ -3,7 +3,7 @@ module Lederhosen
3
3
  MAJOR = 2
4
4
  MINOR = 0
5
5
  CODENAME = 'Schnittlauchbrot' # changes for minor versions
6
- PATCH = 6
6
+ PATCH = 7
7
7
 
8
8
  STRING = [MAJOR, MINOR, PATCH].join('.')
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lederhosen
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.6
4
+ version: 2.0.7
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-02-27 00:00:00.000000000 Z
12
+ date: 2013-03-11 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: dna
@@ -149,7 +149,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
149
149
  version: '0'
150
150
  segments:
151
151
  - 0
152
- hash: -902984509893215788
152
+ hash: -4462113867490056177
153
153
  required_rubygems_version: !ruby/object:Gem::Requirement
154
154
  none: false
155
155
  requirements: