lederhosen 1.8.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -8,7 +8,7 @@ group :test do
8
8
  gem 'rspec', '2.12.0'
9
9
  gem 'rspec-prof', '0.0.3'
10
10
  gem 'pry'
11
- gem 'plymouth'
11
+ # gem 'plymouth'
12
12
  end
13
13
 
14
14
  group :development do
data/lederhosen.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "lederhosen"
8
- s.version = "1.8.2"
8
+ s.version = "2.0.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Austin G. Davis-Richardson"]
12
- s.date = "2013-01-17"
12
+ s.date = "2013-01-24"
13
13
  s.description = "Various tools for OTU clustering"
14
14
  s.email = "harekrishna@gmail.com"
15
15
  s.executables = ["lederhosen"]
@@ -27,13 +27,16 @@ Gem::Specification.new do |s|
27
27
  "lib/lederhosen/cli.rb",
28
28
  "lib/lederhosen/no_tasks.rb",
29
29
  "lib/lederhosen/tasks/cluster.rb",
30
+ "lib/lederhosen/tasks/count_taxonomies.rb",
30
31
  "lib/lederhosen/tasks/get_reps.rb",
31
32
  "lib/lederhosen/tasks/join_otu_tables.rb",
32
33
  "lib/lederhosen/tasks/make_udb.rb",
33
34
  "lib/lederhosen/tasks/otu_filter.rb",
34
35
  "lib/lederhosen/tasks/otu_table.rb",
36
+ "lib/lederhosen/tasks/separate_unclassified.rb",
35
37
  "lib/lederhosen/tasks/split_fasta.rb",
36
38
  "lib/lederhosen/tasks/version.rb",
39
+ "lib/lederhosen/uc_parser.rb",
37
40
  "lib/lederhosen/version.rb",
38
41
  "readme.md",
39
42
  "scripts/illumina_pipeline/.gitignore",
@@ -46,7 +49,8 @@ Gem::Specification.new do |s|
46
49
  "spec/data/test.uc",
47
50
  "spec/data/trimmed/ILT_L_9_B_001.fasta",
48
51
  "spec/no_tasks_spec.rb",
49
- "spec/spec_helper.rb"
52
+ "spec/spec_helper.rb",
53
+ "spec/uc_parser_spec.rb"
50
54
  ]
51
55
  s.homepage = "http://audy.github.com/lederhosen"
52
56
  s.licenses = ["MIT"]
@@ -6,26 +6,24 @@ module Lederhosen
6
6
 
7
7
  no_tasks do
8
8
 
9
- # parse a line of usearch prefix
10
- # return a hash in the form:
11
- # { :taxonomy => '', :identity => '0.00', ... }
12
- # unless the line is not a "hit" in which case
13
- # the function returns nil
14
- def parse_usearch_line(str)
15
-
16
- # skip non hits
17
- return nil unless str =~ /^H/
18
-
19
- str = str.split
20
-
21
- taxonomic_description = str[9]
22
- identity = str[3]
9
+ # get a taxonomic description from a line of usearch (uc) output
10
+ # return 'unclassified_reads' if the result was not a hit
11
+ # if the result was neither a hit nor a miss (for example, a seed)
12
+ # return nil
13
+ # this will probably break for different versions of uc file
14
+ # as produced by uclust or older versions of usearch
15
+ def get_tax(s)
16
+ dat = parse_usearch_line(s.strip)
17
+ if dat[:type] == 'H'
18
+ dat[:taxonomic_description].tr(',', '_')
19
+ elsif dat[:type] == 'N'
20
+ 'unclassified_reads'
21
+ else
22
+ nil
23
+ end
24
+ end
23
25
 
24
- # parse taxonomic_description
25
- taxonomies = parse_taxonomy(taxonomic_description) rescue { 'original' => str[9] }
26
26
 
27
- { :identity => identity }.merge(taxonomies)
28
- end
29
27
 
30
28
  # detect whether the taxonomy is one of the following
31
29
  # possible formats:
@@ -40,6 +38,8 @@ module Lederhosen
40
38
  :taxcollector
41
39
  elsif taxonomy =~ /^\d/
42
40
  :greengenes
41
+ elsif taxonomy.nil?
42
+ raise "nil ain't no taxonomy I ever heard of!"
43
43
  else
44
44
  :qiime
45
45
  end
@@ -0,0 +1,83 @@
1
+ module Lederhosen
2
+ class CLI
3
+
4
+ desc 'count_taxonomies', 'count taxonomies from a uc file, generating a csv file with: <taxonomy>,<reads>'
5
+
6
+ method_option :input, :type => :string, :required => true
7
+ method_option :output, :type => :string, :required => true
8
+ method_option :strict, :type => :string, :default => false,
9
+ :banner => '<level> only count reads where both taxonomies are in agreement at <level>'
10
+
11
+ def count_taxonomies
12
+ input = options[:input]
13
+ output = options[:output]
14
+ strict = options[:strict]
15
+
16
+ ohai "generating #{output} from #{input}"
17
+
18
+ handle = File.open(input)
19
+ uc = UCParser.new(handle)
20
+
21
+ taxonomy_count =
22
+ if not strict
23
+ get_taxonomy_count(uc)
24
+ elsif strict
25
+ get_strict_taxonomy_count(uc, strict)
26
+ end
27
+
28
+ handle.close
29
+
30
+ out = File.open(output, 'w')
31
+ out.puts '# taxonomy, number_of_reads'
32
+ taxonomy_count.each_pair do |taxonomy, count|
33
+ out.puts "#{taxonomy.tr(',','_')},#{count}"
34
+ end
35
+ out.close
36
+
37
+ end
38
+
39
+ no_tasks do
40
+ # returns Hash of taxonomy => number_of_reads
41
+ def get_taxonomy_count(uc)
42
+ taxonomy_count = Hash.new { |h, k| h[k] = 0 }
43
+ uc.each do |result|
44
+ if result.hit?
45
+ taxonomy_count[result.target] += 1
46
+ else
47
+ taxonomy_count['unclassified_reads'] += 1
48
+ end
49
+ end
50
+ taxonomy_count
51
+ end
52
+
53
+ # returns Hash of taxonomy => number_of_reads
54
+ # if a pair of reads do not agree at a taxonomic level,
55
+ # or if at least one is unclassified, both reads are counted
56
+ # as unclassified_reads
57
+ def get_strict_taxonomy_count(uc, level)
58
+ taxonomy_count = Hash.new { |h, k| h[k] = 0 }
59
+ # TODO: I'm making a block for results because I don't know how to
60
+ # make results return an Enumerator when not given a block
61
+ uc.each_slice(2) do |left, right|
62
+ if left.miss? or right.miss? # at least one is a miss
63
+ taxonomy_count['unclassified_reads'] += 2
64
+ # both are hits, check taxonomies
65
+ else
66
+ ta = parse_taxonomy(left.target)
67
+ tb = parse_taxonomy(right.target)
68
+ # they match up, count both separately
69
+ if ta[level] == tb[level]
70
+ taxonomy_count[left.target] += 1
71
+ taxonomy_count[right.target] += 1
72
+ # they don't match up, count as unclassified
73
+ else
74
+ taxonomy_count['unclassified_reads'] += 2
75
+ end
76
+ end
77
+ end # results.each_slice
78
+ taxonomy_count
79
+ end
80
+
81
+ end
82
+ end
83
+ end
@@ -23,10 +23,9 @@ module Lederhosen
23
23
 
24
24
  inputs.each do |input|
25
25
  File.open(input) do |handle|
26
- pbar.inc
27
- handle.each do |line|
28
- header = parse_usearch_line(line.strip)
29
- taxa << header['original'] rescue nil
26
+ results = UCParser.new(handle)
27
+ results.each do |result|
28
+ taxa << result.target if result.hit?
30
29
  end
31
30
  end
32
31
  end
@@ -3,8 +3,8 @@ module Lederhosen
3
3
 
4
4
  desc 'make_udb', 'format database for usearch'
5
5
 
6
- method_option :input, :type => :string, :required => true
7
- method_option :output, :type => :string, :required => true
6
+ method_option :input, :type => :string, :required => true
7
+ method_option :output, :type => :string, :required => true
8
8
 
9
9
  def make_udb
10
10
  input = options[:input]
@@ -39,20 +39,27 @@ module Lederhosen
39
39
  ohai "filtering"
40
40
 
41
41
  # filter sample_cluster_count
42
+ # todo: move filtered reads to 'unclassified_reads' classification
42
43
  filtered = cluster_sample_count.reject { |k, v| v.reject { |k, v| v < reads }.size < min_samples }
43
44
 
45
+ # use functional programming they said
46
+ # it will make your code better they said
47
+ noise = cluster_sample_count.keys - filtered.keys
48
+
44
49
  ohai "saving to #{output}"
45
50
 
46
51
  # save the table
47
52
  out = File.open(output, 'w')
48
53
  samples = filtered.values.map(&:keys).flatten.uniq
49
54
  clusters = filtered.keys
50
- out.puts "-,#{clusters.join(',')}"
55
+ out.puts "-,#{clusters.join(',')},noise"
51
56
  samples.each do |sample|
52
57
  out.print "#{sample}"
53
58
  clusters.each do |cluster|
54
59
  out.print ",#{filtered[cluster][sample]}"
55
60
  end
61
+ noise_sum = noise.map { |n| cluster_sample_count[n][sample]}.inject(:+)
62
+ out.print ",#{noise_sum}"
56
63
  out.print "\n"
57
64
  end
58
65
  out.close
@@ -6,96 +6,59 @@ module Lederhosen
6
6
  class CLI
7
7
 
8
8
  desc "otu_table",
9
- "create an OTU abundance matrix from USEARCH prefix"
9
+ "create an OTU abundance matrix from taxonomy count files"
10
10
 
11
11
  method_option :files, :type => :string, :required => true
12
-
13
- method_option :prefix, :type => :string, :required => true,
14
- :banner => 'prefix prefix'
15
-
16
- method_option :levels, :type => :array, :required => true,
17
- :banner => 'valid options: domain, kingdom, phylum, class, order, genus, species, original (or all of them at once)'
12
+ method_option :level, :type => :string, :required => true
13
+ method_option :output, :type => :string, :required => true
18
14
 
19
15
  def otu_table
20
- input = Dir[options[:files]]
21
- prefix = options[:prefix]
22
- levels = options[:levels].map(&:downcase)
16
+ inputs = Dir[options[:files]]
17
+ level = options[:level].downcase
18
+ output = options[:output]
23
19
 
24
- ohai "generating #{levels.join(', ')} table(s) from #{input.size} file(s) and saving to prefix #{prefix}."
20
+ ohai "Generating OTU matrix from #{inputs.size} inputs at #{level} level and saving to #{output}."
25
21
 
26
22
  # sanity check
27
- levels.each do |level|
28
- fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom original}.include? level
29
- end
23
+ fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom original}.include? level
24
+ fail 'no inputs matched your glob' if inputs.size == 0
30
25
 
31
- # there has to be a more efficient way of doing this
32
- level_sample_cluster_count =
33
- Hash.new do |h, k|
34
- h[k] = Hash.new do |h, k|
35
- h[k] = Hash.new(0)
36
- end
37
- end
26
+ sample_cluster_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
38
27
 
39
28
  # create a progress bar with the total number of bytes of
40
29
  # the files we're slurping up
41
- pbar = ProgressBar.new "loading", input.size
42
-
43
- # Load cluster table
44
- input.each do |input_file|
45
- pbar.inc
46
- File.open(input_file) do |handle|
47
- handle.each do |line|
48
-
49
- dat = parse_usearch_line(line.strip)
50
- levels.each do |level|
51
- name =
52
- if dat.nil?
53
- 'unclassified_reads'
54
- else
55
- dat[level] || 'unparsed_name'
56
- end
57
-
58
- # remove commas from name
59
- name = name.tr(',', '_')
60
-
61
- # the next two lines are what is slow
62
- level_sample_cluster_count[level][input_file][name] += 1
30
+ pbar = ProgressBar.new "loading", inputs.size
31
+
32
+ inputs.each do |input_file|
33
+ File.open(input_file).each do |line|
34
+ next if line =~ /^#/ # skip header(s)
35
+ line = line.strip.split(',')
36
+ taxonomy, count = line
37
+ count = count.to_i
38
+ tax =
39
+ if taxonomy == 'unclassified_reads'
40
+ 'unclassified_reads'
41
+ else
42
+ parse_taxonomy(taxonomy)[level]
63
43
  end
64
-
65
- end
44
+ sample_cluster_count[input_file][tax] += count
66
45
  end
67
46
  end
68
47
 
69
- pbar.finish
48
+ all_clusters = sample_cluster_count.values.map(&:keys).flatten.uniq.sort
70
49
 
71
- # get all taxonomic names at each level
72
- all_names = Hash.new.tap do |bar|
73
- level_sample_cluster_count.each_pair.map do |k, v|
74
- names = v.each_value.map(&:keys).flatten.uniq
75
- bar[k] = names
50
+ out = File.open(output, 'w')
51
+
52
+ out.puts all_clusters.join(',')
53
+ inputs.sort.each do |input|
54
+ out.print "#{input}"
55
+ all_clusters.each do |c|
56
+ out.print ",#{sample_cluster_count[input][c]}"
76
57
  end
58
+ out.print "\n"
77
59
  end
78
60
 
79
- # save to csv(s)
80
- levels.each do |level|
81
-
82
- ohai "saving #{level} table"
83
-
84
- File.open("#{prefix}.#{level}.csv", 'w') do |handle|
85
- header = all_names[level].to_a.compact.sort
86
- handle.puts "#{level.capitalize},#{header.join(',')}"
87
-
88
- input.each do |sample|
89
- handle.print "#{sample}"
90
- header.each do |name|
91
- handle.print ",#{level_sample_cluster_count[level][sample][name]}"
92
- end
93
- handle.print "\n"
94
- end
95
- end
96
- end
97
61
  end
98
62
 
99
-
100
63
  end # class CLI
101
64
  end # module Lederhosen
@@ -0,0 +1,65 @@
1
+ require 'set'
2
+
3
+ module Lederhosen
4
+
5
+ class CLI
6
+
7
+ desc 'separate_unclassified',
8
+ 'separate unclassified reads (with or without strict pairing)'
9
+
10
+ method_option :uc_file, :type => :string, :required => true
11
+ method_option :reads, :type => :string, :required => true
12
+ method_option :output, :type => :string, :required => true
13
+ method_option :strict, :type => :string, :default => false
14
+
15
+ def separate_unclassified
16
+ uc_file = options[:uc_file]
17
+ reads = options[:reads]
18
+ output = options[:output]
19
+ strict = options[:strict]
20
+
21
+ unclassifieds = Set.new
22
+ handle = File.open(uc_file)
23
+ uc = UCParser.new(handle)
24
+
25
+ if not strict
26
+ uc.each do |result|
27
+ unclassifieds << result.query if result.miss?
28
+ end
29
+
30
+ elsif strict
31
+
32
+ uc.each_slice(2) do |left, right|
33
+ if left.miss? || right.miss? # at least one is a miss
34
+ unclassifieds << left.query
35
+ unclassifieds << right.query
36
+ # both are hits, check taxonomies
37
+ else
38
+ ta = parse_taxonomy(right.target)
39
+ tb = parse_taxonomy(left.target)
40
+ # inconsistent assignment or at least one is a miss
41
+ if (ta[strict] != tb[strict])
42
+ unclassifieds << left.query
43
+ unclassifieds << right.query
44
+ end
45
+ end
46
+ end
47
+
48
+ end
49
+
50
+ ohai "found #{unclassifieds.size} unclassified #{'(strict pairing)' if strict} reads."
51
+
52
+ handle.close
53
+
54
+ # open fasta file, output unclassified reads
55
+ out = File.open(output, 'w')
56
+ Dna.new(File.open(reads)).each do |record|
57
+ if unclassifieds.include? record.name
58
+ out.puts record
59
+ end
60
+ end
61
+ out.close
62
+
63
+ end
64
+ end
65
+ end
@@ -0,0 +1,88 @@
1
+ require 'ostruct'
2
+
3
+ module Lederhosen
4
+
5
+ # represents a usearch result
6
+ class UResult
7
+
8
+ def initialize(hash)
9
+ @source = OpenStruct.new(hash)
10
+ end
11
+
12
+ def method_missing(method, *args, &block)
13
+ @source.send(method, *args, &block)
14
+ end
15
+
16
+ def hit?
17
+ @source.type == 'H'
18
+ end
19
+
20
+ def miss?
21
+ @source.type == 'N'
22
+ end
23
+ end
24
+
25
+ # class for parsing UC files, generates UResult objects
26
+ class UCParser
27
+ include Enumerable
28
+
29
+ def initialize(handle)
30
+ @handle = handle
31
+ end
32
+
33
+ def each(&block)
34
+ @handle.each do |line|
35
+ next if line =~ /^[#C]/ # skip comments and cluster summaries
36
+ dat = parse_usearch_line(line.strip)
37
+ result = UResult.new(dat)
38
+ block.call(result)
39
+ end
40
+ end
41
+
42
+ private
43
+
44
+ # parse a line of usearch (uc) output
45
+ # return a hash in the form:
46
+ # { :taxonomy => '', :identity => '0.00', ... }
47
+ # the record type (S, N, H or C) determines which extra fields are set;
48
+ # raises on any other record type (nil is never returned)
49
+ def parse_usearch_line(str)
50
+
51
+ # from http://drive5.com/usearch/manual/ucout.html
52
+ # 1 Record type S, H, C or N (see table below).
53
+ # 2 Cluster number (0-based).
54
+ # 3 Sequence length (S, N and H) or cluster size (C).
55
+ # 4 For H records, percent identity with target.
56
+ # 5 For H records, the strand: + or - for nucleotides, . for proteins.
57
+ # 6 Not used, parsers should ignore this field. Included for backwards compatibility.
58
+ # 7 Not used, parsers should ignore this field. Included for backwards compatibility.
59
+ # 8 Compressed alignment or the symbol '=' (equals sign). The = indicates that the query is 100% identical to the target sequence (field 10).
60
+ # 9 Label of query sequence (always present).
61
+ # 10 Label of target sequence (H records only).
62
+
63
+ str = str.split("\t")
64
+
65
+ dat = {
66
+ :type => str[0],
67
+ :cluster_no => str[1],
68
+ :alignment => str[7],
69
+ :query => str[8],
70
+ :target => str[9],
71
+ }
72
+
73
+ r =
74
+ if dat[:type] =~ /[SNH]/ # sequence records (seed, no-hit, hit)
75
+ { :length => str[2].to_i,
76
+ :identity => str[3],
77
+ :strand => str[4],
78
+ }
79
+ elsif dat[:type] == 'C' # clusters
80
+ { :cluster_size => str[2].to_i }
81
+ else
82
+ raise Exception, "Do not understand record type #{str[0]}!"
83
+ end
84
+
85
+ dat.merge(r)
86
+ end
87
+ end
88
+ end
@@ -1,9 +1,9 @@
1
1
  module Lederhosen
2
2
  module Version
3
- MAJOR = 1
4
- MINOR = 8
5
- CODENAME = 'Karottensaft' # changes for minor versions
6
- PATCH = 2
3
+ MAJOR = 2
4
+ MINOR = 0
5
+ CODENAME = 'Schnittlauchbrot' # changes for minor versions
6
+ PATCH = 0
7
7
 
8
8
  STRING = [MAJOR, MINOR, PATCH].join('.')
9
9
  end