lederhosen 1.8.2 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -8,7 +8,7 @@ group :test do
8
8
  gem 'rspec', '2.12.0'
9
9
  gem 'rspec-prof', '0.0.3'
10
10
  gem 'pry'
11
- gem 'plymouth'
11
+ # gem 'plymouth'
12
12
  end
13
13
 
14
14
  group :development do
data/lederhosen.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "lederhosen"
8
- s.version = "1.8.2"
8
+ s.version = "2.0.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Austin G. Davis-Richardson"]
12
- s.date = "2013-01-17"
12
+ s.date = "2013-01-24"
13
13
  s.description = "Various tools for OTU clustering"
14
14
  s.email = "harekrishna@gmail.com"
15
15
  s.executables = ["lederhosen"]
@@ -27,13 +27,16 @@ Gem::Specification.new do |s|
27
27
  "lib/lederhosen/cli.rb",
28
28
  "lib/lederhosen/no_tasks.rb",
29
29
  "lib/lederhosen/tasks/cluster.rb",
30
+ "lib/lederhosen/tasks/count_taxonomies.rb",
30
31
  "lib/lederhosen/tasks/get_reps.rb",
31
32
  "lib/lederhosen/tasks/join_otu_tables.rb",
32
33
  "lib/lederhosen/tasks/make_udb.rb",
33
34
  "lib/lederhosen/tasks/otu_filter.rb",
34
35
  "lib/lederhosen/tasks/otu_table.rb",
36
+ "lib/lederhosen/tasks/separate_unclassified.rb",
35
37
  "lib/lederhosen/tasks/split_fasta.rb",
36
38
  "lib/lederhosen/tasks/version.rb",
39
+ "lib/lederhosen/uc_parser.rb",
37
40
  "lib/lederhosen/version.rb",
38
41
  "readme.md",
39
42
  "scripts/illumina_pipeline/.gitignore",
@@ -46,7 +49,8 @@ Gem::Specification.new do |s|
46
49
  "spec/data/test.uc",
47
50
  "spec/data/trimmed/ILT_L_9_B_001.fasta",
48
51
  "spec/no_tasks_spec.rb",
49
- "spec/spec_helper.rb"
52
+ "spec/spec_helper.rb",
53
+ "spec/uc_parser_spec.rb"
50
54
  ]
51
55
  s.homepage = "http://audy.github.com/lederhosen"
52
56
  s.licenses = ["MIT"]
@@ -6,26 +6,24 @@ module Lederhosen
6
6
 
7
7
  no_tasks do
8
8
 
9
- # parse a line of usearch prefix
10
- # return a hash in the form:
11
- # { :taxonomy => '', :identity => '0.00', ... }
12
- # unless the line is not a "hit" in which case
13
- # the function returns nil
14
- def parse_usearch_line(str)
15
-
16
- # skip non hits
17
- return nil unless str =~ /^H/
18
-
19
- str = str.split
20
-
21
- taxonomic_description = str[9]
22
- identity = str[3]
9
+ # get a taxonomic description from a line of usearch (uc) output
10
+ # return 'unclassified_reads' if the result was not a hit
11
+ # if the result was neither a hit nor a miss (for example, a seed)
12
+ # return nil
13
+ # this will probably break for different versions of uc file
14
+ # as produced by uclust or older versions of usearch
15
+ def get_tax(s)
16
+ dat = parse_usearch_line(s.strip)
17
+ if dat[:type] == 'H'
18
+ dat[:taxonomic_description].tr(',', '_')
19
+ elsif dat[:type] == 'N'
20
+ 'unclassified_reads'
21
+ else
22
+ nil
23
+ end
24
+ end
23
25
 
24
- # parse taxonomic_description
25
- taxonomies = parse_taxonomy(taxonomic_description) rescue { 'original' => str[9] }
26
26
 
27
- { :identity => identity }.merge(taxonomies)
28
- end
29
27
 
30
28
  # detect whether the taxonomy is one of the following
31
29
  # possible formats:
@@ -40,6 +38,8 @@ module Lederhosen
40
38
  :taxcollector
41
39
  elsif taxonomy =~ /^\d/
42
40
  :greengenes
41
+ elsif taxonomy.nil?
42
+ raise "nil ain't no taxonomy I ever heard of!"
43
43
  else
44
44
  :qiime
45
45
  end
@@ -0,0 +1,83 @@
1
+ module Lederhosen
2
+ class CLI
3
+
4
+ desc 'count_taxonomies', 'count taxonomies from a uc file, generating a csv file with: <taxonomy>,<reads>'
5
+
6
+ method_option :input, :type => :string, :required => true
7
+ method_option :output, :type => :string, :required => true
8
+ method_option :strict, :type => :string, :default => false,
9
+ :banner => '<level> only count reads where both taxonomies are in agreement at <level>'
10
+
11
+ def count_taxonomies
12
+ input = options[:input]
13
+ output = options[:output]
14
+ strict = options[:strict]
15
+
16
+ ohai "generating #{output} from #{input}"
17
+
18
+ handle = File.open(input)
19
+ uc = UCParser.new(handle)
20
+
21
+ taxonomy_count =
22
+ if not strict
23
+ get_taxonomy_count(uc)
24
+ elsif strict
25
+ get_strict_taxonomy_count(uc, strict)
26
+ end
27
+
28
+ handle.close
29
+
30
+ out = File.open(output, 'w')
31
+ out.puts '# taxonomy, number_of_reads'
32
+ taxonomy_count.each_pair do |taxonomy, count|
33
+ out.puts "#{taxonomy.tr(',','_')},#{count}"
34
+ end
35
+ out.close
36
+
37
+ end
38
+
39
+ no_tasks do
40
+ # returns Hash of taxonomy => number_of_reads
41
+ def get_taxonomy_count(uc)
42
+ taxonomy_count = Hash.new { |h, k| h[k] = 0 }
43
+ uc.each do |result|
44
+ if result.hit?
45
+ taxonomy_count[result.target] += 1
46
+ else
47
+ taxonomy_count['unclassified_reads'] += 1
48
+ end
49
+ end
50
+ taxonomy_count
51
+ end
52
+
53
+ # returns Hash of taxonomy => number_of_reads
54
+ # if a pair of reads do not agree at a taxonomic level,
55
+ # or if at least one is unclassified, both reads are counted
56
+ # as unclassified_reads
57
+ def get_strict_taxonomy_count(uc, level)
58
+ taxonomy_count = Hash.new { |h, k| h[k] = 0 }
59
+ # TODO: I'm making a block for results because I don't know how to
60
+ # make results return an Enumerator when not given a block
61
+ uc.each_slice(2) do |left, right|
62
+ if left.miss? or right.miss? # at least one is a miss
63
+ taxonomy_count['unclassified_reads'] += 2
64
+ # both are hits, check taxonomies
65
+ else
66
+ ta = parse_taxonomy(left.target)
67
+ tb = parse_taxonomy(right.target)
68
+ # they match up, count both separately
69
+ if ta[level] == tb[level]
70
+ taxonomy_count[left.target] += 1
71
+ taxonomy_count[right.target] += 1
72
+ # they don't match up, count as unclassified
73
+ else
74
+ taxonomy_count['unclassified_reads'] += 2
75
+ end
76
+ end
77
+ end # results.each_slice
78
+ taxonomy_count
79
+ end
80
+
81
+ end
82
+ end
83
+ end
@@ -23,10 +23,9 @@ module Lederhosen
23
23
 
24
24
  inputs.each do |input|
25
25
  File.open(input) do |handle|
26
- pbar.inc
27
- handle.each do |line|
28
- header = parse_usearch_line(line.strip)
29
- taxa << header['original'] rescue nil
26
+ results = UCParser.new(handle)
27
+ results.each do |result|
28
+ taxa << result.target if result.hit?
30
29
  end
31
30
  end
32
31
  end
@@ -3,8 +3,8 @@ module Lederhosen
3
3
 
4
4
  desc 'make_udb', 'format database for usearch'
5
5
 
6
- method_option :input, :type => :string, :required => true
7
- method_option :output, :type => :string, :required => true
6
+ method_option :input, :type => :string, :required => true
7
+ method_option :output, :type => :string, :required => true
8
8
 
9
9
  def make_udb
10
10
  input = options[:input]
@@ -39,20 +39,27 @@ module Lederhosen
39
39
  ohai "filtering"
40
40
 
41
41
  # filter sample_cluster_count
42
+ # todo: move filtered reads to 'unclassified_reads' classification
42
43
  filtered = cluster_sample_count.reject { |k, v| v.reject { |k, v| v < reads }.size < min_samples }
43
44
 
45
+ # use functional programming they said
46
+ # it will make your life better they said
47
+ noise = cluster_sample_count.keys - filtered.keys
48
+
44
49
  ohai "saving to #{output}"
45
50
 
46
51
  # save the table
47
52
  out = File.open(output, 'w')
48
53
  samples = filtered.values.map(&:keys).flatten.uniq
49
54
  clusters = filtered.keys
50
- out.puts "-,#{clusters.join(',')}"
55
+ out.puts "-,#{clusters.join(',')},noise"
51
56
  samples.each do |sample|
52
57
  out.print "#{sample}"
53
58
  clusters.each do |cluster|
54
59
  out.print ",#{filtered[cluster][sample]}"
55
60
  end
61
+ noise_sum = noise.map { |n| cluster_sample_count[n][sample]}.inject(:+)
62
+ out.print ",#{noise_sum}"
56
63
  out.print "\n"
57
64
  end
58
65
  out.close
@@ -6,96 +6,59 @@ module Lederhosen
6
6
  class CLI
7
7
 
8
8
  desc "otu_table",
9
- "create an OTU abundance matrix from USEARCH prefix"
9
+ "create an OTU abundance matrix from taxonomy count files"
10
10
 
11
11
  method_option :files, :type => :string, :required => true
12
-
13
- method_option :prefix, :type => :string, :required => true,
14
- :banner => 'prefix prefix'
15
-
16
- method_option :levels, :type => :array, :required => true,
17
- :banner => 'valid options: domain, kingdom, phylum, class, order, genus, species, original (or all of them at once)'
12
+ method_option :level, :type => :string, :required => true
13
+ method_option :output, :type => :string, :required => true
18
14
 
19
15
  def otu_table
20
- input = Dir[options[:files]]
21
- prefix = options[:prefix]
22
- levels = options[:levels].map(&:downcase)
16
+ inputs = Dir[options[:files]]
17
+ level = options[:level].downcase
18
+ output = options[:output]
23
19
 
24
- ohai "generating #{levels.join(', ')} table(s) from #{input.size} file(s) and saving to prefix #{prefix}."
20
+ ohai "Generating OTU matrix from #{inputs.size} inputs at #{level} level and saving to #{output}."
25
21
 
26
22
  # sanity check
27
- levels.each do |level|
28
- fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom original}.include? level
29
- end
23
+ fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom original}.include? level
24
+ fail 'no inputs matched your glob' if inputs.size == 0
30
25
 
31
- # there has to be a more efficient way of doing this
32
- level_sample_cluster_count =
33
- Hash.new do |h, k|
34
- h[k] = Hash.new do |h, k|
35
- h[k] = Hash.new(0)
36
- end
37
- end
26
+ sample_cluster_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
38
27
 
39
28
  # create a progress bar with the total number of bytes of
40
29
  # the files we're slurping up
41
- pbar = ProgressBar.new "loading", input.size
42
-
43
- # Load cluster table
44
- input.each do |input_file|
45
- pbar.inc
46
- File.open(input_file) do |handle|
47
- handle.each do |line|
48
-
49
- dat = parse_usearch_line(line.strip)
50
- levels.each do |level|
51
- name =
52
- if dat.nil?
53
- 'unclassified_reads'
54
- else
55
- dat[level] || 'unparsed_name'
56
- end
57
-
58
- # remove commas from name
59
- name = name.tr(',', '_')
60
-
61
- # the next two lines are what is slow
62
- level_sample_cluster_count[level][input_file][name] += 1
30
+ pbar = ProgressBar.new "loading", inputs.size
31
+
32
+ inputs.each do |input_file|
33
+ File.open(input_file).each do |line|
34
+ next if line =~ /^#/ # skip header(s)
35
+ line = line.strip.split(',')
36
+ taxonomy, count = line
37
+ count = count.to_i
38
+ tax =
39
+ if taxonomy == 'unclassified_reads'
40
+ 'unclassified_reads'
41
+ else
42
+ parse_taxonomy(taxonomy)[level]
63
43
  end
64
-
65
- end
44
+ sample_cluster_count[input_file][tax] += count
66
45
  end
67
46
  end
68
47
 
69
- pbar.finish
48
+ all_clusters = sample_cluster_count.values.map(&:keys).flatten.uniq.sort
70
49
 
71
- # get all taxonomic names at each level
72
- all_names = Hash.new.tap do |bar|
73
- level_sample_cluster_count.each_pair.map do |k, v|
74
- names = v.each_value.map(&:keys).flatten.uniq
75
- bar[k] = names
50
+ out = File.open(output, 'w')
51
+
52
+ out.puts all_clusters.join(',')
53
+ inputs.sort.each do |input|
54
+ out.print "#{input}"
55
+ all_clusters.each do |c|
56
+ out.print ",#{sample_cluster_count[input][c]}"
76
57
  end
58
+ out.print "\n"
77
59
  end
78
60
 
79
- # save to csv(s)
80
- levels.each do |level|
81
-
82
- ohai "saving #{level} table"
83
-
84
- File.open("#{prefix}.#{level}.csv", 'w') do |handle|
85
- header = all_names[level].to_a.compact.sort
86
- handle.puts "#{level.capitalize},#{header.join(',')}"
87
-
88
- input.each do |sample|
89
- handle.print "#{sample}"
90
- header.each do |name|
91
- handle.print ",#{level_sample_cluster_count[level][sample][name]}"
92
- end
93
- handle.print "\n"
94
- end
95
- end
96
- end
97
61
  end
98
62
 
99
-
100
63
  end # class CLI
101
64
  end # module Lederhosen
@@ -0,0 +1,65 @@
1
+ require 'set'
2
+
3
+ module Lederhosen
4
+
5
+ class CLI
6
+
7
+ desc 'separate_unclassified',
8
+ 'separate unclassified reads (with or without strict pairing)'
9
+
10
+ method_option :uc_file, :type => :string, :required => true
11
+ method_option :reads, :type => :string, :required => true
12
+ method_option :output, :type => :string, :required => true
13
+ method_option :strict, :type => :string, :default => false
14
+
15
+ def separate_unclassified
16
+ uc_file = options[:uc_file]
17
+ reads = options[:reads]
18
+ output = options[:output]
19
+ strict = options[:strict]
20
+
21
+ unclassifieds = Set.new
22
+ handle = File.open(uc_file)
23
+ uc = UCParser.new(handle)
24
+
25
+ if not strict
26
+ uc.each do |result|
27
+ unclassifieds << result.query if result.miss?
28
+ end
29
+
30
+ elsif strict
31
+
32
+ uc.each_slice(2) do |left, right|
33
+ if left.miss? || right.miss? # at least one is a miss
34
+ unclassifieds << left.query
35
+ unclassifieds << right.query
36
+ # both are hits, check taxonomies
37
+ else
38
+ ta = parse_taxonomy(right.target)
39
+ tb = parse_taxonomy(left.target)
40
+ # inconsistent assignment or at least one is a miss
41
+ if (ta[strict] != tb[strict])
42
+ unclassifieds << left.query
43
+ unclassifieds << right.query
44
+ end
45
+ end
46
+ end
47
+
48
+ end
49
+
50
+ ohai "found #{unclassifieds.size} unclassified #{'(strict pairing)' if strict} reads."
51
+
52
+ handle.close
53
+
54
+ # open fasta file, output unclassified reads
55
+ out = File.open(output, 'w')
56
+ Dna.new(File.open(reads)).each do |record|
57
+ if unclassifieds.include? record.name
58
+ out.puts record
59
+ end
60
+ end
61
+ out.close
62
+
63
+ end
64
+ end
65
+ end
@@ -0,0 +1,88 @@
1
+ require 'ostruct'
2
+
3
+ module Lederhosen
4
+
5
+ # represents a usearch result
6
+ class UResult
7
+
8
+ def initialize(hash)
9
+ @source = OpenStruct.new(hash)
10
+ end
11
+
12
+ def method_missing(method, *args, &block)
13
+ @source.send(method, *args, &block)
14
+ end
15
+
16
+ def hit?
17
+ @source.type == 'H'
18
+ end
19
+
20
+ def miss?
21
+ @source.type == 'N'
22
+ end
23
+ end
24
+
25
+ # class for parsing UC files, generates UResult objects
26
+ class UCParser
27
+ include Enumerable
28
+
29
+ def initialize(handle)
30
+ @handle = handle
31
+ end
32
+
33
+ def each(&block)
34
+ @handle.each do |line|
35
+ next if line =~ /^[#C]/ # skip comments and cluster summaries
36
+ dat = parse_usearch_line(line.strip)
37
+ result = UResult.new(dat)
38
+ block.call(result)
39
+ end
40
+ end
41
+
42
+ private
43
+
44
+ # parse a line of usearch prefix
45
+ # return a hash in the form:
46
+ # { :taxonomy => '', :identity => '0.00', ... }
47
+ # unless the line is not a "hit" in which case
48
+ # the function returns nil
49
+ def parse_usearch_line(str)
50
+
51
+ # from http://drive5.com/usearch/manual/ucout.html
52
+ # 1 Record type S, H, C or N (see table below).
53
+ # 2 Cluster number (0-based).
54
+ # 3 Sequence length (S, N and H) or cluster size (C).
55
+ # 4 For H records, percent identity with target.
56
+ # 5 For H records, the strand: + or - for nucleotides, . for proteins.
57
+ # 6 Not used, parsers should ignore this field. Included for backwards compatibility.
58
+ # 7 Not used, parsers should ignore this field. Included for backwards compatibility.
59
+ # 8 Compressed alignment or the symbol '=' (equals sign). The = indicates that the query is 100% identical to the target sequence (field 10).
60
+ # 9 Label of query sequence (always present).
61
+ # 10 Label of target sequence (H records only).
62
+
63
+ str = str.split("\t")
64
+
65
+ dat = {
66
+ :type => str[0],
67
+ :cluster_no => str[1],
68
+ :alignment => str[7],
69
+ :query => str[8],
70
+ :target => str[9],
71
+ }
72
+
73
+ r =
74
+ if dat[:type] =~ /[SNH]/ # seed, miss, or hit records
75
+ { :length => str[2].to_i,
76
+ :identity => str[3],
77
+ :strand => str[4],
78
+ }
79
+ elsif dat[:type] == 'C' # clusters
80
+ { :cluster_size => str[2].to_i }
81
+ else
82
+ raise Exception, "Do not understand record type #{str[0]}!"
83
+ end
84
+
85
+ dat.merge(r)
86
+ end
87
+ end
88
+ end
@@ -1,9 +1,9 @@
1
1
  module Lederhosen
2
2
  module Version
3
- MAJOR = 1
4
- MINOR = 8
5
- CODENAME = 'Karottensaft' # changes for minor versions
6
- PATCH = 2
3
+ MAJOR = 2
4
+ MINOR = 0
5
+ CODENAME = 'Schnittlauchbrot' # changes for minor versions
6
+ PATCH = 0
7
7
 
8
8
  STRING = [MAJOR, MINOR, PATCH].join('.')
9
9
  end