lederhosen 1.8.2 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -1
- data/lederhosen.gemspec +7 -3
- data/lib/lederhosen/no_tasks.rb +18 -18
- data/lib/lederhosen/tasks/count_taxonomies.rb +83 -0
- data/lib/lederhosen/tasks/get_reps.rb +3 -4
- data/lib/lederhosen/tasks/make_udb.rb +2 -2
- data/lib/lederhosen/tasks/otu_filter.rb +8 -1
- data/lib/lederhosen/tasks/otu_table.rb +33 -70
- data/lib/lederhosen/tasks/separate_unclassified.rb +65 -0
- data/lib/lederhosen/uc_parser.rb +88 -0
- data/lib/lederhosen/version.rb +4 -4
- data/readme.md +107 -11
- data/spec/cli_spec.rb +62 -10
- data/spec/data/test.uc +9 -684
- data/spec/data/trimmed/ILT_L_9_B_001.fasta +100 -1596
- data/spec/no_tasks_spec.rb +1 -1
- data/spec/uc_parser_spec.rb +0 -0
- metadata +7 -3
data/Gemfile
CHANGED
data/lederhosen.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "lederhosen"
|
8
|
-
s.version = "
|
8
|
+
s.version = "2.0.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Austin G. Davis-Richardson"]
|
12
|
-
s.date = "2013-01-
|
12
|
+
s.date = "2013-01-24"
|
13
13
|
s.description = "Various tools for OTU clustering"
|
14
14
|
s.email = "harekrishna@gmail.com"
|
15
15
|
s.executables = ["lederhosen"]
|
@@ -27,13 +27,16 @@ Gem::Specification.new do |s|
|
|
27
27
|
"lib/lederhosen/cli.rb",
|
28
28
|
"lib/lederhosen/no_tasks.rb",
|
29
29
|
"lib/lederhosen/tasks/cluster.rb",
|
30
|
+
"lib/lederhosen/tasks/count_taxonomies.rb",
|
30
31
|
"lib/lederhosen/tasks/get_reps.rb",
|
31
32
|
"lib/lederhosen/tasks/join_otu_tables.rb",
|
32
33
|
"lib/lederhosen/tasks/make_udb.rb",
|
33
34
|
"lib/lederhosen/tasks/otu_filter.rb",
|
34
35
|
"lib/lederhosen/tasks/otu_table.rb",
|
36
|
+
"lib/lederhosen/tasks/separate_unclassified.rb",
|
35
37
|
"lib/lederhosen/tasks/split_fasta.rb",
|
36
38
|
"lib/lederhosen/tasks/version.rb",
|
39
|
+
"lib/lederhosen/uc_parser.rb",
|
37
40
|
"lib/lederhosen/version.rb",
|
38
41
|
"readme.md",
|
39
42
|
"scripts/illumina_pipeline/.gitignore",
|
@@ -46,7 +49,8 @@ Gem::Specification.new do |s|
|
|
46
49
|
"spec/data/test.uc",
|
47
50
|
"spec/data/trimmed/ILT_L_9_B_001.fasta",
|
48
51
|
"spec/no_tasks_spec.rb",
|
49
|
-
"spec/spec_helper.rb"
|
52
|
+
"spec/spec_helper.rb",
|
53
|
+
"spec/uc_parser_spec.rb"
|
50
54
|
]
|
51
55
|
s.homepage = "http://audy.github.com/lederhosen"
|
52
56
|
s.licenses = ["MIT"]
|
data/lib/lederhosen/no_tasks.rb
CHANGED
@@ -6,26 +6,24 @@ module Lederhosen
|
|
6
6
|
|
7
7
|
no_tasks do
|
8
8
|
|
9
|
-
#
|
10
|
-
# return
|
11
|
-
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
9
|
+
# get a taxonomic description from a line of usearch (uc) output
|
10
|
+
# return 'unclassified_reads' if the result was not a hit
|
11
|
+
# if the result was neither a hit nor a miss (for example, a seed)
|
12
|
+
# return nil
|
13
|
+
# this will probably break for different versions of uc file
|
14
|
+
# as produced by uclust or older versions of usearch
|
15
|
+
def get_tax(s)
|
16
|
+
dat = parse_usearch_line(s.strip)
|
17
|
+
if dat[:type] == 'H'
|
18
|
+
dat[:taxonomic_description].tr(',', '_')
|
19
|
+
elsif dat[:type] == 'N'
|
20
|
+
'unclassified_reads'
|
21
|
+
else
|
22
|
+
nil
|
23
|
+
end
|
24
|
+
end
|
23
25
|
|
24
|
-
# parse taxonomic_description
|
25
|
-
taxonomies = parse_taxonomy(taxonomic_description) rescue { 'original' => str[9] }
|
26
26
|
|
27
|
-
{ :identity => identity }.merge(taxonomies)
|
28
|
-
end
|
29
27
|
|
30
28
|
# detect whether the taxonomy is one of the following
|
31
29
|
# possible formats:
|
@@ -40,6 +38,8 @@ module Lederhosen
|
|
40
38
|
:taxcollector
|
41
39
|
elsif taxonomy =~ /^\d/
|
42
40
|
:greengenes
|
41
|
+
elsif taxonomy.nil?
|
42
|
+
raise "nil ain't no taxonomy I ever heard of!"
|
43
43
|
else
|
44
44
|
:qiime
|
45
45
|
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
module Lederhosen
  class CLI

    desc 'count_taxonomies', 'count taxonomies from a uc file, generating a csv file with: <taxonomy>,<reads>'

    method_option :input,  :type => :string, :required => true
    method_option :output, :type => :string, :required => true
    method_option :strict, :type => :string, :default => false,
      :banner => '<level> only count reads where both taxonomies are in agreement at <level>'

    # Count how many reads were assigned to each taxonomy in a uc file and
    # write the result as CSV lines of the form <taxonomy>,<reads>.
    #
    # Options:
    #   :input  - path of the uc file to read
    #   :output - path of the csv file to write
    #   :strict - false (default) to count every record on its own, or a
    #             taxonomic level name; when given, records are read in
    #             pairs and only counted when both agree at that level
    def count_taxonomies
      input  = options[:input]
      output = options[:output]
      strict = options[:strict]

      ohai "generating #{output} from #{input}"

      # block form closes the handle even if counting raises
      taxonomy_count =
        File.open(input) do |handle|
          uc = UCParser.new(handle)
          if strict
            get_strict_taxonomy_count(uc, strict)
          else
            get_taxonomy_count(uc)
          end
        end

      File.open(output, 'w') do |out|
        out.puts '# taxonomy, number_of_reads'
        taxonomy_count.each_pair do |taxonomy, count|
          # commas would break the csv, so they are replaced in the name
          out.puts "#{taxonomy.tr(',','_')},#{count}"
        end
      end
    end

    no_tasks do
      # returns Hash of taxonomy => number_of_reads
      # every record that is not a hit is counted as 'unclassified_reads'
      def get_taxonomy_count(uc)
        taxonomy_count = Hash.new { |h, k| h[k] = 0 }
        uc.each do |result|
          key = result.hit? ? result.target : 'unclassified_reads'
          taxonomy_count[key] += 1
        end
        taxonomy_count
      end

      # returns Hash of taxonomy => number_of_reads
      # records are consumed two at a time (left, right).
      # if a pair of reads do not agree at a taxonomic level,
      # or if at least one is unclassified, both reads are counted
      # as unclassified_reads
      #
      # raises ArgumentError if the input holds an odd number of records,
      # because the last record then has no partner to compare against
      def get_strict_taxonomy_count(uc, level)
        taxonomy_count = Hash.new { |h, k| h[k] = 0 }

        uc.each_slice(2) do |left, right|
          # each_slice pads nothing: an odd record count leaves right == nil
          if right.nil?
            raise ArgumentError,
              "strict counting needs paired records, but #{left.query} has no partner"
          end

          if left.miss? || right.miss?
            # at least one is a miss
            taxonomy_count['unclassified_reads'] += 2
          elsif parse_taxonomy(left.target)[level] == parse_taxonomy(right.target)[level]
            # they match up, count both separately
            taxonomy_count[left.target]  += 1
            taxonomy_count[right.target] += 1
          else
            # they don't match up, count as unclassified
            taxonomy_count['unclassified_reads'] += 2
          end
        end

        taxonomy_count
      end

    end
  end
end
|
@@ -23,10 +23,9 @@ module Lederhosen
|
|
23
23
|
|
24
24
|
inputs.each do |input|
|
25
25
|
File.open(input) do |handle|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
taxa << header['original'] rescue nil
|
26
|
+
results = UCParser.new(handle)
|
27
|
+
results.each do |result|
|
28
|
+
taxa << result.target if result.hit?
|
30
29
|
end
|
31
30
|
end
|
32
31
|
end
|
@@ -3,8 +3,8 @@ module Lederhosen
|
|
3
3
|
|
4
4
|
desc 'make_udb', 'format database for usearch'
|
5
5
|
|
6
|
-
method_option :input,
|
7
|
-
method_option :output,
|
6
|
+
method_option :input, :type => :string, :required => true
|
7
|
+
method_option :output, :type => :string, :required => true
|
8
8
|
|
9
9
|
def make_udb
|
10
10
|
input = options[:input]
|
@@ -39,20 +39,27 @@ module Lederhosen
|
|
39
39
|
ohai "filtering"
|
40
40
|
|
41
41
|
# filter sample_cluster_count
|
42
|
+
# todo: move filtered reads to 'unclassified_reads' classification
|
42
43
|
filtered = cluster_sample_count.reject { |k, v| v.reject { |k, v| v < reads }.size < min_samples }
|
43
44
|
|
45
|
+
# use functional programming they said
|
46
|
+
# it will make your better they said
|
47
|
+
noise = cluster_sample_count.keys - filtered.keys
|
48
|
+
|
44
49
|
ohai "saving to #{output}"
|
45
50
|
|
46
51
|
# save the table
|
47
52
|
out = File.open(output, 'w')
|
48
53
|
samples = filtered.values.map(&:keys).flatten.uniq
|
49
54
|
clusters = filtered.keys
|
50
|
-
out.puts "-,#{clusters.join(',')}"
|
55
|
+
out.puts "-,#{clusters.join(',')},noise"
|
51
56
|
samples.each do |sample|
|
52
57
|
out.print "#{sample}"
|
53
58
|
clusters.each do |cluster|
|
54
59
|
out.print ",#{filtered[cluster][sample]}"
|
55
60
|
end
|
61
|
+
noise_sum = noise.map { |n| cluster_sample_count[n][sample]}.inject(:+)
|
62
|
+
out.print ",#{noise_sum}"
|
56
63
|
out.print "\n"
|
57
64
|
end
|
58
65
|
out.close
|
@@ -6,96 +6,59 @@ module Lederhosen
|
|
6
6
|
class CLI
|
7
7
|
|
8
8
|
desc "otu_table",
|
9
|
-
"create an OTU abundance matrix from
|
9
|
+
"create an OTU abundance matrix from taxonomy count files"
|
10
10
|
|
11
11
|
method_option :files, :type => :string, :required => true
|
12
|
-
|
13
|
-
method_option :
|
14
|
-
:banner => 'prefix prefix'
|
15
|
-
|
16
|
-
method_option :levels, :type => :array, :required => true,
|
17
|
-
:banner => 'valid options: domain, kingdom, phylum, class, order, genus, species, original (or all of them at once)'
|
12
|
+
method_option :level, :type => :string, :required => true
|
13
|
+
method_option :output, :type => :string, :required => true
|
18
14
|
|
19
15
|
def otu_table
|
20
|
-
|
21
|
-
|
22
|
-
|
16
|
+
inputs = Dir[options[:files]]
|
17
|
+
level = options[:level].downcase
|
18
|
+
output = options[:output]
|
23
19
|
|
24
|
-
ohai "
|
20
|
+
ohai "Generating OTU matrix from #{inputs.size} inputs at #{level} level and saving to #{output}."
|
25
21
|
|
26
22
|
# sanity check
|
27
|
-
|
28
|
-
|
29
|
-
end
|
23
|
+
fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom original}.include? level
|
24
|
+
fail 'no inputs matched your glob' if inputs.size == 0
|
30
25
|
|
31
|
-
|
32
|
-
level_sample_cluster_count =
|
33
|
-
Hash.new do |h, k|
|
34
|
-
h[k] = Hash.new do |h, k|
|
35
|
-
h[k] = Hash.new(0)
|
36
|
-
end
|
37
|
-
end
|
26
|
+
sample_cluster_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
|
38
27
|
|
39
28
|
# create a progress bar with the total number of bytes of
|
40
29
|
# the files we're slurping up
|
41
|
-
pbar = ProgressBar.new "loading",
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
else
|
55
|
-
dat[level] || 'unparsed_name'
|
56
|
-
end
|
57
|
-
|
58
|
-
# remove commas from name
|
59
|
-
name = name.tr(',', '_')
|
60
|
-
|
61
|
-
# the next two lines are what is slow
|
62
|
-
level_sample_cluster_count[level][input_file][name] += 1
|
30
|
+
pbar = ProgressBar.new "loading", inputs.size
|
31
|
+
|
32
|
+
inputs.each do |input_file|
|
33
|
+
File.open(input_file).each do |line|
|
34
|
+
next if line =~ /^#/ # skip header(s)
|
35
|
+
line = line.strip.split(',')
|
36
|
+
taxonomy, count = line
|
37
|
+
count = count.to_i
|
38
|
+
tax =
|
39
|
+
if taxonomy == 'unclassified_reads'
|
40
|
+
'unclassified_reads'
|
41
|
+
else
|
42
|
+
parse_taxonomy(taxonomy)[level]
|
63
43
|
end
|
64
|
-
|
65
|
-
end
|
44
|
+
sample_cluster_count[input_file][tax] += count
|
66
45
|
end
|
67
46
|
end
|
68
47
|
|
69
|
-
|
48
|
+
all_clusters = sample_cluster_count.values.map(&:keys).flatten.uniq.sort
|
70
49
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
50
|
+
out = File.open(output, 'w')
|
51
|
+
|
52
|
+
out.puts all_clusters.join(',')
|
53
|
+
inputs.sort.each do |input|
|
54
|
+
out.print "#{input}"
|
55
|
+
all_clusters.each do |c|
|
56
|
+
out.print ",#{sample_cluster_count[input][c]}"
|
76
57
|
end
|
58
|
+
out.print "\n"
|
77
59
|
end
|
78
60
|
|
79
|
-
# save to csv(s)
|
80
|
-
levels.each do |level|
|
81
|
-
|
82
|
-
ohai "saving #{level} table"
|
83
|
-
|
84
|
-
File.open("#{prefix}.#{level}.csv", 'w') do |handle|
|
85
|
-
header = all_names[level].to_a.compact.sort
|
86
|
-
handle.puts "#{level.capitalize},#{header.join(',')}"
|
87
|
-
|
88
|
-
input.each do |sample|
|
89
|
-
handle.print "#{sample}"
|
90
|
-
header.each do |name|
|
91
|
-
handle.print ",#{level_sample_cluster_count[level][sample][name]}"
|
92
|
-
end
|
93
|
-
handle.print "\n"
|
94
|
-
end
|
95
|
-
end
|
96
|
-
end
|
97
61
|
end
|
98
62
|
|
99
|
-
|
100
63
|
end # class CLI
|
101
64
|
end # module Lederhosen
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module Lederhosen

  class CLI

    desc 'separate_unclassified',
         'separate unclassified reads (with or without strict pairing)'

    method_option :uc_file, :type => :string, :required => true
    method_option :reads,   :type => :string, :required => true
    method_option :output,  :type => :string, :required => true
    method_option :strict,  :type => :string, :default => false

    # Write the reads that were not classified in a uc file to a new file.
    #
    # Options:
    #   :uc_file - path of the uc file describing each read
    #   :reads   - path of the reads file the uc file was generated from
    #   :output  - path to write the unclassified reads to
    #   :strict  - false (default) to treat each record on its own, or a
    #              taxonomic level name; when given, records are read in
    #              pairs and both reads are unclassified unless both are
    #              hits that agree at that level
    #
    # Raises ArgumentError in strict mode if the uc file holds an odd
    # number of records.
    def separate_unclassified
      uc_file = options[:uc_file]
      reads   = options[:reads]
      output  = options[:output]
      strict  = options[:strict]

      unclassifieds = Set.new

      # block form closes the uc file even if parsing raises
      File.open(uc_file) do |handle|
        uc = UCParser.new(handle)

        if strict
          uc.each_slice(2) do |left, right|
            # an odd record count leaves the last slice without a partner
            if right.nil?
              raise ArgumentError,
                "strict pairing needs paired records, but #{left.query} has no partner"
            end

            # unclassified when at least one is a miss, or when both are
            # hits whose taxonomies disagree at the requested level
            unclassified =
              left.miss? || right.miss? ||
              parse_taxonomy(left.target)[strict] != parse_taxonomy(right.target)[strict]

            if unclassified
              unclassifieds << left.query
              unclassifieds << right.query
            end
          end
        else
          uc.each do |result|
            unclassifieds << result.query if result.miss?
          end
        end
      end

      ohai "found #{unclassifieds.size} unclassified #{'(strict pairing)' if strict} reads."

      # open reads file, output unclassified reads
      File.open(reads) do |reads_handle|
        File.open(output, 'w') do |out|
          Dna.new(reads_handle).each do |record|
            out.puts record if unclassifieds.include? record.name
          end
        end
      end

    end
  end
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
require 'ostruct'
|
2
|
+
|
3
|
+
module Lederhosen

  # represents a usearch result
  #
  # Wraps the Hash built by UCParser in an OpenStruct so that each field
  # can be read as a method (result.query, result.target, ...).
  class UResult

    # hash - Hash of field name => value for a single uc record
    def initialize(hash)
      @source = OpenStruct.new(hash)
    end

    # anything not defined here is forwarded to the underlying OpenStruct
    def method_missing(method, *args, &block)
      @source.send(method, *args, &block)
    end

    # keeps respond_to? in agreement with method_missing
    def respond_to_missing?(method, include_private = false)
      @source.respond_to?(method, include_private) || super
    end

    # true if the record type is 'H' (hit)
    def hit?
      @source.type == 'H'
    end

    # true if the record type is 'N' (no hit)
    def miss?
      @source.type == 'N'
    end
  end

  # class for parsing UC files, generates UResult objects
  class UCParser
    include Enumerable

    # handle - any object whose #each yields lines (e.g. an open File)
    def initialize(handle)
      @handle = handle
    end

    # Yields one UResult per record line.
    # Comment lines (#), cluster summaries (C) and blank lines are skipped.
    # Returns an Enumerator when called without a block.
    def each
      return enum_for(:each) unless block_given?

      @handle.each do |line|
        next if line =~ /^[#C]/ # skip comments and cluster summaries
        line = line.strip
        next if line.empty?     # e.g. a trailing blank line
        yield UResult.new(parse_usearch_line(line))
      end
    end

    private

    # parse a line of usearch output
    # return a hash in the form:
    # { :type => 'H', :query => '', :target => '', :identity => '0.00', ... }
    # raises ArgumentError if the record type is not one of S, N, H or C
    def parse_usearch_line(str)

      # from http://drive5.com/usearch/manual/ucout.html
      # 1   Record type S, H, C or N (see table below).
      # 2   Cluster number (0-based).
      # 3   Sequence length (S, N and H) or cluster size (C).
      # 4   For H records, percent identity with target.
      # 5   For H records, the strand: + or - for nucleotides, . for proteins.
      # 6   Not used, parsers should ignore this field. Included for backwards compatibility.
      # 7   Not used, parsers should ignore this field. Included for backwards compatibility.
      # 8   Compressed alignment or the symbol '=' (equals sign). The = indicates that the query is 100% identical to the target sequence (field 10).
      # 9   Label of query sequence (always present).
      # 10  Label of target sequence (H records only).

      fields = str.split("\t")
      type   = fields[0]

      dat = {
        :type       => type,
        :cluster_no => fields[1],
        :alignment  => fields[7],
        :query      => fields[8],
        :target     => fields[9],
      }

      # field 3 means something different for cluster records
      extra =
        case type
        when 'S', 'N', 'H' # seeds, misses and hits
          { :length   => fields[2].to_i,
            :identity => fields[3],
            :strand   => fields[4],
          }
        when 'C' # clusters
          { :cluster_size => fields[2].to_i }
        else
          raise ArgumentError, "Do not understand record type #{type}!"
        end

      dat.merge(extra)
    end
  end
end
|
data/lib/lederhosen/version.rb
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
module Lederhosen
|
2
2
|
module Version
|
3
|
-
MAJOR =
|
4
|
-
MINOR =
|
5
|
-
CODENAME = '
|
6
|
-
PATCH =
|
3
|
+
MAJOR = 2
|
4
|
+
MINOR = 0
|
5
|
+
CODENAME = 'Schnittlauchbrot' # changes for minor versions
|
6
|
+
PATCH = 0
|
7
7
|
|
8
8
|
STRING = [MAJOR, MINOR, PATCH].join('.')
|
9
9
|
end
|