lederhosen 1.8.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +1 -1
- data/lederhosen.gemspec +7 -3
- data/lib/lederhosen/no_tasks.rb +18 -18
- data/lib/lederhosen/tasks/count_taxonomies.rb +83 -0
- data/lib/lederhosen/tasks/get_reps.rb +3 -4
- data/lib/lederhosen/tasks/make_udb.rb +2 -2
- data/lib/lederhosen/tasks/otu_filter.rb +8 -1
- data/lib/lederhosen/tasks/otu_table.rb +33 -70
- data/lib/lederhosen/tasks/separate_unclassified.rb +65 -0
- data/lib/lederhosen/uc_parser.rb +88 -0
- data/lib/lederhosen/version.rb +4 -4
- data/readme.md +107 -11
- data/spec/cli_spec.rb +62 -10
- data/spec/data/test.uc +9 -684
- data/spec/data/trimmed/ILT_L_9_B_001.fasta +100 -1596
- data/spec/no_tasks_spec.rb +1 -1
- data/spec/uc_parser_spec.rb +0 -0
- metadata +7 -3
data/Gemfile
CHANGED
data/lederhosen.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "lederhosen"
|
8
|
-
s.version = "
|
8
|
+
s.version = "2.0.0"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Austin G. Davis-Richardson"]
|
12
|
-
s.date = "2013-01-
|
12
|
+
s.date = "2013-01-24"
|
13
13
|
s.description = "Various tools for OTU clustering"
|
14
14
|
s.email = "harekrishna@gmail.com"
|
15
15
|
s.executables = ["lederhosen"]
|
@@ -27,13 +27,16 @@ Gem::Specification.new do |s|
|
|
27
27
|
"lib/lederhosen/cli.rb",
|
28
28
|
"lib/lederhosen/no_tasks.rb",
|
29
29
|
"lib/lederhosen/tasks/cluster.rb",
|
30
|
+
"lib/lederhosen/tasks/count_taxonomies.rb",
|
30
31
|
"lib/lederhosen/tasks/get_reps.rb",
|
31
32
|
"lib/lederhosen/tasks/join_otu_tables.rb",
|
32
33
|
"lib/lederhosen/tasks/make_udb.rb",
|
33
34
|
"lib/lederhosen/tasks/otu_filter.rb",
|
34
35
|
"lib/lederhosen/tasks/otu_table.rb",
|
36
|
+
"lib/lederhosen/tasks/separate_unclassified.rb",
|
35
37
|
"lib/lederhosen/tasks/split_fasta.rb",
|
36
38
|
"lib/lederhosen/tasks/version.rb",
|
39
|
+
"lib/lederhosen/uc_parser.rb",
|
37
40
|
"lib/lederhosen/version.rb",
|
38
41
|
"readme.md",
|
39
42
|
"scripts/illumina_pipeline/.gitignore",
|
@@ -46,7 +49,8 @@ Gem::Specification.new do |s|
|
|
46
49
|
"spec/data/test.uc",
|
47
50
|
"spec/data/trimmed/ILT_L_9_B_001.fasta",
|
48
51
|
"spec/no_tasks_spec.rb",
|
49
|
-
"spec/spec_helper.rb"
|
52
|
+
"spec/spec_helper.rb",
|
53
|
+
"spec/uc_parser_spec.rb"
|
50
54
|
]
|
51
55
|
s.homepage = "http://audy.github.com/lederhosen"
|
52
56
|
s.licenses = ["MIT"]
|
data/lib/lederhosen/no_tasks.rb
CHANGED
@@ -6,26 +6,24 @@ module Lederhosen
|
|
6
6
|
|
7
7
|
no_tasks do
|
8
8
|
|
9
|
-
#
|
10
|
-
# return
|
11
|
-
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
9
|
+
# get a taxonomic description from a line of usearch (uc) output
|
10
|
+
# return 'unclassified_reads' if the result was not a hit
|
11
|
+
# if the result was neither a hit nor a miss (for example, a seed)
|
12
|
+
# return nil
|
13
|
+
# this will probably break for different versions of uc file
|
14
|
+
# as produced by uclust or older versions of usearch
|
15
|
+
def get_tax(s)
|
16
|
+
dat = parse_usearch_line(s.strip)
|
17
|
+
if dat[:type] == 'H'
|
18
|
+
dat[:taxonomic_description].tr(',', '_')
|
19
|
+
elsif dat[:type] == 'N'
|
20
|
+
'unclassified_reads'
|
21
|
+
else
|
22
|
+
nil
|
23
|
+
end
|
24
|
+
end
|
23
25
|
|
24
|
-
# parse taxonomic_description
|
25
|
-
taxonomies = parse_taxonomy(taxonomic_description) rescue { 'original' => str[9] }
|
26
26
|
|
27
|
-
{ :identity => identity }.merge(taxonomies)
|
28
|
-
end
|
29
27
|
|
30
28
|
# detect whether the taxonomy is one of the following
|
31
29
|
# possible formats:
|
@@ -40,6 +38,8 @@ module Lederhosen
|
|
40
38
|
:taxcollector
|
41
39
|
elsif taxonomy =~ /^\d/
|
42
40
|
:greengenes
|
41
|
+
elsif taxonomy.nil?
|
42
|
+
raise "nil ain't no taxonomy I ever heard of!"
|
43
43
|
else
|
44
44
|
:qiime
|
45
45
|
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
module Lederhosen
|
2
|
+
class CLI
|
3
|
+
|
4
|
+
desc 'count_taxonomies', 'count taxonomies from a uc file, generating a csv file with: <taxonomy>,<reads>'
|
5
|
+
|
6
|
+
method_option :input, :type => :string, :required => true
|
7
|
+
method_option :output, :type => :string, :required => true
|
8
|
+
method_option :strict, :type => :string, :default => false,
|
9
|
+
:banner => '<level> only count reads where both taxonomies are in agreement at <level>'
|
10
|
+
|
11
|
+
def count_taxonomies
|
12
|
+
input = options[:input]
|
13
|
+
output = options[:output]
|
14
|
+
strict = options[:strict]
|
15
|
+
|
16
|
+
ohai "generating #{output} from #{input}"
|
17
|
+
|
18
|
+
handle = File.open(input)
|
19
|
+
uc = UCParser.new(handle)
|
20
|
+
|
21
|
+
taxonomy_count =
|
22
|
+
if not strict
|
23
|
+
get_taxonomy_count(uc)
|
24
|
+
elsif strict
|
25
|
+
get_strict_taxonomy_count(uc, strict)
|
26
|
+
end
|
27
|
+
|
28
|
+
handle.close
|
29
|
+
|
30
|
+
out = File.open(output, 'w')
|
31
|
+
out.puts '# taxonomy, number_of_reads'
|
32
|
+
taxonomy_count.each_pair do |taxonomy, count|
|
33
|
+
out.puts "#{taxonomy.tr(',','_')},#{count}"
|
34
|
+
end
|
35
|
+
out.close
|
36
|
+
|
37
|
+
end
|
38
|
+
|
39
|
+
no_tasks do
|
40
|
+
# returns Hash of taxonomy => number_of_reads
|
41
|
+
def get_taxonomy_count(uc)
|
42
|
+
taxonomy_count = Hash.new { |h, k| h[k] = 0 }
|
43
|
+
uc.each do |result|
|
44
|
+
if result.hit?
|
45
|
+
taxonomy_count[result.target] += 1
|
46
|
+
else
|
47
|
+
taxonomy_count['unclassified_reads'] += 1
|
48
|
+
end
|
49
|
+
end
|
50
|
+
taxonomy_count
|
51
|
+
end
|
52
|
+
|
53
|
+
# returns Hash of taxonomy => number_of_reads
|
54
|
+
# if a pair of reads do not agree at a taxonomic level,
|
55
|
+
# or if at least one is unclassified, bot reads are counted
|
56
|
+
# as unclassified_reads
|
57
|
+
def get_strict_taxonomy_count(uc, level)
|
58
|
+
taxonomy_count = Hash.new { |h, k| h[k] = 0 }
|
59
|
+
# TODO: I'm making a block for results because I don't know how to
|
60
|
+
# make results return an Enumerator when not given a block
|
61
|
+
uc.each_slice(2) do |left, right|
|
62
|
+
if left.miss? or right.miss? # at least one is a miss
|
63
|
+
taxonomy_count['unclassified_reads'] += 2
|
64
|
+
# both are hits, check taxonomies
|
65
|
+
else
|
66
|
+
ta = parse_taxonomy(left.target)
|
67
|
+
tb = parse_taxonomy(right.target)
|
68
|
+
# they match up, count both separately
|
69
|
+
if ta[level] == tb[level]
|
70
|
+
taxonomy_count[left.target] += 1
|
71
|
+
taxonomy_count[right.target] += 1
|
72
|
+
# they don't match up, count as unclassified
|
73
|
+
else
|
74
|
+
taxonomy_count['unclassified_reads'] += 2
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end # results.each_slice
|
78
|
+
taxonomy_count
|
79
|
+
end
|
80
|
+
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
@@ -23,10 +23,9 @@ module Lederhosen
|
|
23
23
|
|
24
24
|
inputs.each do |input|
|
25
25
|
File.open(input) do |handle|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
taxa << header['original'] rescue nil
|
26
|
+
results = UCParser.new(handle)
|
27
|
+
results.each do |result|
|
28
|
+
taxa << result.target if result.hit?
|
30
29
|
end
|
31
30
|
end
|
32
31
|
end
|
@@ -3,8 +3,8 @@ module Lederhosen
|
|
3
3
|
|
4
4
|
desc 'make_udb', 'format database for usearch'
|
5
5
|
|
6
|
-
method_option :input,
|
7
|
-
method_option :output,
|
6
|
+
method_option :input, :type => :string, :required => true
|
7
|
+
method_option :output, :type => :string, :required => true
|
8
8
|
|
9
9
|
def make_udb
|
10
10
|
input = options[:input]
|
@@ -39,20 +39,27 @@ module Lederhosen
|
|
39
39
|
ohai "filtering"
|
40
40
|
|
41
41
|
# filter sample_cluster_count
|
42
|
+
# todo: move filtered reads to 'unclassified_reads' classification
|
42
43
|
filtered = cluster_sample_count.reject { |k, v| v.reject { |k, v| v < reads }.size < min_samples }
|
43
44
|
|
45
|
+
# use functional programming they said
|
46
|
+
# it will make your better they said
|
47
|
+
noise = cluster_sample_count.keys - filtered.keys
|
48
|
+
|
44
49
|
ohai "saving to #{output}"
|
45
50
|
|
46
51
|
# save the table
|
47
52
|
out = File.open(output, 'w')
|
48
53
|
samples = filtered.values.map(&:keys).flatten.uniq
|
49
54
|
clusters = filtered.keys
|
50
|
-
out.puts "-,#{clusters.join(',')}"
|
55
|
+
out.puts "-,#{clusters.join(',')},noise"
|
51
56
|
samples.each do |sample|
|
52
57
|
out.print "#{sample}"
|
53
58
|
clusters.each do |cluster|
|
54
59
|
out.print ",#{filtered[cluster][sample]}"
|
55
60
|
end
|
61
|
+
noise_sum = noise.map { |n| cluster_sample_count[n][sample]}.inject(:+)
|
62
|
+
out.print ",#{noise_sum}"
|
56
63
|
out.print "\n"
|
57
64
|
end
|
58
65
|
out.close
|
@@ -6,96 +6,59 @@ module Lederhosen
|
|
6
6
|
class CLI
|
7
7
|
|
8
8
|
desc "otu_table",
|
9
|
-
"create an OTU abundance matrix from
|
9
|
+
"create an OTU abundance matrix from taxonomy count files"
|
10
10
|
|
11
11
|
method_option :files, :type => :string, :required => true
|
12
|
-
|
13
|
-
method_option :
|
14
|
-
:banner => 'prefix prefix'
|
15
|
-
|
16
|
-
method_option :levels, :type => :array, :required => true,
|
17
|
-
:banner => 'valid options: domain, kingdom, phylum, class, order, genus, species, original (or all of them at once)'
|
12
|
+
method_option :level, :type => :string, :required => true
|
13
|
+
method_option :output, :type => :string, :required => true
|
18
14
|
|
19
15
|
def otu_table
|
20
|
-
|
21
|
-
|
22
|
-
|
16
|
+
inputs = Dir[options[:files]]
|
17
|
+
level = options[:level].downcase
|
18
|
+
output = options[:output]
|
23
19
|
|
24
|
-
ohai "
|
20
|
+
ohai "Generating OTU matrix from #{inputs.size} inputs at #{level} level and saving to #{output}."
|
25
21
|
|
26
22
|
# sanity check
|
27
|
-
|
28
|
-
|
29
|
-
end
|
23
|
+
fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom original}.include? level
|
24
|
+
fail 'no inputs matched your glob' if inputs.size == 0
|
30
25
|
|
31
|
-
|
32
|
-
level_sample_cluster_count =
|
33
|
-
Hash.new do |h, k|
|
34
|
-
h[k] = Hash.new do |h, k|
|
35
|
-
h[k] = Hash.new(0)
|
36
|
-
end
|
37
|
-
end
|
26
|
+
sample_cluster_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
|
38
27
|
|
39
28
|
# create a progress bar with the total number of bytes of
|
40
29
|
# the files we're slurping up
|
41
|
-
pbar = ProgressBar.new "loading",
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
else
|
55
|
-
dat[level] || 'unparsed_name'
|
56
|
-
end
|
57
|
-
|
58
|
-
# remove commas from name
|
59
|
-
name = name.tr(',', '_')
|
60
|
-
|
61
|
-
# the next two lines are what is slow
|
62
|
-
level_sample_cluster_count[level][input_file][name] += 1
|
30
|
+
pbar = ProgressBar.new "loading", inputs.size
|
31
|
+
|
32
|
+
inputs.each do |input_file|
|
33
|
+
File.open(input_file).each do |line|
|
34
|
+
next if line =~ /^#/ # skip header(s)
|
35
|
+
line = line.strip.split(',')
|
36
|
+
taxonomy, count = line
|
37
|
+
count = count.to_i
|
38
|
+
tax =
|
39
|
+
if taxonomy == 'unclassified_reads'
|
40
|
+
'unclassified_reads'
|
41
|
+
else
|
42
|
+
parse_taxonomy(taxonomy)[level]
|
63
43
|
end
|
64
|
-
|
65
|
-
end
|
44
|
+
sample_cluster_count[input_file][tax] += count
|
66
45
|
end
|
67
46
|
end
|
68
47
|
|
69
|
-
|
48
|
+
all_clusters = sample_cluster_count.values.map(&:keys).flatten.uniq.sort
|
70
49
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
50
|
+
out = File.open(output, 'w')
|
51
|
+
|
52
|
+
out.puts all_clusters.join(',')
|
53
|
+
inputs.sort.each do |input|
|
54
|
+
out.print "#{input}"
|
55
|
+
all_clusters.each do |c|
|
56
|
+
out.print ",#{sample_cluster_count[input][c]}"
|
76
57
|
end
|
58
|
+
out.print "\n"
|
77
59
|
end
|
78
60
|
|
79
|
-
# save to csv(s)
|
80
|
-
levels.each do |level|
|
81
|
-
|
82
|
-
ohai "saving #{level} table"
|
83
|
-
|
84
|
-
File.open("#{prefix}.#{level}.csv", 'w') do |handle|
|
85
|
-
header = all_names[level].to_a.compact.sort
|
86
|
-
handle.puts "#{level.capitalize},#{header.join(',')}"
|
87
|
-
|
88
|
-
input.each do |sample|
|
89
|
-
handle.print "#{sample}"
|
90
|
-
header.each do |name|
|
91
|
-
handle.print ",#{level_sample_cluster_count[level][sample][name]}"
|
92
|
-
end
|
93
|
-
handle.print "\n"
|
94
|
-
end
|
95
|
-
end
|
96
|
-
end
|
97
61
|
end
|
98
62
|
|
99
|
-
|
100
63
|
end # class CLI
|
101
64
|
end # module Lederhosen
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module Lederhosen
|
4
|
+
|
5
|
+
class CLI
|
6
|
+
|
7
|
+
desc 'separate_unclassified',
|
8
|
+
'separate unclassified reads (with or without strict pairing)'
|
9
|
+
|
10
|
+
method_option :uc_file, :type => :string, :required => true
|
11
|
+
method_option :reads, :type => :string, :required => true
|
12
|
+
method_option :output, :type => :string, :required => true
|
13
|
+
method_option :strict, :type => :string, :default => false
|
14
|
+
|
15
|
+
def separate_unclassified
|
16
|
+
uc_file = options[:uc_file]
|
17
|
+
reads = options[:reads]
|
18
|
+
output = options[:output]
|
19
|
+
strict = options[:strict]
|
20
|
+
|
21
|
+
unclassifieds = Set.new
|
22
|
+
handle = File.open(uc_file)
|
23
|
+
uc = UCParser.new(handle)
|
24
|
+
|
25
|
+
if not strict
|
26
|
+
uc.each do |result|
|
27
|
+
unclassifieds << result.query if result.miss?
|
28
|
+
end
|
29
|
+
|
30
|
+
elsif strict
|
31
|
+
|
32
|
+
uc.each_slice(2) do |left, right|
|
33
|
+
if left.miss? || right.miss? # at least one is a miss
|
34
|
+
unclassifieds << left.query
|
35
|
+
unclassifieds << right.query
|
36
|
+
# both are hits, check taxonomies
|
37
|
+
else
|
38
|
+
ta = parse_taxonomy(right.target)
|
39
|
+
tb = parse_taxonomy(left.target)
|
40
|
+
# inconsistent assignment or at least one is a miss
|
41
|
+
if (ta[strict] != tb[strict])
|
42
|
+
unclassifieds << left.query
|
43
|
+
unclassifieds << right.query
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
49
|
+
|
50
|
+
ohai "found #{unclassifieds.size} unclassified #{'(strict pairing)' if strict} reads."
|
51
|
+
|
52
|
+
handle.close
|
53
|
+
|
54
|
+
# open fasta file, output unclassified reads
|
55
|
+
out = File.open(output, 'w')
|
56
|
+
Dna.new(File.open(reads)).each do |record|
|
57
|
+
if unclassifieds.include? record.name
|
58
|
+
out.puts record
|
59
|
+
end
|
60
|
+
end
|
61
|
+
out.close
|
62
|
+
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
require 'ostruct'
|
2
|
+
|
3
|
+
module Lederhosen
|
4
|
+
|
5
|
+
# represents a usearch result
|
6
|
+
class UResult
|
7
|
+
|
8
|
+
def initialize(hash)
|
9
|
+
@source = OpenStruct.new(hash)
|
10
|
+
end
|
11
|
+
|
12
|
+
def method_missing(method, *args, &block)
|
13
|
+
@source.send(method, *args, &block)
|
14
|
+
end
|
15
|
+
|
16
|
+
def hit?
|
17
|
+
@source.type == 'H'
|
18
|
+
end
|
19
|
+
|
20
|
+
def miss?
|
21
|
+
@source.type == 'N'
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
# class for parsing UC files, generates UResult objects
|
26
|
+
class UCParser
|
27
|
+
include Enumerable
|
28
|
+
|
29
|
+
def initialize(handle)
|
30
|
+
@handle = handle
|
31
|
+
end
|
32
|
+
|
33
|
+
def each(&block)
|
34
|
+
@handle.each do |line|
|
35
|
+
next if line =~ /^[#C]/ # skip comments and cluster summaries
|
36
|
+
dat = parse_usearch_line(line.strip)
|
37
|
+
result = UResult.new(dat)
|
38
|
+
block.call(result)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
private
|
43
|
+
|
44
|
+
# parse a line of usearch prefix
|
45
|
+
# return a hash in the form:
|
46
|
+
# { :taxonomy => '', :identity => '0.00', ... }
|
47
|
+
# unless the line is not a "hit" in which case
|
48
|
+
# the function returns nil
|
49
|
+
def parse_usearch_line(str)
|
50
|
+
|
51
|
+
# from http://drive5.com/usearch/manual/ucout.html
|
52
|
+
# 1 Record type S, H, C or N (see table below).
|
53
|
+
# 2 Cluster number (0-based).
|
54
|
+
# 3 Sequence length (S, N and H) or cluster size (C).
|
55
|
+
# 4 For H records, percent identity with target.
|
56
|
+
# 5 For H records, the strand: + or - for nucleotides, . for proteins.
|
57
|
+
# 6 Not used, parsers should ignore this field. Included for backwards compatibility.
|
58
|
+
# 7 Not used, parsers should ignore this field. Included for backwards compatibility.
|
59
|
+
# 8 Compressed alignment or the symbol '=' (equals sign). The = indicates that the query is 100% identical to the target sequence (field 10).
|
60
|
+
# 9 Label of query sequence (always present).
|
61
|
+
# 10 Label of target sequence (H records only).
|
62
|
+
|
63
|
+
str = str.split("\t")
|
64
|
+
|
65
|
+
dat = {
|
66
|
+
:type => str[0],
|
67
|
+
:cluster_no => str[1],
|
68
|
+
:alignment => str[7],
|
69
|
+
:query => str[8],
|
70
|
+
:target => str[9],
|
71
|
+
}
|
72
|
+
|
73
|
+
r =
|
74
|
+
if dat[:type] =~ /[SNH]/ # hits
|
75
|
+
{ :length => str[2].to_i,
|
76
|
+
:identity => str[3],
|
77
|
+
:strand => str[4],
|
78
|
+
}
|
79
|
+
elsif dat[:type] == 'C' # clusters
|
80
|
+
{ :cluster_size => str[2].to_i }
|
81
|
+
else
|
82
|
+
raise Exception, "Do not understand record type #{str[0]}!"
|
83
|
+
end
|
84
|
+
|
85
|
+
dat.merge(r)
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
data/lib/lederhosen/version.rb
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
module Lederhosen
|
2
2
|
module Version
|
3
|
-
MAJOR =
|
4
|
-
MINOR =
|
5
|
-
CODENAME = '
|
6
|
-
PATCH =
|
3
|
+
MAJOR = 2
|
4
|
+
MINOR = 0
|
5
|
+
CODENAME = 'Schnittlauchbrot' # changes for minor versions
|
6
|
+
PATCH = 0
|
7
7
|
|
8
8
|
STRING = [MAJOR, MINOR, PATCH].join('.')
|
9
9
|
end
|