lederhosen 0.1.6 → 0.1.7

data/Gemfile CHANGED
@@ -4,4 +4,4 @@ gem 'thor'
  gem 'rspec'
  gem 'dna'
  gem 'progressbar'
- gem 'awesome_print'
+ gem 'awesome_print'
data/bin/lederhosen CHANGED
@@ -4,4 +4,4 @@ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'lederho

  fail "you need to install uclust and have it in your $PATH" if `which uclust` == ''

- Lederhosen::CLI.start
+ Lederhosen::CLI.start
@@ -0,0 +1,51 @@
+ #!/bin/bash
+
+ set -e
+ set -x
+
+ # Hierarchical OTU clustering
+ # Austin G. Davis-Richardson
+ # <harekrishna at gmail dot com>
+ # http://github.com/audy/lederhosen
+
+ reads='sorted.fasta'
+ out='h_clustering'
+
+ mkdir -p $out
+
+ # initial clustering at 80%
+ lederhosen cluster --input=$reads --output=$out/clusters_0.80.uc --identity=0.80
+
+ # filter UC file
+ lederhosen uc_filter --input=$out/clusters_0.80.uc --output=$out/clusters_0.80.uc.filtered --reads=1 --samples=1
+
+ # get reads for each cluster
+ mkdir -p $out/split_80
+ lederhosen split --clusters=$out/clusters_0.80.uc.filtered --reads=$reads --out-dir=$out/split_80/
+
+ # now cluster each of those at 90%
+ for fasta in $out/split_80/*.fasta
+ do
+
+ # sort (awww, do I really have to do this again?)
+ lederhosen sort --input=$fasta --output=$fasta.sorted
+
+ # cluster
+ lederhosen cluster --input=$fasta.sorted --output=$fasta.uc --identity=0.90
+
+ # split
+ split=$out/split_80.90_$(basename $fasta .fasta)
+ lederhosen split --clusters=$fasta.uc --reads=$fasta --out-dir=$split
+ done
+
+ # Do it again at 95%
+ for fasta in $out/split_80/split_*_90.fasta/*.fasta
+ do
+ # cluster
+ lederhosen cluster --input=$fasta --output=$fasta.uc --identity=90
+
+ # split
+ split=$outdir/80.90.$fasta.fasta
+ mkdir -p $split
+ lederhosen split --clusters=$fasta.uc --reads=$input --out-dir=$split
+ done
data/examples/pipeline.sh CHANGED
@@ -68,4 +68,4 @@ do
  --database=$taxcollector
  done

- echo "complete!"
+ echo "complete!"
data/lederhosen.gemspec CHANGED
@@ -25,4 +25,4 @@ Gem::Specification.new do |s|
  s.add_dependency('progressbar')
  s.add_dependency('bundler')
  s.add_dependency('awesome_print')
- end
+ end
@@ -48,7 +48,7 @@ module Lederhosen
  end
  @buffer = Hash.new { |h, k| h[k] = Array.new }
  end
-
+
  end

- end
+ end
@@ -17,8 +17,13 @@ module Lederhosen
  cutoff = args[:cutoff] || 20
  min_length = args[:min_length] || 70

- left_handle = File.open left
- right_handle = File.open right
+ left_handle, right_handle =
+ begin
+ [ Zlib::GzipReader.open(left), Zlib::GzipReader.open(right)]
+ rescue Zlib::GzipFile::Error
+ [ File.open(left), File.open(right) ]
+ end
+
  out_handle = File.open out, 'w'

  left_reads = Dna.new left_handle
@@ -57,9 +62,9 @@ module Lederhosen
  min = args[:min] || 20
  offset = args[:cutoff] || 64

- _sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
-
- dna.quality.each_byte.each_with_index do |b, a|
+ _sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
+
+ dna.quality.each_byte.each_with_index do |b, a|
  _sum += (b - offset - min)
  if _sum > _max
  _max = _sum
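The first hunk above replaces the plain File.open calls in trim_pairs with a gzip-aware open: try gzip first, fall back to plain text when Zlib complains. A minimal standalone sketch of that pattern (the helper name and the sample path are illustrative, not part of lederhosen):

    require 'zlib'

    # Try to read a file as gzip; if it is not gzip-compressed,
    # Zlib raises Zlib::GzipFile::Error and we fall back to File.open.
    def open_maybe_gzipped(path)
      Zlib::GzipReader.open(path)
    rescue Zlib::GzipFile::Error
      File.open(path)
    end

    handle = open_maybe_gzipped('s_1_1_sequence.txt.gz')
    puts handle.gets
    handle.close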
@@ -3,84 +3,84 @@
  #

  module Lederhosen
- class CLI
-
- desc "add_names",
- "add names to otu abundance matrix using blat output"
-
- method_option :blat, :type => :string, :required => true
- method_option :table, :type => :string, :required => true
- method_option :level, :type => :string, :required => true
- method_option :output, :type => :string, :required => false
-
- def add_names
- blat = options[:blat]
- table = options[:table]
- level = options[:level]
- output = options[:output] || $stdout
-
- levels = { 'kingdom' => 0,
- 'domain' => 0,
- 'phylum' => 1,
- 'class' => 2,
- 'order' => 3,
- 'family' => 4,
- 'genus' => 5,
- 'species' => 6 }
-
- fail "unknown level. try #{levels.keys.join(', ')}" unless levels.include? level
-
- # Corresponds with the numbers used in the TaxCollector database
- # taxonomic descriptions
- level_no = levels[level]
-
- # map cluster_id to taxonomic description
- # default is the cluster_id itself in case
- # the cluster was not classified.
- clusterid_to_name = Hash.new { |h, k| h[k] = k }
-
- # map clusterid to name using blat output
- ohai "loading BLAT output from #{blat}"
- File.open(blat) do |handle|
- handle.each do |line|
- line = line.strip.split
-
- # Only get first match
- # TODO something smarter here
- cluster_id = line[0].split(':')[3]
- next if clusterid_to_name.include? cluster_id
-
- taxonomic_description = line[1]
-
- # match by level_no
- # Example:
- # [0]Bacteria;[1]Actinobacteria;[2]Actinobacteria;[3]Acidimicrobiales;[4]Acidimicrobiaceae;[5]Acidimicrobium;[6]Acidimicrobium_ferrooxidans;
- # I want to match Actinobacteria given level_no = 2
- level_name = taxonomic_description.match(/\[#{level_no}\](\w*)[;\[]/)[1] rescue next
-
- clusterid_to_name[cluster_id] = level_name
- end
- end
-
- # load table, replace cluster names with taxonomic descriptions
- output = File.open(output, 'w') unless output == $stdout
- ohai "replacing names in #{table}"
- File.open(table) do |handle|
-
- # read in header, replace clusterids to names
- header = handle.gets.strip.split(',')
- header[1..-1] = header[1..-1].map { |x| clusterid_to_name[x] }
-
- # print new header
- output.puts header.join(',')
-
- # print rest of table
- handle.each { |l| output.print l }
- end
-
- # print status message
- ohai "Got #{clusterid_to_name.keys.reject { |x| x =~ /cluster/ }.size} names (#{clusterid_to_name.keys.size} total)"
- end
-
- end
+ class CLI
+
+ desc "add_names",
+ "add names to otu abundance matrix using blat output"
+
+ method_option :blat, :type => :string, :required => true
+ method_option :table, :type => :string, :required => true
+ method_option :level, :type => :string, :required => true
+ method_option :output, :type => :string, :required => false
+
+ def add_names
+ blat = options[:blat]
+ table = options[:table]
+ level = options[:level]
+ output = options[:output] || $stdout
+
+ levels = { 'kingdom' => 0,
+ 'domain' => 0,
+ 'phylum' => 1,
+ 'class' => 2,
+ 'order' => 3,
+ 'family' => 4,
+ 'genus' => 5,
+ 'species' => 6 }
+
+ fail "unknown level. try #{levels.keys.join(', ')}" unless levels.include? level
+
+ # Corresponds with the numbers used in the TaxCollector database
+ # taxonomic descriptions
+ level_no = levels[level]
+
+ # map cluster_id to taxonomic description
+ # default is the cluster_id itself in case
+ # the cluster was not classified.
+ clusterid_to_name = Hash.new { |h, k| h[k] = k }
+
+ # map clusterid to name using blat output
+ ohai "loading BLAT output from #{blat}"
+ File.open(blat) do |handle|
+ handle.each do |line|
+ line = line.strip.split
+
+ # Only get first match
+ # TODO something smarter here
+ cluster_id = line[0].split(':')[3]
+ next if clusterid_to_name.include? cluster_id
+
+ taxonomic_description = line[1]
+
+ # match by level_no
+ # Example:
+ # [0]Bacteria;[1]Actinobacteria;[2]Actinobacteria;[3]Acidimicrobiales;[4]Acidimicrobiaceae;[5]Acidimicrobium;[6]Acidimicrobium_ferrooxidans;
+ # I want to match Actinobacteria given level_no = 2
+ level_name = taxonomic_description.match(/\[#{level_no}\](\w*)[;\[]/)[1] rescue next
+
+ clusterid_to_name[cluster_id] = level_name
+ end
+ end
+
+ # load table, replace cluster names with taxonomic descriptions
+ output = File.open(output, 'w') unless output == $stdout
+ ohai "replacing names in #{table}"
+ File.open(table) do |handle|
+
+ # read in header, replace clusterids to names
+ header = handle.gets.strip.split(',')
+ header[1..-1] = header[1..-1].map { |x| clusterid_to_name[x] }
+
+ # print new header
+ output.puts header.join(',')
+
+ # print rest of table
+ handle.each { |l| output.print l }
+ end
+
+ # print status message
+ ohai "Got #{clusterid_to_name.keys.reject { |x| x =~ /cluster/ }.size} names (#{clusterid_to_name.keys.size} total)"
+ end
+
+ end
  end
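The add_names task above pulls a single rank out of a TaxCollector-style lineage string with the regex /\[#{level_no}\](\w*)[;\[]/. A small sketch of that match in isolation, using the example lineage from the comment in the hunk (values are only for illustration):

    # Extract the rank tagged [2] from a TaxCollector lineage string.
    lineage = '[0]Bacteria;[1]Actinobacteria;[2]Actinobacteria;[3]Acidimicrobiales;' \
              '[4]Acidimicrobiaceae;[5]Acidimicrobium;[6]Acidimicrobium_ferrooxidans;'

    level_no = 2
    name = lineage.match(/\[#{level_no}\](\w*)[;\[]/)[1]
    puts name # => "Actinobacteria"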
@@ -16,7 +16,7 @@ module Lederhosen
  identity = options[:identity]
  output = options[:output]
  input = options[:input]
-
+
  cmd = [
  'uclust',
  "--input #{input}",
@@ -25,9 +25,9 @@ module Lederhosen
  '-t=dna',
  '-q=dna',
  '-out=blast8',
- output
+ output
  ]
-
+
  exec cmd.join(' ')

  end
@@ -1,45 +1,45 @@
  module Lederhosen
- class CLI
-
- desc 'otu_filter', 'works like uc_filter but uses an OTU table as input'
-
- method_option :input, :type => :string, :required => true
- method_option :output, :type => :string, :required => true
- method_option :reads, :type => :numeric, :required => true
- method_option :samples, :type => :numeric, :required => true
-
- def otu_filter
- input = options[:input]
- output = options[:output]
- reads = options[:reads]
- samples = options[:samples]
-
- ##
- # Iterate over otu table line by line.
- # Only print if cluster meets criteria
- #
- kept = 0
- File.open(input) do |handle|
- header = handle.gets.strip
- header = header.split(',')
- samples = header[1..-1]
-
- puts header.join(',')
-
- handle.each do |line|
- line = line.strip.split(',')
- cluster_no = line[0]
- counts = line[1..-1].collect { |x| x.to_i }
-
- # should be the same as uc_filter
- if counts.reject { |x| x < reads }.length > samples
- puts line.join(',')
- kept += 1
- end
- end
- end
- ohai "kept #{kept} clusters."
- end
-
- end
+ class CLI
+
+ desc 'otu_filter', 'works like uc_filter but uses an OTU table as input'
+
+ method_option :input, :type => :string, :required => true
+ method_option :output, :type => :string, :required => true
+ method_option :reads, :type => :numeric, :required => true
+ method_option :samples, :type => :numeric, :required => true
+
+ def otu_filter
+ input = options[:input]
+ output = options[:output]
+ reads = options[:reads]
+ samples = options[:samples]
+
+ ##
+ # Iterate over otu table line by line.
+ # Only print if cluster meets criteria
+ #
+ kept = 0
+ File.open(input) do |handle|
+ header = handle.gets.strip
+ header = header.split(',')
+ samples = header[1..-1]
+
+ puts header.join(',')
+
+ handle.each do |line|
+ line = line.strip.split(',')
+ cluster_no = line[0]
+ counts = line[1..-1].collect { |x| x.to_i }
+
+ # should be the same as uc_filter
+ if counts.reject { |x| x < reads }.length > samples
+ puts line.join(',')
+ kept += 1
+ end
+ end
+ end
+ ohai "kept #{kept} clusters."
+ end
+
+ end
  end
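In otu_filter above, a cluster row is kept when the number of samples with at least `reads` reads exceeds the `samples` threshold. A tiny worked example of that test (the numbers are invented):

    reads   = 5                                 # minimum reads per sample
    samples = 2                                 # number of passing samples must exceed this
    counts  = [0, 7, 12, 3, 9]                  # per-sample counts for one cluster

    passing = counts.reject { |x| x < reads }   # => [7, 12, 9]
    puts passing.length > samples               # => true, so the cluster is kept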
@@ -17,32 +17,32 @@ module Lederhosen
  input = options[:clusters]
  output = options[:output]
  joined_reads = options[:joined]
-
+
  # Load cluster table

- clstr_info = Helpers.load_uc_file input
+ clstr_info = Helpers.load_uc_file input
  clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
  clstrnr_to_seed = clstr_info[:clstrnr_to_seed]
  samples = clstr_info[:samples]

  # print OTU abundance matrix
- # clusters as columns
- # samples as rows
+ # clusters as columns
+ # samples as rows

  File.open("#{output}.csv", 'w') do |h|
  samples = samples.sort
  clusters = clstr_counts.keys

  # print header (cluster names)
- h.puts '-' + SEP + clusters.map { |x| "cluster-#{x}" }.join(SEP)
-
- samples.each do |sample|
- h.print sample
- clusters.each do |cluster|
- h.print "#{SEP}#{clstr_counts[cluster][sample]}"
- end
- h.print "\n"
- end
+ h.puts '-' + SEP + clusters.map { |x| "cluster-#{x}" }.join(SEP)
+
+ samples.each do |sample|
+ h.print sample
+ clusters.each do |cluster|
+ h.print "#{SEP}#{clstr_counts[cluster][sample]}"
+ end
+ h.print "\n"
+ end
  end
  end

@@ -19,7 +19,7 @@ module Lederhosen


  # Load cluster table!
- clstr_info = Helpers.load_uc_file input
+ clstr_info = Helpers.load_uc_file input
  clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
  seed_to_clstrnr = clstr_info[:seed_to_clstrnr]
  samples = clstr_info[:samples]
@@ -36,7 +36,7 @@ module Lederhosen
  end
  end
  end
-
+
  out_handle.close
  end

@@ -3,46 +3,46 @@
  #

  module Lederhosen
- class CLI
-
- desc 'squish', 'merge cell values (reads) in a csv file by column name (cluster)'
-
- method_option :csv_file, :type => :string, :required => true
- method_option :output, :type => :string, :required => false
-
- def squish
- csv_file = options[:csv_file]
- output = options[:output] || $stdout
-
- # sample_name -> column name -> total number of reads
- total_by_sample_by_column = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
- column_names = '' # scope
- # Load CSV file, merge counts in columns with the same name
- File.open(csv_file) do |handle|
- column_names = handle.gets.strip.split(',')[1..-1]
- handle.each do |line|
- line = line.strip.split(',')
- sample = line[0]
- line[1..-1].zip(column_names) do |reads, column_name|
- total_by_sample_by_column[sample][column_name] += reads.to_i
- end
- end
- end
-
- output = File.open(output, 'w') rescue $stdout
-
- # print the new, squished csv file
- column_names.uniq!.sort!
- output.puts "-,#{column_names.join(',')}"
- total_by_sample_by_column.each_pair do |sample_id, row|
- output.print "#{sample_id}"
- column_names.each do |column_name|
- output.print ",#{row[column_name]}"
- end
- output.print "\n"
- end
-
- output.close
- end
- end
+ class CLI
+
+ desc 'squish', 'merge cell values (reads) in a csv file by column name (cluster)'
+
+ method_option :csv_file, :type => :string, :required => true
+ method_option :output, :type => :string, :required => false
+
+ def squish
+ csv_file = options[:csv_file]
+ output = options[:output] || $stdout
+
+ # sample_name -> column name -> total number of reads
+ total_by_sample_by_column = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
+ column_names = '' # scope
+ # Load CSV file, merge counts in columns with the same name
+ File.open(csv_file) do |handle|
+ column_names = handle.gets.strip.split(',')[1..-1]
+ handle.each do |line|
+ line = line.strip.split(',')
+ sample = line[0]
+ line[1..-1].zip(column_names) do |reads, column_name|
+ total_by_sample_by_column[sample][column_name] += reads.to_i
+ end
+ end
+ end
+
+ output = File.open(output, 'w') rescue $stdout
+
+ # print the new, squished csv file
+ column_names.uniq!.sort!
+ output.puts "-,#{column_names.join(',')}"
+ total_by_sample_by_column.each_pair do |sample_id, row|
+ output.print "#{sample_id}"
+ column_names.each do |column_name|
+ output.print ",#{row[column_name]}"
+ end
+ output.print "\n"
+ end
+
+ output.close
+ end
+ end
  end
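The squish task above collapses columns that share a name (for example, two clusters assigned the same genus by add_names) by summing their read counts per sample. A small illustration of the merging step on in-memory data (the table values are invented):

    # header:  -,Acidimicrobium,Acidimicrobium,Bacillus
    # row:     sample_1,3,4,10
    column_names = %w[Acidimicrobium Acidimicrobium Bacillus]
    row          = ['3', '4', '10']

    totals = Hash.new(0)
    row.zip(column_names) do |reads, column_name|
      totals[column_name] += reads.to_i
    end

    p totals # => {"Acidimicrobium"=>7, "Bacillus"=>10}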
data/lib/lederhosen.rb CHANGED
@@ -4,6 +4,7 @@ require 'dna'
  require 'set'
  require 'progressbar'
  require 'awesome_print'
+ require 'zlib'

  Dir.glob(File.join(File.dirname(__FILE__), 'lederhosen', '*.rb')).each { |f| require f }

@@ -13,4 +14,4 @@ class String
  k -= 1
  (0..(self.length-k-1)).collect { |i| self[i..i+k] }
  end
- end
+ end
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Lederhosen
- VERSION = '0.1.6'
+ VERSION = '0.1.7'
  end
data/readme.md CHANGED
@@ -1,6 +1,6 @@
  # Lederhosen

- Cluster raw Illumina 16S rRNA amplicon data to generate OTUs. Use at your own risk.
+ Cluster raw Illumina 16S rRNA amplicon data to generate OTUs.

  ## How do I get Lederhosen?

@@ -33,9 +33,9 @@ Lederhosen is invoked by typing `lederhosen [TASK]`

  ### trim

- Trim (Illumina) reads using quality scores. Output will be a directory of fasta files.
+ Trim (Illumina) reads using quality scores. Output will be a directory of fasta files. Reads can optionally be gzipped.

- lederhosen trim --reads_dir=reads/* --out_dir=trimmed/
+ lederhosen trim --reads_dir=reads/*.txt --out_dir=trimmed/

  ### join

@@ -43,6 +43,10 @@ Join paired reads from all samples end-to-end. This method enables the use of uc

  lederhosen join --trimmed=trimmed/*.fasta --output=joined.fasta

+ If your reads are not paired, then you do not need to do this step. Instead, concatenate all of the trimmed reads files.
+
+ cat trimmed/*.fasta > joined.fasta
+

  ### sort

  Sort reads by length. This is a requirement for uclust's single-linkage clustering algorithim.
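One usage note on the trim change above: since trim_pairs now falls back between gzip and plain-text input (see the helpers hunk earlier in this diff), the trim task should also accept gzipped qseq files. The glob below is an assumption for illustration, not taken from the readme:

    lederhosen trim --reads_dir=reads/*.txt.gz --out_dir=trimmed/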
Binary files changed (4)
data/spec/helpers_spec.rb CHANGED
@@ -1,26 +1,26 @@
  require 'spec_helper'

  describe Lederhosen::Helpers do
-
- let (:groups) { Lederhosen::Helpers.get_grouped_qseq_files('spec/data/IL*.txt') }
-
+
+ let (:groups) { Lederhosen::Helpers.get_grouped_qseq_files('spec/data/IL*.txt.gz') }
+
  it 'should have a method for grouping QSEQ files' do
  groups.length.should == 2
  end

  it 'should have a method for trimming sequences' do
  reads = groups.values.first.first
- record = File.open(reads) do |handle|
+ record = Zlib::GzipReader.open(reads) do |handle|
  Dna.new(handle).first
  end
  # I should probably test with a bad read
- Lederhosen::Helpers.trim(record).length.should == 79
+ Lederhosen::Helpers.trim(record).length.should == 58
  end

  it 'should be able to trim pairs of qseq files, outputting fasta file' do
  reads = groups.values.first
- Lederhosen::Helpers.trim_pairs reads[0], reads[1], '/tmp/munchen_trim_test.fasta'
+ Lederhosen::Helpers.trim_pairs reads[0], reads[1], "#{$test_dir}/munchen_trim_test.fasta"
  # this test will break if trim parameters change
- File.read('/tmp/munchen_trim_test.fasta').grep(/^>/).length.should be_even
+ File.read("#{$test_dir}/munchen_trim_test.fasta").grep(/^>/).length.should be_even
  end
  end
data/spec/misc_spec.rb CHANGED
@@ -8,4 +8,4 @@ describe String do
  'test'.to_kmers(5).should == []
  'test'.to_kmers(0).should == []
  end
- end
+ end