lederhosen 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -4,4 +4,4 @@ gem 'thor'
4
4
  gem 'rspec'
5
5
  gem 'dna'
6
6
  gem 'progressbar'
7
- gem 'awesome_print'
7
+ gem 'awesome_print'
data/bin/lederhosen CHANGED
@@ -4,4 +4,4 @@ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'lederho
4
4
 
5
5
  fail "you need to install uclust and have it in your $PATH" if `which uclust` == ''
6
6
 
7
- Lederhosen::CLI.start
7
+ Lederhosen::CLI.start
@@ -0,0 +1,51 @@
1
+ #!/bin/bash
2
+
3
+ set -e
4
+ set -x
5
+
6
+ # Hierarchical OTU clustering
7
+ # Austin G. Davis-Richardson
8
+ # <harekrishna at gmail dot com>
9
+ # http://github.com/audy/lederhosen
10
+
11
+ reads='sorted.fasta'
12
+ out='h_clustering'
13
+
14
+ mkdir -p "$out"
15
+
16
+ # initial clustering at 80%
17
+ lederhosen cluster --input="$reads" --output="$out/clusters_0.80.uc" --identity=0.80
18
+
19
+ # filter UC file
20
+ lederhosen uc_filter --input="$out/clusters_0.80.uc" --output="$out/clusters_0.80.uc.filtered" --reads=1 --samples=1
21
+
22
+ # get reads for each cluster
23
+ mkdir -p "$out/split_80"
24
+ lederhosen split --clusters="$out/clusters_0.80.uc.filtered" --reads="$reads" --out-dir="$out/split_80/"
25
+
26
+ # now cluster each of those at 90%
27
+ for fasta in "$out"/split_80/*.fasta
28
+ do
29
+
30
+ # sort (awww, do I really have to do this again?)
31
+ lederhosen sort --input="$fasta" --output="$fasta.sorted"
32
+
33
+ # cluster
34
+ lederhosen cluster --input="$fasta.sorted" --output="$fasta.uc" --identity=0.90
35
+ # split (create the output directory first, as the other split steps do)
36
+ split="$out/split_80.90_$(basename "$fasta" .fasta)"
37
+ mkdir -p "$split"
38
+ lederhosen split --clusters="$fasta.uc" --reads="$fasta" --out-dir="$split"
39
+ done
40
+
41
+ # Do it again at 95%, on the reads of every 90% cluster written above
42
+ for fasta in "$out"/split_80.90_*/*.fasta
43
+ do
44
+ # sort, then cluster (identity is a fraction, like 0.80 and 0.90 above)
45
+ lederhosen sort --input="$fasta" --output="$fasta.sorted"
46
+ lederhosen cluster --input="$fasta.sorted" --output="$fasta.uc" --identity=0.95
47
+ # split into a directory next to the 90% cluster's reads
48
+ split="$(dirname "$fasta")/split_95_$(basename "$fasta" .fasta)"
49
+ mkdir -p "$split"
50
+ lederhosen split --clusters="$fasta.uc" --reads="$fasta" --out-dir="$split"
51
+ done
data/examples/pipeline.sh CHANGED
@@ -68,4 +68,4 @@ do
68
68
  --database=$taxcollector
69
69
  done
70
70
 
71
- echo "complete!"
71
+ echo "complete!"
data/lederhosen.gemspec CHANGED
@@ -25,4 +25,4 @@ Gem::Specification.new do |s|
25
25
  s.add_dependency('progressbar')
26
26
  s.add_dependency('bundler')
27
27
  s.add_dependency('awesome_print')
28
- end
28
+ end
@@ -48,7 +48,7 @@ module Lederhosen
48
48
  end
49
49
  @buffer = Hash.new { |h, k| h[k] = Array.new }
50
50
  end
51
-
51
+
52
52
  end
53
53
 
54
- end
54
+ end
@@ -17,8 +17,13 @@ module Lederhosen
17
17
  cutoff = args[:cutoff] || 20
18
18
  min_length = args[:min_length] || 70
19
19
 
20
- left_handle = File.open left
21
- right_handle = File.open right
20
+ left_handle, right_handle =
21
+ begin
22
+ [ Zlib::GzipReader.open(left), Zlib::GzipReader.open(right)]
23
+ rescue Zlib::GzipFile::Error
24
+ [ File.open(left), File.open(right) ]
25
+ end
26
+
22
27
  out_handle = File.open out, 'w'
23
28
 
24
29
  left_reads = Dna.new left_handle
@@ -57,9 +62,9 @@ module Lederhosen
57
62
  min = args[:min] || 20
58
63
  offset = args[:cutoff] || 64
59
64
 
60
- _sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
61
-
62
- dna.quality.each_byte.each_with_index do |b, a|
65
+ _sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
66
+
67
+ dna.quality.each_byte.each_with_index do |b, a|
63
68
  _sum += (b - offset - min)
64
69
  if _sum > _max
65
70
  _max = _sum
@@ -3,84 +3,84 @@
3
3
  #
4
4
 
5
5
  module Lederhosen
6
- class CLI
7
-
8
- desc "add_names",
9
- "add names to otu abundance matrix using blat output"
10
-
11
- method_option :blat, :type => :string, :required => true
12
- method_option :table, :type => :string, :required => true
13
- method_option :level, :type => :string, :required => true
14
- method_option :output, :type => :string, :required => false
15
-
16
- def add_names
17
- blat = options[:blat]
18
- table = options[:table]
19
- level = options[:level]
20
- output = options[:output] || $stdout
21
-
22
- levels = { 'kingdom' => 0,
23
- 'domain' => 0,
24
- 'phylum' => 1,
25
- 'class' => 2,
26
- 'order' => 3,
27
- 'family' => 4,
28
- 'genus' => 5,
29
- 'species' => 6 }
30
-
31
- fail "unknown level. try #{levels.keys.join(', ')}" unless levels.include? level
32
-
33
- # Corresponds with the numbers used in the TaxCollector database
34
- # taxonomic descriptions
35
- level_no = levels[level]
36
-
37
- # map cluster_id to taxonomic description
38
- # default is the cluster_id itself in case
39
- # the cluster was not classified.
40
- clusterid_to_name = Hash.new { |h, k| h[k] = k }
41
-
42
- # map clusterid to name using blat output
43
- ohai "loading BLAT output from #{blat}"
44
- File.open(blat) do |handle|
45
- handle.each do |line|
46
- line = line.strip.split
47
-
48
- # Only get first match
49
- # TODO something smarter here
50
- cluster_id = line[0].split(':')[3]
51
- next if clusterid_to_name.include? cluster_id
52
-
53
- taxonomic_description = line[1]
54
-
55
- # match by level_no
56
- # Example:
57
- # [0]Bacteria;[1]Actinobacteria;[2]Actinobacteria;[3]Acidimicrobiales;[4]Acidimicrobiaceae;[5]Acidimicrobium;[6]Acidimicrobium_ferrooxidans;
58
- # I want to match Actinobacteria given level_no = 2
59
- level_name = taxonomic_description.match(/\[#{level_no}\](\w*)[;\[]/)[1] rescue next
60
-
61
- clusterid_to_name[cluster_id] = level_name
62
- end
63
- end
64
-
65
- # load table, replace cluster names with taxonomic descriptions
66
- output = File.open(output, 'w') unless output == $stdout
67
- ohai "replacing names in #{table}"
68
- File.open(table) do |handle|
69
-
70
- # read in header, replace clusterids to names
71
- header = handle.gets.strip.split(',')
72
- header[1..-1] = header[1..-1].map { |x| clusterid_to_name[x] }
73
-
74
- # print new header
75
- output.puts header.join(',')
76
-
77
- # print rest of table
78
- handle.each { |l| output.print l }
79
- end
80
-
81
- # print status message
82
- ohai "Got #{clusterid_to_name.keys.reject { |x| x =~ /cluster/ }.size} names (#{clusterid_to_name.keys.size} total)"
83
- end
84
-
85
- end
6
+ class CLI
7
+
8
+ desc "add_names",
9
+ "add names to otu abundance matrix using blat output"
10
+
11
+ method_option :blat, :type => :string, :required => true
12
+ method_option :table, :type => :string, :required => true
13
+ method_option :level, :type => :string, :required => true
14
+ method_option :output, :type => :string, :required => false
15
+
16
+ def add_names
17
+ blat = options[:blat]
18
+ table = options[:table]
19
+ level = options[:level]
20
+ output = options[:output] || $stdout
21
+
22
+ levels = { 'kingdom' => 0,
23
+ 'domain' => 0,
24
+ 'phylum' => 1,
25
+ 'class' => 2,
26
+ 'order' => 3,
27
+ 'family' => 4,
28
+ 'genus' => 5,
29
+ 'species' => 6 }
30
+
31
+ fail "unknown level. try #{levels.keys.join(', ')}" unless levels.include? level
32
+
33
+ # Corresponds with the numbers used in the TaxCollector database
34
+ # taxonomic descriptions
35
+ level_no = levels[level]
36
+
37
+ # map cluster_id to taxonomic description
38
+ # default is the cluster_id itself in case
39
+ # the cluster was not classified.
40
+ clusterid_to_name = Hash.new { |h, k| h[k] = k }
41
+
42
+ # map clusterid to name using blat output
43
+ ohai "loading BLAT output from #{blat}"
44
+ File.open(blat) do |handle|
45
+ handle.each do |line|
46
+ line = line.strip.split
47
+
48
+ # Only get first match
49
+ # TODO something smarter here
50
+ cluster_id = line[0].split(':')[3]
51
+ next if clusterid_to_name.include? cluster_id
52
+
53
+ taxonomic_description = line[1]
54
+
55
+ # match by level_no
56
+ # Example:
57
+ # [0]Bacteria;[1]Actinobacteria;[2]Actinobacteria;[3]Acidimicrobiales;[4]Acidimicrobiaceae;[5]Acidimicrobium;[6]Acidimicrobium_ferrooxidans;
58
+ # I want to match Actinobacteria given level_no = 2
59
+ level_name = taxonomic_description.match(/\[#{level_no}\](\w*)[;\[]/)[1] rescue next
60
+
61
+ clusterid_to_name[cluster_id] = level_name
62
+ end
63
+ end
64
+
65
+ # load table, replace cluster names with taxonomic descriptions
66
+ output = File.open(output, 'w') unless output == $stdout
67
+ ohai "replacing names in #{table}"
68
+ File.open(table) do |handle|
69
+
70
+ # read in header, replace clusterids to names
71
+ header = handle.gets.strip.split(',')
72
+ header[1..-1] = header[1..-1].map { |x| clusterid_to_name[x] }
73
+
74
+ # print new header
75
+ output.puts header.join(',')
76
+
77
+ # print rest of table
78
+ handle.each { |l| output.print l }
79
+ end
80
+
81
+ # print status message
82
+ ohai "Got #{clusterid_to_name.keys.reject { |x| x =~ /cluster/ }.size} names (#{clusterid_to_name.keys.size} total)"
83
+ end
84
+
85
+ end
86
86
  end
@@ -16,7 +16,7 @@ module Lederhosen
16
16
  identity = options[:identity]
17
17
  output = options[:output]
18
18
  input = options[:input]
19
-
19
+
20
20
  cmd = [
21
21
  'uclust',
22
22
  "--input #{input}",
@@ -25,9 +25,9 @@ module Lederhosen
25
25
  '-t=dna',
26
26
  '-q=dna',
27
27
  '-out=blast8',
28
- output
28
+ output
29
29
  ]
30
-
30
+
31
31
  exec cmd.join(' ')
32
32
 
33
33
  end
@@ -1,45 +1,45 @@
1
1
  module Lederhosen
2
- class CLI
3
-
4
- desc 'otu_filter', 'works like uc_filter but uses an OTU table as input'
5
-
6
- method_option :input, :type => :string, :required => true
7
- method_option :output, :type => :string, :required => true
8
- method_option :reads, :type => :numeric, :required => true
9
- method_option :samples, :type => :numeric, :required => true
10
-
11
- def otu_filter
12
- input = options[:input]
13
- output = options[:output]
14
- reads = options[:reads]
15
- samples = options[:samples]
16
-
17
- ##
18
- # Iterate over otu table line by line.
19
- # Only print if cluster meets criteria
20
- #
21
- kept = 0
22
- File.open(input) do |handle|
23
- header = handle.gets.strip
24
- header = header.split(',')
25
- samples = header[1..-1]
26
-
27
- puts header.join(',')
28
-
29
- handle.each do |line|
30
- line = line.strip.split(',')
31
- cluster_no = line[0]
32
- counts = line[1..-1].collect { |x| x.to_i }
33
-
34
- # should be the same as uc_filter
35
- if counts.reject { |x| x < reads }.length > samples
36
- puts line.join(',')
37
- kept += 1
38
- end
39
- end
40
- end
41
- ohai "kept #{kept} clusters."
42
- end
43
-
44
- end
2
+ class CLI
3
+
4
+ desc 'otu_filter', 'works like uc_filter but uses an OTU table as input'
5
+
6
+ method_option :input, :type => :string, :required => true
7
+ method_option :output, :type => :string, :required => true
8
+ method_option :reads, :type => :numeric, :required => true
9
+ method_option :samples, :type => :numeric, :required => true
10
+
11
+ def otu_filter
12
+ input = options[:input]
13
+ output = options[:output]
14
+ reads = options[:reads]
15
+ samples = options[:samples]
16
+
17
+ ##
18
+ # Iterate over otu table line by line.
19
+ # Only print if cluster meets criteria
20
+ #
21
+ kept = 0
22
+ File.open(input) do |handle|
23
+ header = handle.gets.strip
24
+ header = header.split(',')
25
+ samples = header[1..-1]
26
+
27
+ puts header.join(',')
28
+
29
+ handle.each do |line|
30
+ line = line.strip.split(',')
31
+ cluster_no = line[0]
32
+ counts = line[1..-1].collect { |x| x.to_i }
33
+
34
+ # should be the same as uc_filter
35
+ if counts.reject { |x| x < reads }.length > samples
36
+ puts line.join(',')
37
+ kept += 1
38
+ end
39
+ end
40
+ end
41
+ ohai "kept #{kept} clusters."
42
+ end
43
+
44
+ end
45
45
  end
@@ -17,32 +17,32 @@ module Lederhosen
17
17
  input = options[:clusters]
18
18
  output = options[:output]
19
19
  joined_reads = options[:joined]
20
-
20
+
21
21
  # Load cluster table
22
22
 
23
- clstr_info = Helpers.load_uc_file input
23
+ clstr_info = Helpers.load_uc_file input
24
24
  clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
25
25
  clstrnr_to_seed = clstr_info[:clstrnr_to_seed]
26
26
  samples = clstr_info[:samples]
27
27
 
28
28
  # print OTU abundance matrix
29
- # clusters as columns
30
- # samples as rows
29
+ # clusters as columns
30
+ # samples as rows
31
31
 
32
32
  File.open("#{output}.csv", 'w') do |h|
33
33
  samples = samples.sort
34
34
  clusters = clstr_counts.keys
35
35
 
36
36
  # print header (cluster names)
37
- h.puts '-' + SEP + clusters.map { |x| "cluster-#{x}" }.join(SEP)
38
-
39
- samples.each do |sample|
40
- h.print sample
41
- clusters.each do |cluster|
42
- h.print "#{SEP}#{clstr_counts[cluster][sample]}"
43
- end
44
- h.print "\n"
45
- end
37
+ h.puts '-' + SEP + clusters.map { |x| "cluster-#{x}" }.join(SEP)
38
+
39
+ samples.each do |sample|
40
+ h.print sample
41
+ clusters.each do |cluster|
42
+ h.print "#{SEP}#{clstr_counts[cluster][sample]}"
43
+ end
44
+ h.print "\n"
45
+ end
46
46
  end
47
47
  end
48
48
 
@@ -19,7 +19,7 @@ module Lederhosen
19
19
 
20
20
 
21
21
  # Load cluster table!
22
- clstr_info = Helpers.load_uc_file input
22
+ clstr_info = Helpers.load_uc_file input
23
23
  clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
24
24
  seed_to_clstrnr = clstr_info[:seed_to_clstrnr]
25
25
  samples = clstr_info[:samples]
@@ -36,7 +36,7 @@ module Lederhosen
36
36
  end
37
37
  end
38
38
  end
39
-
39
+
40
40
  out_handle.close
41
41
  end
42
42
 
@@ -3,46 +3,46 @@
3
3
  #
4
4
 
5
5
  module Lederhosen
6
- class CLI
7
-
8
- desc 'squish', 'merge cell values (reads) in a csv file by column name (cluster)'
9
-
10
- method_option :csv_file, :type => :string, :required => true
11
- method_option :output, :type => :string, :required => false
12
-
13
- def squish
14
- csv_file = options[:csv_file]
15
- output = options[:output] || $stdout
16
-
17
- # sample_name -> column name -> total number of reads
18
- total_by_sample_by_column = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
19
- column_names = '' # scope
20
- # Load CSV file, merge counts in columns with the same name
21
- File.open(csv_file) do |handle|
22
- column_names = handle.gets.strip.split(',')[1..-1]
23
- handle.each do |line|
24
- line = line.strip.split(',')
25
- sample = line[0]
26
- line[1..-1].zip(column_names) do |reads, column_name|
27
- total_by_sample_by_column[sample][column_name] += reads.to_i
28
- end
29
- end
30
- end
31
-
32
- output = File.open(output, 'w') rescue $stdout
33
-
34
- # print the new, squished csv file
35
- column_names.uniq!.sort!
36
- output.puts "-,#{column_names.join(',')}"
37
- total_by_sample_by_column.each_pair do |sample_id, row|
38
- output.print "#{sample_id}"
39
- column_names.each do |column_name|
40
- output.print ",#{row[column_name]}"
41
- end
42
- output.print "\n"
43
- end
44
-
45
- output.close
46
- end
47
- end
6
+ class CLI
7
+
8
+ desc 'squish', 'merge cell values (reads) in a csv file by column name (cluster)'
9
+
10
+ method_option :csv_file, :type => :string, :required => true
11
+ method_option :output, :type => :string, :required => false
12
+
13
+ def squish
14
+ csv_file = options[:csv_file]
15
+ output = options[:output] || $stdout
16
+
17
+ # sample_name -> column name -> total number of reads
18
+ total_by_sample_by_column = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
19
+ column_names = '' # scope
20
+ # Load CSV file, merge counts in columns with the same name
21
+ File.open(csv_file) do |handle|
22
+ column_names = handle.gets.strip.split(',')[1..-1]
23
+ handle.each do |line|
24
+ line = line.strip.split(',')
25
+ sample = line[0]
26
+ line[1..-1].zip(column_names) do |reads, column_name|
27
+ total_by_sample_by_column[sample][column_name] += reads.to_i
28
+ end
29
+ end
30
+ end
31
+
32
+ output = File.open(output, 'w') rescue $stdout
33
+
34
+ # print the new, squished csv file
35
+ column_names.uniq!.sort!
36
+ output.puts "-,#{column_names.join(',')}"
37
+ total_by_sample_by_column.each_pair do |sample_id, row|
38
+ output.print "#{sample_id}"
39
+ column_names.each do |column_name|
40
+ output.print ",#{row[column_name]}"
41
+ end
42
+ output.print "\n"
43
+ end
44
+
45
+ output.close
46
+ end
47
+ end
48
48
  end
data/lib/lederhosen.rb CHANGED
@@ -4,6 +4,7 @@ require 'dna'
4
4
  require 'set'
5
5
  require 'progressbar'
6
6
  require 'awesome_print'
7
+ require 'zlib'
7
8
 
8
9
  Dir.glob(File.join(File.dirname(__FILE__), 'lederhosen', '*.rb')).each { |f| require f }
9
10
 
@@ -13,4 +14,4 @@ class String
13
14
  k -= 1
14
15
  (0..(self.length-k-1)).collect { |i| self[i..i+k] }
15
16
  end
16
- end
17
+ end
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Lederhosen
2
- VERSION = '0.1.6'
2
+ VERSION = '0.1.7'
3
3
  end
data/readme.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Lederhosen
2
2
 
3
- Cluster raw Illumina 16S rRNA amplicon data to generate OTUs. Use at your own risk.
3
+ Cluster raw Illumina 16S rRNA amplicon data to generate OTUs.
4
4
 
5
5
  ## How do I get Lederhosen?
6
6
 
@@ -33,9 +33,9 @@ Lederhosen is invoked by typing `lederhosen [TASK]`
33
33
 
34
34
  ### trim
35
35
 
36
- Trim (Illumina) reads using quality scores. Output will be a directory of fasta files.
36
+ Trim (Illumina) reads using quality scores. Output will be a directory of fasta files. Reads can optionally be gzipped.
37
37
 
38
- lederhosen trim --reads_dir=reads/* --out_dir=trimmed/
38
+ lederhosen trim --reads_dir=reads/*.txt --out_dir=trimmed/
39
39
 
40
40
  ### join
41
41
 
@@ -43,6 +43,10 @@ Join paired reads from all samples end-to-end. This method enables the use of uc
43
43
 
44
44
  lederhosen join --trimmed=trimmed/*.fasta --output=joined.fasta
45
45
 
46
+ If your reads are not paired, then you do not need to do this step. Instead, concatenate all of the trimmed reads files.
47
+
48
+ cat trimmed/*.fasta > joined.fasta
49
+
46
50
  ### sort
47
51
 
48
52
  Sort reads by length. This is a requirement for uclust's single-linkage clustering algorithm.
Binary file
Binary file
Binary file
Binary file
data/spec/helpers_spec.rb CHANGED
@@ -1,26 +1,26 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe Lederhosen::Helpers do
4
-
5
- let (:groups) { Lederhosen::Helpers.get_grouped_qseq_files('spec/data/IL*.txt') }
6
-
4
+
5
+ let (:groups) { Lederhosen::Helpers.get_grouped_qseq_files('spec/data/IL*.txt.gz') }
6
+
7
7
  it 'should have a method for grouping QSEQ files' do
8
8
  groups.length.should == 2
9
9
  end
10
10
 
11
11
  it 'should have a method for trimming sequences' do
12
12
  reads = groups.values.first.first
13
- record = File.open(reads) do |handle|
13
+ record = Zlib::GzipReader.open(reads) do |handle|
14
14
  Dna.new(handle).first
15
15
  end
16
16
  # I should probably test with a bad read
17
- Lederhosen::Helpers.trim(record).length.should == 79
17
+ Lederhosen::Helpers.trim(record).length.should == 58
18
18
  end
19
19
 
20
20
  it 'should be able to trim pairs of qseq files, outputting fasta file' do
21
21
  reads = groups.values.first
22
- Lederhosen::Helpers.trim_pairs reads[0], reads[1], '/tmp/munchen_trim_test.fasta'
22
+ Lederhosen::Helpers.trim_pairs reads[0], reads[1], "#{$test_dir}/munchen_trim_test.fasta"
23
23
  # this test will break if trim parameters change
24
- File.read('/tmp/munchen_trim_test.fasta').grep(/^>/).length.should be_even
24
+ File.read("#{$test_dir}/munchen_trim_test.fasta").grep(/^>/).length.should be_even
25
25
  end
26
26
  end
data/spec/misc_spec.rb CHANGED
@@ -8,4 +8,4 @@ describe String do
8
8
  'test'.to_kmers(5).should == []
9
9
  'test'.to_kmers(0).should == []
10
10
  end
11
- end
11
+ end