lederhosen 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- #!/bin/bash
1
+ #!/bash
2
2
 
3
3
  # An example OTU clustering pipeline
4
4
  # Austin G. Davis-Richardson
@@ -14,55 +14,55 @@ min_reads=50
14
14
  min_samples=10
15
15
 
16
16
  # trim reads
17
- bin/lederhosen trim \
17
+ lederhosen trim \
18
18
  --reads-dir=$raw_reads \
19
19
  --out-dir=$out_dir/trimmed
20
20
 
21
21
  # join reads
22
- bin/lederhosen join \
22
+ lederhosen join \
23
23
  --trimmed=$out_dir/trimmed/*.fasta \
24
24
  --output=$out_dir/joined.fasta
25
25
 
26
26
  # filter reads
27
- bin/lederhosen k_filter \
27
+ lederhosen k_filter \
28
28
  --input=$out_dir/joined.fasta \
29
29
  --output=$out_dir/filtered.fasta \
30
30
  -k=10 \
31
31
  --cutoff=50
32
32
 
33
33
  # sort
34
- bin/lederhosen sort \
34
+ lederhosen sort \
35
35
  --input=$out_dir/filtered.fasta \
36
36
  --output=$out_dir/sorted.fasta
37
37
 
38
38
  for i in 0.80 0.90 0.95
39
39
  do
40
40
  # cluster
41
- bin/lederhosen cluster \
41
+ lederhosen cluster \
42
42
  --input=$out_dir/sorted.fasta \
43
43
  --output=$out_dir/clusters_"$i".uc \
44
44
  --identity=$i
45
45
 
46
46
  # filter uc file
47
- bin/lederhosen uc_filter \
47
+ lederhosen uc_filter \
48
48
  --input=$out_dir/clusters_"$i".uc \
49
49
  --output=$out_dir/clusters_"$i".uc.filtered \
50
50
  --reads=$min_reads \
51
51
  --samples=$min_samples \
52
52
 
53
53
  # generate otu table
54
- bin/lederhosen otu_table \
54
+ lederhosen otu_table \
55
55
  --clusters=$out_dir/clusters_"$i".uc.filtered \
56
56
  --output=$out_dir/otus_"$i"
57
57
 
58
58
  # get representative reads
59
- bin/lederhosen rep_reads \
59
+ lederhosen rep_reads \
60
60
  --clusters=$out_dir/clusters_"$i".uc.filtered \
61
61
  --joined=$out_dir/sorted.fasta \
62
62
  --output=$out_dir/representatives_"$i".fasta
63
63
 
64
64
  # blast representative reads
65
- bin/lederhosen name \
65
+ lederhosen name \
66
66
  --reps=$out_dir/representatives_"$i".fasta \
67
67
  --output=$out_dir/taxonomies_"$i".txt \
68
68
  --database=$taxcollector
@@ -0,0 +1,86 @@
##
# ADD TAXONOMIC DESCRIPTIONS TO OTU TABLE
#

module Lederhosen
  class CLI

    desc "add_names",
         "--blat=blat_output.txt --table=cluster_table.csv --level=genus --output=named_table.csv"

    method_option :blat,   :type => :string, :required => true
    method_option :table,  :type => :string, :required => true
    method_option :level,  :type => :string, :required => true
    method_option :output, :type => :string, :required => false

    ##
    # Replace the cluster ids in the header row of an OTU table with the
    # taxonomic name found for each cluster in a BLAT output file.
    #
    # Options:
    #   :blat   - path to the BLAT output. The first whitespace-separated
    #             column is split on ':' and its 4th field is used as the
    #             cluster id; the second column is the taxonomic description.
    #   :table  - path to the comma-separated table whose header is rewritten.
    #   :level  - taxonomic level name (one of the keys of +levels+ below).
    #   :output - path to write the renamed table to (default: $stdout).
    #
    # Fails with a RuntimeError when the level is unknown or the table is
    # empty. Clusters without a usable hit keep their original id.
    #
    def add_names
      blat   = options[:blat]
      table  = options[:table]
      level  = options[:level]
      output = options[:output] || $stdout

      levels = { 'kingdom' => 0,
                 'domain'  => 0,
                 'phylum'  => 1,
                 'class'   => 2,
                 'order'   => 3,
                 'family'  => 4,
                 'genus'   => 5,
                 'species' => 6 }

      fail "unknown level. try #{levels.keys.join(', ')}" unless levels.include? level

      # Corresponds with the numbers used in the TaxCollector database
      # taxonomic descriptions
      level_no = levels[level]

      # map cluster_id to taxonomic description
      # default is the cluster_id itself in case
      # the cluster was not classified.
      clusterid_to_name = Hash.new { |h, k| h[k] = k }

      # map clusterid to name using blat output
      ohai "loading BLAT output from #{blat}"
      File.open(blat) do |handle|
        handle.each do |line|
          fields = line.strip.split

          # skip blank or truncated lines instead of crashing on them
          next if fields.size < 2

          # Only get first match
          # TODO something smarter here
          cluster_id = fields[0].split(':')[3]
          next if cluster_id.nil? || clusterid_to_name.include?(cluster_id)

          # match by level_no
          # Example:
          # [0]Bacteria;[1]Actinobacteria;[2]Actinobacteria;[3]Acidimicrobiales;[4]Acidimicrobiaceae;[5]Acidimicrobium;[6]Acidimicrobium_ferrooxidans;
          # I want to match Actinobacteria given level_no = 2
          match = fields[1].match(/\[#{level_no}\](\w*)[;\[]/)

          # no description at this level: leave the cluster unnamed so a
          # later hit for the same cluster can still name it
          next unless match

          clusterid_to_name[cluster_id] = match[1]
        end
      end

      # load table, replace cluster names with taxonomic descriptions
      out = output == $stdout ? $stdout : File.open(output, 'w')
      begin
        ohai "replacing names in #{table}"
        File.open(table) do |handle|
          header_line = handle.gets
          fail "#{table} is empty" if header_line.nil?

          # read in header, replace clusterids to names
          # (the first column is the row label and is left untouched)
          first, *cluster_ids = header_line.strip.split(',')
          header = [first] + cluster_ids.map { |x| clusterid_to_name[x] }

          # print new header
          out.puts header.join(',')

          # print rest of table
          handle.each { |l| out.print l }
        end
      ensure
        # never close $stdout, but always release a file we opened
        out.close unless out == $stdout
      end

      # print status message
      ohai "Got #{clusterid_to_name.keys.reject { |x| x =~ /cluster/ }.size} names (#{clusterid_to_name.keys.size} total)"
    end

  end
end
@@ -5,7 +5,7 @@
5
5
  module Lederhosen
6
6
  class CLI
7
7
 
8
- desc "cluster fasta file",
8
+ desc "cluster",
9
9
  "--input=sorted.fasta --identity=0.80 --output=clusters.uc"
10
10
 
11
11
  method_option :input, :type => :string, :required => true
@@ -0,0 +1,40 @@
##
# HIERARCHICAL CLUSTERING FTW
#

require 'fileutils'

module Lederhosen
  class CLI

    desc "h_cluster",
         "--input=sorted.fasta --out-dir=clusters/ --identities=0.80 0.90 0.95"

    method_option :input,      :type => :string, :required => true
    method_option :out_dir,    :type => :string, :required => true
    method_option :identities, :type => :array,  :required => true

    ##
    # Run the first round of clustering at the lowest requested identity:
    # cluster the input, filter the resulting uc file, then split the reads
    # by cluster.
    #
    # Options:
    #   :input      - fasta file handed to the cluster and split tasks.
    #   :out_dir    - directory for the uc files (created if missing).
    #   :identities - list of identities; they are converted to floats and
    #                 sorted, and only the smallest one is used here.
    #
    # Fails with a RuntimeError when no identity is given.
    #
    def h_cluster
      out_dir    = options[:out_dir]
      input      = options[:input]
      identities = options[:identities].map(&:to_f).sort

      fail "need at least one identity" if identities.empty?

      # FileUtils avoids passing the path through a shell
      FileUtils.mkdir_p(out_dir)

      # initial clustering
      # NOTE(review): the identities left after this shift are never used,
      # so only one level of clustering is performed -- confirm intent.
      i = identities.shift
      clusters          = File.join(out_dir, "clusters_#{i}.uc")
      clusters_filtered = File.join(out_dir, "clusters_#{i}.uc.filtered")

      # cluster
      invoke :cluster, [], { :input => input, :output => clusters, :identity => i }

      # filter
      # NOTE(review): no :reads / :samples thresholds are passed -- confirm
      # uc_filter copes with them being absent.
      invoke :uc_filter, [], { :input => clusters, :output => clusters_filtered }

      # get reads for each cluster
      # NOTE(review): the split task declares --out-dir as required but none
      # is passed here -- confirm where the per-cluster files should go.
      invoke :split, [], { :clusters => clusters_filtered, :reads => input }
    end

  end
end
@@ -4,7 +4,7 @@ module Lederhosen
4
4
  ##
5
5
  # PAIRED-END READ WORK-AROUND (JOIN THEM)
6
6
  #
7
- desc "join reads end-to-end",
7
+ desc "join",
8
8
  "--trimmed=trimmed/*.fasta --output=joined.fasta"
9
9
 
10
10
  method_option :trimmed, :type => :string, :required => true
@@ -5,7 +5,7 @@
5
5
  module Lederhosen
6
6
  class CLI
7
7
 
8
- desc "k_filter khmer filtering",
8
+ desc "k_filter",
9
9
  "--input=joined.fasta --output=filtered.fasta --k=10 --cutoff=50"
10
10
 
11
11
  method_option :input, :type => :string, :required => true
@@ -5,8 +5,8 @@
5
5
  module Lederhosen
6
6
  class CLI
7
7
 
8
- desc "name identify clusters in a taxcollector database",
9
- "--reps representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt"
8
+ desc "name",
9
+ "--reps --reps=representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt"
10
10
 
11
11
  method_option :reps, :type => :string, :required => true
12
12
  method_option :database, :type => :string, :required => true
@@ -7,7 +7,7 @@ SEP = ','
7
7
  module Lederhosen
8
8
  class CLI
9
9
 
10
- desc "otu_tables generates otu tables",
10
+ desc "otu_table",
11
11
  "--clusters=clusters.uc --output=otu_prefix"
12
12
 
13
13
  method_option :clusters, :type => :string, :required => true
@@ -5,7 +5,7 @@
5
5
  module Lederhosen
6
6
  class CLI
7
7
 
8
- desc "rep_reads extract representative reads for each cluster to a fasta file",
8
+ desc "rep_reads",
9
9
  "--clusters=clusters.uc --joined=joined.fasta --output=representative_reads.fasta"
10
10
 
11
11
  method_option :clusters, :type => :string, :required => true
@@ -5,7 +5,7 @@
5
5
  module Lederhosen
6
6
  class CLI
7
7
 
8
- desc "sort fasta file by length",
8
+ desc "sort",
9
9
  "--input=joined.fasta --output=sorted.fasta"
10
10
 
11
11
  method_option :input, :type => :string, :required => true
@@ -5,14 +5,14 @@
5
5
  module Lederhosen
6
6
  class CLI
7
7
 
8
- desc "output separate fasta file containing sequences belonging to each cluster",
9
- "--clusters=clusters.uc --reads=joined.fasta --min-clst-size=100"
8
+ desc "split",
9
+ "--clusters=clusters.uc --reads=joined.fasta --min-clst-size=1 --out-dir=output_directory"
10
10
 
11
11
  method_option :clusters, :type => :string, :required => true
12
12
  method_option :reads, :type => :string, :required => true
13
13
  method_option :out_dir, :type => :string, :required => true
14
14
  method_option :buffer_size, :type => :numeric, :default => 1000
15
- method_option :min_clst_size, :type => :numeric, :default => 100
15
+ method_option :min_clst_size, :type => :numeric, :default => 1
16
16
 
17
17
  def split
18
18
  clusters = options[:clusters]
@@ -0,0 +1,48 @@
##
# SQUISH A CSV FILE BY COLUMN NAME
#

module Lederhosen
  class CLI

    desc 'squish', 'merge cell values (reads) in a csv file by column name (cluster)'

    method_option :csv_file, :type => :string, :required => true
    method_option :output,   :type => :string, :required => false

    ##
    # Merge columns of a comma-separated table that share the same name by
    # summing their (integer) cell values per row.
    #
    # Options:
    #   :csv_file - input table; the first row is the header and the first
    #               column of every row is the sample name.
    #   :output   - path to write the squished table to (default: $stdout).
    #
    # The output header is "-" followed by the sorted, de-duplicated column
    # names; each following row is the sample name and its summed counts.
    # Fails with a RuntimeError when the input file is empty.
    #
    def squish
      csv_file = options[:csv_file]
      output   = options[:output] || $stdout

      # sample_name -> column name -> total number of reads
      total_by_sample_by_column = Hash.new { |by_sample, sample| by_sample[sample] = Hash.new(0) }
      column_names = []

      # Load CSV file, merge counts in columns with the same name
      File.open(csv_file) do |handle|
        header = handle.gets
        fail "#{csv_file} is empty" if header.nil?

        column_names = header.strip.split(',')[1..-1] || []

        handle.each do |line|
          fields = line.strip.split(',')
          next if fields.empty? # ignore blank lines

          sample = fields[0]
          fields[1..-1].zip(column_names) do |reads, column_name|
            total_by_sample_by_column[sample][column_name] += reads.to_i
          end
        end
      end

      # uniq!/sort! cannot be chained: uniq! returns nil when nothing changed
      column_names = column_names.uniq.sort

      out = output == $stdout ? $stdout : File.open(output, 'w')
      begin
        # print the new, squished csv file
        out.puts "-,#{column_names.join(',')}"
        total_by_sample_by_column.each_pair do |sample_id, row|
          counts = column_names.map { |column_name| row[column_name] }
          out.puts([sample_id, *counts].join(','))
        end
      ensure
        # never close $stdout, but always release a file we opened
        out.close unless out == $stdout
      end
    end
  end
end
@@ -5,7 +5,7 @@
5
5
  module Lederhosen
6
6
  class CLI
7
7
 
8
- desc "trim Illumina QSEQ files",
8
+ desc "trim",
9
9
  "--reads_dir=reads/* --out_dir=trimmed.fasta"
10
10
 
11
11
  method_option :reads_dir, :type => :string, :required => true
@@ -5,7 +5,7 @@
5
5
  module Lederhosen
6
6
  class CLI
7
7
 
8
- desc "uc_filter filter uc file by min samples",
8
+ desc "uc_filter",
9
9
  "--input=clusters.uc --output=clusters.uc.filtered --reads=50 --samples=10"
10
10
 
11
11
  method_option :input, :type => :string, :required => true
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Lederhosen
2
- VERSION = '0.1.2'
2
+ VERSION = '0.1.3'
3
3
  end
data/readme.md CHANGED
@@ -12,12 +12,6 @@ Cluster raw Illumina 16S rRNA amplicon data to generate OTUs. Use at your own ri
12
12
  `sudo gem install lederhosen`
13
13
  4. Check installation by typing `lederhosen`. You should see some help text.
14
14
 
15
- ## How do I use Lederhosen?
16
-
17
- Type `lederhosen help` for complete instructions
18
-
19
- See pipeline.sh for example usage.
20
-
21
15
  ## Features
22
16
 
23
17
  - Sequence trimming (paired-end Illumina).
@@ -27,4 +21,86 @@ See pipeline.sh for example usage.
27
21
  - Separation of representative reads.
28
22
  - Separation of all reads belonging to each cluster.
29
23
  - Identification of clusters using TaxCollector.
30
- - Generation of OTU abundancy matrices.
24
+ - Generation of OTU abundance matrices.
25
+
26
+ ## How do I use Lederhosen?
27
+
28
+ Lederhosen is just a convenient wrapper for UCLUST and BLAT with some scripts for quality filtering, de-noising of data as well as creation of nice tables. It is similar to QIIME but meant for paired-end Illumina data rather than single-end 454. The basic lederhosen pipeline consists of: trimming, joining, sorting, filtering, clustering, more filtering, and output generation (OTU tables, representative reads, reads by cluster, and taxonomic descriptions for clusters). See the example pipeline in `pipeline.sh`.
29
+
30
+ ## Tasks
31
+
32
+ Lederhosen is invoked by typing `lederhosen [TASK]`
33
+
34
+ ### trim
35
+
36
+ Trim (Illumina) reads using quality scores. Output will be a directory of fasta files.
37
+
38
+ lederhosen trim --reads_dir=reads/* --out_dir=trimmed/
39
+
40
+ ### join
41
+
42
+ Join paired reads from all samples end-to-end. This method enables the use of uclust with paired-end data. Output will be a single fasta file.
43
+
44
+ lederhosen join --trimmed=trimmed/*.fasta --output=joined.fasta
45
+
46
+ ### sort
47
+
48
+ Sort reads by length. This is a requirement for uclust's single-linkage clustering algorithm.
49
+
50
+ lederhosen sort --input=joined.fasta --output=sorted.fasta
51
+
52
+ ### k_filter
53
+
54
+ K-mer abundance noise filtering. This step is experimental and optional. It may reduce the time it takes to perform the clustering.
55
+
56
+ lederhosen k_filter --input=joined.fasta --output=filtered.fasta --k=10 --cutoff=50
57
+
58
+ ### cluster
59
+
60
+ Cluster reads using UCLUST. Output is a uc file.
61
+
62
+ lederhosen cluster --input=sorted.fasta --identity=0.80 --output=clusters.uc
63
+
64
+ ### uc_filter
65
+
66
+ Filter UC file removing singleton clusters or clusters that are only present in a few samples. This greatly reduces the noise of the data without removing many of the reads.
67
+
68
+ lederhosen uc_filter --input=clusters.uc --output=clusters.uc.filtered --reads=50 --samples=10
69
+
70
+ ### otu_table
71
+
72
+ Create an OTU abundance table where rows are samples and columns are clusters. The entries are the number of reads for that cluster in a sample.
73
+
74
+ lederhosen otu_table --clusters=clusters.uc --output=otu_prefix.csv
75
+
76
+ ### rep_reads
77
+
78
+ Get representative reads for each cluster. Output is a single fasta file.
79
+
80
+ lederhosen rep_reads --clusters=clusters.uc --joined=joined.fasta --output=representative_reads.fasta
81
+
82
+ ### split
83
+
84
+ Get all reads belonging to each cluster. Output is a directory containing a fasta file for each cluster. The fasta file contains the joined reads.
85
+
86
+ lederhosen split --clusters=clusters.uc --reads=joined.fasta --out-dir=output_directory --min-clst-size=100
87
+
88
+ ### name
89
+
90
+ Identify clusters in a database using the representative reads. This is a simple wrapper for BLAT. The output is a tab-delimited file similar to a BLAST output file. For this step you need to have BLAT installed and also a [TaxCollector](http://github.com/audy/taxcollector) database.
91
+
92
+ lederhosen name --reps=representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt
93
+
94
+ ### add_names
95
+
96
+ Add phylogenetic classification of clusters to OTU abundance file.
97
+
98
+ lederhosen add_names --blat=blat_output.txt --level=taxonomic_level --table=otu_file.csv --output=named_out_file.csv
99
+
100
+ Where `taxonomic_level` can be: kingdom, domain, phylum, class, order, family, genus or species. This method only works with a TaxCollector database.
101
+
102
+ ### squish
103
+
104
+ Squish an OTU abundance file by column name (phylogenetic description)
105
+
106
+ lederhosen squish --csv-file=named_out_file.csv --output=squished_named_out_file.csv
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lederhosen
3
3
  version: !ruby/object:Gem::Version
4
- hash: 31
4
+ hash: 29
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 2
10
- version: 0.1.2
9
+ - 3
10
+ version: 0.1.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - Austin G. Davis-Richardson
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-05-23 00:00:00 Z
18
+ date: 2012-07-13 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: dna
@@ -129,12 +129,15 @@ files:
129
129
  - .rvmrc
130
130
  - Gemfile
131
131
  - bin/lederhosen
132
+ - examples/pipeline.sh
132
133
  - lederhosen.gemspec
133
134
  - lib/lederhosen.rb
134
135
  - lib/lederhosen/buffer.rb
135
136
  - lib/lederhosen/cli.rb
136
137
  - lib/lederhosen/helpers.rb
138
+ - lib/lederhosen/tasks/add_names.rb
137
139
  - lib/lederhosen/tasks/cluster.rb
140
+ - lib/lederhosen/tasks/hierarchical.rb
138
141
  - lib/lederhosen/tasks/join.rb
139
142
  - lib/lederhosen/tasks/k_filter.rb
140
143
  - lib/lederhosen/tasks/name.rb
@@ -142,10 +145,10 @@ files:
142
145
  - lib/lederhosen/tasks/rep_reads.rb
143
146
  - lib/lederhosen/tasks/sort.rb
144
147
  - lib/lederhosen/tasks/split.rb
148
+ - lib/lederhosen/tasks/squish.rb
145
149
  - lib/lederhosen/tasks/trim.rb
146
150
  - lib/lederhosen/tasks/uc_filter.rb
147
151
  - lib/version.rb
148
- - pipeline.sh
149
152
  - readme.md
150
153
  - spec/data/ILT_L_9_B_001_1.txt
151
154
  - spec/data/ILT_L_9_B_001_3.txt