lederhosen 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,4 +1,4 @@
1
- #!/bin/bash
1
+ #!/bash
2
2
 
3
3
  # An example OTU clustering pipeline
4
4
  # Austin G. Davis-Richardson
@@ -14,55 +14,55 @@ min_reads=50
14
14
  min_samples=10
15
15
 
16
16
  # trim reads
17
- bin/lederhosen trim \
17
+ lederhosen trim \
18
18
  --reads-dir=$raw_reads \
19
19
  --out-dir=$out_dir/trimmed
20
20
 
21
21
  # join reads
22
- bin/lederhosen join \
22
+ lederhosen join \
23
23
  --trimmed=$out_dir/trimmed/*.fasta \
24
24
  --output=$out_dir/joined.fasta
25
25
 
26
26
  # filter reads
27
- bin/lederhosen k_filter \
27
+ lederhosen k_filter \
28
28
  --input=$out_dir/joined.fasta \
29
29
  --output=$out_dir/filtered.fasta \
30
30
  -k=10 \
31
31
  --cutoff=50
32
32
 
33
33
  # sort
34
- bin/lederhosen sort \
34
+ lederhosen sort \
35
35
  --input=$out_dir/filtered.fasta \
36
36
  --output=$out_dir/sorted.fasta
37
37
 
38
38
  for i in 0.80 0.90 0.95
39
39
  do
40
40
  # cluster
41
- bin/lederhosen cluster \
41
+ lederhosen cluster \
42
42
  --input=$out_dir/sorted.fasta \
43
43
  --output=$out_dir/clusters_"$i".uc \
44
44
  --identity=$i
45
45
 
46
46
  # filter uc file
47
- bin/lederhosen uc_filter \
47
+ lederhosen uc_filter \
48
48
  --input=$out_dir/clusters_"$i".uc \
49
49
  --output=$out_dir/clusters_"$i".uc.filtered \
50
50
  --reads=$min_reads \
51
51
  --samples=$min_samples \
52
52
 
53
53
  # generate otu table
54
- bin/lederhosen otu_table \
54
+ lederhosen otu_table \
55
55
  --clusters=$out_dir/clusters_"$i".uc.filtered \
56
56
  --output=$out_dir/otus_"$i"
57
57
 
58
58
  # get representative reads
59
- bin/lederhosen rep_reads \
59
+ lederhosen rep_reads \
60
60
  --clusters=$out_dir/clusters_"$i".uc.filtered \
61
61
  --joined=$out_dir/sorted.fasta \
62
62
  --output=$out_dir/representatives_"$i".fasta
63
63
 
64
64
  # blast representative reads
65
- bin/lederhosen name \
65
+ lederhosen name \
66
66
  --reps=$out_dir/representatives_"$i".fasta \
67
67
  --output=$out_dir/taxonomies_"$i".txt \
68
68
  --database=$taxcollector
@@ -0,0 +1,86 @@
1
+ ##
2
+ # ADD TAXONOMIC DESCRIPTIONS TO OTU TABLE
3
+ #
4
+
5
+ module Lederhosen
6
+ class CLI
7
+
8
+ desc "add_names",
9
+ "--blat=blat_output.txt --table=cluster_table.csv --level=taxonomic level (i.e 6 genus)"
10
+
11
+ method_option :blat, :type => :string, :required => true
12
+ method_option :table, :type => :string, :required => true
13
+ method_option :level, :type => :string, :required => true
14
+ method_option :output, :type => :string, :required => false
15
+
16
# Replace the cluster ids in the header row of an OTU table with taxonomic
# names taken from BLAT output, then copy the rest of the table unchanged.
#
# Reads from +options+:
#   :blat   - path to the BLAT output. The first whitespace-separated field
#             of each line is colon-delimited and carries the cluster id as
#             its 4th component; the second field is a TaxCollector-style
#             description such as "[0]Bacteria;[1]Actinobacteria;...".
#   :table  - path to the comma-separated OTU table (first row is the header,
#             first column is the sample name).
#   :level  - taxonomic level name (kingdom, domain, phylum, class, order,
#             family, genus or species).
#   :output - path of the file to write; $stdout is used when omitted.
#
# Fails with a RuntimeError when the level is unknown or the table is empty.
def add_names
  blat_path   = options[:blat]
  table_path  = options[:table]
  level       = options[:level]
  output_path = options[:output]

  # Corresponds with the numbers used in the TaxCollector database
  # taxonomic descriptions
  levels = { 'kingdom' => 0,
             'domain'  => 0,
             'phylum'  => 1,
             'class'   => 2,
             'order'   => 3,
             'family'  => 4,
             'genus'   => 5,
             'species' => 6 }

  fail "unknown level. try #{levels.keys.join(', ')}" unless levels.key? level

  level_no = levels[level]

  # Matches the name that follows "[level_no]" in a description, e.g.
  # "Actinobacteria" for level_no = 2 in
  # "[0]Bacteria;[1]Actinobacteria;[2]Actinobacteria;[3]Acidimicrobiales;"
  level_pattern = /\[#{level_no}\](\w*)[;\[]/

  # map cluster_id to taxonomic description
  # default is the cluster_id itself in case
  # the cluster was not classified.
  clusterid_to_name = Hash.new { |h, k| h[k] = k }

  ohai "loading BLAT output from #{blat_path}"
  File.open(blat_path) do |handle|
    handle.each do |line|
      fields = line.strip.split

      # blank or truncated lines carry no usable hit
      next if fields.size < 2

      # Only the first hit that names the requested level is kept.
      # TODO something smarter here
      cluster_id = fields[0].split(':')[3]
      next if cluster_id.nil? || clusterid_to_name.key?(cluster_id)

      match = fields[1].match(level_pattern)
      next unless match

      clusterid_to_name[cluster_id] = match[1]
    end
  end

  ohai "replacing names in #{table_path}"
  out = output_path ? File.open(output_path, 'w') : $stdout
  begin
    File.open(table_path) do |handle|
      header_line = handle.gets
      fail "#{table_path} is empty" if header_line.nil?

      # read in header, replace clusterids with names (first cell is kept)
      header = header_line.strip.split(',')
      if header.size > 1
        header[1..-1] = header[1..-1].map { |id| clusterid_to_name[id] }
      end

      out.puts header.join(',')

      # the rest of the table is copied through untouched
      handle.each { |row| out.print row }
    end
  ensure
    # flush and release the output file, but never close $stdout
    out.close unless out.equal?($stdout)
  end

  # An id counts as named when it maps to something other than itself
  # (unclassified header ids fall back to the id via the hash default).
  named = clusterid_to_name.select { |id, name| id != name }.size
  ohai "Got #{named} names (#{clusterid_to_name.size} total)"
end
84
+
85
+ end
86
+ end
@@ -5,7 +5,7 @@
5
5
  module Lederhosen
6
6
  class CLI
7
7
 
8
- desc "cluster fasta file",
8
+ desc "cluster",
9
9
  "--input=sorted.fasta --identity=0.80 --output=clusters.uc"
10
10
 
11
11
  method_option :input, :type => :string, :required => true
@@ -0,0 +1,40 @@
1
+ ##
2
+ # HIERARCHICAL CLUSTERING FTW
3
+ #
4
+
5
+ module Lederhosen
6
+ class CLI
7
+
8
+ desc "h_cluster",
9
+ "--input=sorted.fasta --identity=0.80 --output=clusters.uc --identities=0.80 0.90 0.95"
10
+
11
+ method_option :input, :type => :string, :required => true
12
+ method_option :out_dir, :type => :string, :required => true
13
+ method_option :identities, :type => :array, :required => true
14
+
15
# Run the first stage of hierarchical clustering: cluster the input at the
# lowest requested identity, filter the resulting uc file and split the
# reads by cluster.
#
# Reads from +options+:
#   :input      - path to the sorted fasta file to cluster
#   :out_dir    - directory that receives the uc files (created if missing)
#   :identities - list of identity thresholds; they are converted to floats
#                 and sorted ascending
#
# Fails with a RuntimeError when no identities are given.
#
# TODO: only the lowest identity is clustered so far; the remaining
# thresholds are parsed but not yet used.
def h_cluster
  # required here because this file's top-level requires are not in view
  require 'fileutils'

  out_dir    = options[:out_dir]
  input      = options[:input]
  identities = options[:identities].map(&:to_f).sort

  fail "no identities given" if identities.empty?

  # FileUtils avoids passing the path through a shell (spaces, metacharacters)
  FileUtils.mkdir_p(out_dir)

  # initial clustering uses the lowest identity
  i = identities.shift
  clusters          = File.join(out_dir, "clusters_#{i}.uc")
  clusters_filtered = File.join(out_dir, "clusters_#{i}.uc.filtered")

  # cluster
  invoke :cluster, [], { :input => input, :output => clusters, :identity => i }

  # filter
  invoke :uc_filter, [], { :input => clusters, :output => clusters_filtered }

  # get reads for each cluster
  # NOTE(review): split declares :out_dir as required; a per-identity
  # subdirectory is used here -- confirm this is the intended location.
  invoke :split, [], { :clusters => clusters_filtered,
                       :reads    => input,
                       :out_dir  => File.join(out_dir, "split_#{i}") }
end
38
+
39
+ end
40
+ end
@@ -4,7 +4,7 @@ module Lederhosen
4
4
  ##
5
5
  # PAIRED-END READ WORK-AROUND (JOIN THEM)
6
6
  #
7
- desc "join reads end-to-end",
7
+ desc "join",
8
8
  "--trimmed=trimmed/*.fasta --output=joined.fasta"
9
9
 
10
10
  method_option :trimmed, :type => :string, :required => true
@@ -5,7 +5,7 @@
5
5
  module Lederhosen
6
6
  class CLI
7
7
 
8
- desc "k_filter khmer filtering",
8
+ desc "k_filter",
9
9
  "--input=joined.fasta --output=filtered.fasta --k=10 --cutoff=50"
10
10
 
11
11
  method_option :input, :type => :string, :required => true
@@ -5,8 +5,8 @@
5
5
  module Lederhosen
6
6
  class CLI
7
7
 
8
- desc "name identify clusters in a taxcollector database",
9
- "--reps representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt"
8
+ desc "name",
9
+ "--reps --reps=representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt"
10
10
 
11
11
  method_option :reps, :type => :string, :required => true
12
12
  method_option :database, :type => :string, :required => true
@@ -7,7 +7,7 @@ SEP = ','
7
7
  module Lederhosen
8
8
  class CLI
9
9
 
10
- desc "otu_tables generates otu tables",
10
+ desc "otu_table",
11
11
  "--clusters=clusters.uc --output=otu_prefix"
12
12
 
13
13
  method_option :clusters, :type => :string, :required => true
@@ -5,7 +5,7 @@
5
5
  module Lederhosen
6
6
  class CLI
7
7
 
8
- desc "rep_reads extract representative reads for each cluster to a fasta file",
8
+ desc "rep_reads",
9
9
  "--clusters=clusters.uc --joined=joined.fasta --output=representative_reads.fasta"
10
10
 
11
11
  method_option :clusters, :type => :string, :required => true
@@ -5,7 +5,7 @@
5
5
  module Lederhosen
6
6
  class CLI
7
7
 
8
- desc "sort fasta file by length",
8
+ desc "sort",
9
9
  "--input=joined.fasta --output=sorted.fasta"
10
10
 
11
11
  method_option :input, :type => :string, :required => true
@@ -5,14 +5,14 @@
5
5
  module Lederhosen
6
6
  class CLI
7
7
 
8
- desc "output separate fasta file containing sequences belonging to each cluster",
9
- "--clusters=clusters.uc --reads=joined.fasta --min-clst-size=100"
8
+ desc "split",
9
+ "--clusters=clusters.uc --reads=joined.fasta --min-clst-size=1 --out-dir=output_directory"
10
10
 
11
11
  method_option :clusters, :type => :string, :required => true
12
12
  method_option :reads, :type => :string, :required => true
13
13
  method_option :out_dir, :type => :string, :required => true
14
14
  method_option :buffer_size, :type => :numeric, :default => 1000
15
- method_option :min_clst_size, :type => :numeric, :default => 100
15
+ method_option :min_clst_size, :type => :numeric, :default => 1
16
16
 
17
17
  def split
18
18
  clusters = options[:clusters]
@@ -0,0 +1,48 @@
1
+ ##
2
+ # SQUISH A CSV FILE BY COLUMN NAME
3
+ #
4
+
5
+ module Lederhosen
6
+ class CLI
7
+
8
+ desc 'squish', 'merge cell values (reads) in a csv file by column name (cluster)'
9
+
10
+ method_option :csv_file, :type => :string, :required => true
11
+ method_option :output, :type => :string, :required => false
12
+
13
# Merge the columns of a CSV table that share the same column name by
# summing their cell values per row, and write the squished table.
#
# Reads from +options+:
#   :csv_file - path to the comma-separated input; the first row is the
#               header and the first cell of every row is the sample name
#   :output   - path of the file to write; $stdout is used when omitted
#
# The output header is "-" followed by the unique column names in sorted
# order; each following row is the sample name and its summed counts.
# Fails with a RuntimeError when the input file is empty.
def squish
  csv_file    = options[:csv_file]
  output_path = options[:output]

  # sample_name -> column name -> total number of reads
  totals = Hash.new { |by_sample, sample| by_sample[sample] = Hash.new(0) }
  column_names = []

  # Load CSV file, merge counts in columns with the same name
  File.open(csv_file) do |handle|
    header = handle.gets
    fail "#{csv_file} is empty" if header.nil?
    column_names = header.strip.split(',')[1..-1] || []

    handle.each do |line|
      cells = line.strip.split(',')
      next if cells.empty? # skip blank lines

      sample = cells[0]
      cells[1..-1].zip(column_names) do |reads, column_name|
        totals[sample][column_name] += reads.to_i
      end
    end
  end

  # uniq! returns nil when nothing was removed, so use the non-bang forms
  squished_columns = column_names.uniq.sort

  # print the new, squished csv file
  out = output_path ? File.open(output_path, 'w') : $stdout
  begin
    out.puts "-,#{squished_columns.join(',')}"
    totals.each_pair do |sample_id, row|
      counts = squished_columns.map { |name| row[name] }
      out.puts(([sample_id] + counts).join(','))
    end
  ensure
    # release the output file, but never close $stdout
    out.close unless out.equal?($stdout)
  end
end
47
+ end
48
+ end
@@ -5,7 +5,7 @@
5
5
  module Lederhosen
6
6
  class CLI
7
7
 
8
- desc "trim Illumina QSEQ files",
8
+ desc "trim",
9
9
  "--reads_dir=reads/* --out_dir=trimmed.fasta"
10
10
 
11
11
  method_option :reads_dir, :type => :string, :required => true
@@ -5,7 +5,7 @@
5
5
  module Lederhosen
6
6
  class CLI
7
7
 
8
- desc "uc_filter filter uc file by min samples",
8
+ desc "uc_filter",
9
9
  "--input=clusters.uc --output=clusters.uc.filtered --reads=50 --samples=10"
10
10
 
11
11
  method_option :input, :type => :string, :required => true
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Lederhosen
2
- VERSION = '0.1.2'
2
+ VERSION = '0.1.3'
3
3
  end
data/readme.md CHANGED
@@ -12,12 +12,6 @@ Cluster raw Illumina 16S rRNA amplicon data to generate OTUs. Use at your own ri
12
12
  `sudo gem install lederhosen`
13
13
  4. Check installation by typing `lederhosen`. You should see some help text.
14
14
 
15
- ## How do I use Lederhosen?
16
-
17
- Type `lederhosen help` for complete instructions
18
-
19
- See pipeline.sh for example usage.
20
-
21
15
  ## Features
22
16
 
23
17
  - Sequence trimming (paired-end Illumina).
@@ -27,4 +21,86 @@ See pipeline.sh for example usage.
27
21
  - Separation of representative reads.
28
22
  - Separation of all reads belonging to each cluster.
29
23
  - Identification of clusters using TaxCollector.
30
- - Generation of OTU abundancy matrices.
24
+ - Generation of OTU abundancy matrices.
25
+
26
+ ## How do I use Lederhosen?
27
+
28
+ Lederhosen is just a convenient wrapper for UCLUST and BLAT with some scripts for quality filtering, de-noising of data as well as creation of nice tables. It is similar to QIIME but meant for paired-end Illumina data rather than single-end 454. The basic lederhosen pipeline consists of: trimming, joining, sorting, filtering, clustering, more filtering, and output generation (OTU tables, representative reads, reads by cluster, and taxonomic descriptions for clusters). See the example pipeline in `pipeline.sh`.
29
+
30
+ ## Tasks
31
+
32
+ Lederhosen is invoked by typing `lederhosen [TASK]`
33
+
34
+ ### trim
35
+
36
+ Trim (Illumina) reads using quality scores. Output will be a directory of fasta files.
37
+
38
+ lederhosen trim --reads_dir=reads/* --out_dir=trimmed/
39
+
40
+ ### join
41
+
42
+ Join paired reads from all samples end-to-end. This method enables the use of uclust with paired-end data. Output will be a single fasta file.
43
+
44
+ lederhosen join --trimmed=trimmed/*.fasta --output=joined.fasta
45
+
46
+ ### sort
47
+
48
+ Sort reads by length. This is a requirement for uclust's single-linkage clustering algorithm.
49
+
50
+ lederhosen sort --input=joined.fasta --output=sorted.fasta
51
+
52
+ ### k_filter
53
+
54
+ K-mer abundance noise filtering. This step is experimental and optional. It may reduce the time it takes to perform the clustering.
55
+
56
+ lederhosen k_filter --input=joined.fasta --output=filtered.fasta --k=10 --cutoff=50
57
+
58
+ ### cluster
59
+
60
+ Cluster reads using UCLUST. Output is a uc file.
61
+
62
+ lederhosen cluster --input=sorted.fasta --identity=0.80 --output=clusters.uc
63
+
64
+ ### uc_filter
65
+
66
+ Filter UC file removing singleton clusters or clusters that are only present in a few samples. This greatly reduces the noise of the data without removing many of the reads.
67
+
68
+ lederhosen uc_filter --input=clusters.uc --output=clusters.uc.filtered --reads=50 --samples=10
69
+
70
+ ### otu_table
71
+
72
+ Create an OTU abundance table where rows are samples and columns are clusters. The entries are the number of reads for that cluster in a sample.
73
+
74
+ lederhosen otu_table --clusters=clusters.uc --output=otu_prefix.csv
75
+
76
+ ### rep_reads
77
+
78
+ Get representative reads for each cluster. Output is a single fasta file.
79
+
80
+ lederhosen rep_reads --clusters=clusters.uc --joined=joined.fasta --output=representative_reads.fasta
81
+
82
+ ### split
83
+
84
+ Get all reads belonging to each cluster. Output is a directory containing a fasta file for each cluster. The fasta file contains the joined reads.
85
+
86
+ lederhosen split --clusters=clusters.uc --reads=joined.fasta --min-clst-size=100 --out-dir=output_directory
87
+
88
+ ### name
89
+
90
+ Identify clusters in a database using the representative reads. This is a simple wrapper for BLAT. The output is a tab-delimited file similar to a BLAST output file. For this step you need to have BLAT installed and also a [TaxCollector](http://github.com/audy/taxcollector) database.
91
+
92
+ lederhosen name --reps=representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt
93
+
94
+ ### add_names
95
+
96
+ Add phylogenetic classification of clusters to OTU abundance file.
97
+
98
+ lederhosen add_names --blat=blat_output.txt --level=taxonomic_level --table=otu_file.csv --output=named_out_file.csv
99
+
100
+ Where `taxonomic_level` can be: kingdom, domain, phylum, class, order, family, genus or species. This method only works with a TaxCollector database.
101
+
102
+ ### squish
103
+
104
+ Squish an OTU abundance file by column name (phylogenetic description)
105
+
106
+ lederhosen squish --csv-file=named_out_file.csv --output=squished_named_out_file.csv
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lederhosen
3
3
  version: !ruby/object:Gem::Version
4
- hash: 31
4
+ hash: 29
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 1
9
- - 2
10
- version: 0.1.2
9
+ - 3
10
+ version: 0.1.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - Austin G. Davis-Richardson
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-05-23 00:00:00 Z
18
+ date: 2012-07-13 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: dna
@@ -129,12 +129,15 @@ files:
129
129
  - .rvmrc
130
130
  - Gemfile
131
131
  - bin/lederhosen
132
+ - examples/pipeline.sh
132
133
  - lederhosen.gemspec
133
134
  - lib/lederhosen.rb
134
135
  - lib/lederhosen/buffer.rb
135
136
  - lib/lederhosen/cli.rb
136
137
  - lib/lederhosen/helpers.rb
138
+ - lib/lederhosen/tasks/add_names.rb
137
139
  - lib/lederhosen/tasks/cluster.rb
140
+ - lib/lederhosen/tasks/hierarchical.rb
138
141
  - lib/lederhosen/tasks/join.rb
139
142
  - lib/lederhosen/tasks/k_filter.rb
140
143
  - lib/lederhosen/tasks/name.rb
@@ -142,10 +145,10 @@ files:
142
145
  - lib/lederhosen/tasks/rep_reads.rb
143
146
  - lib/lederhosen/tasks/sort.rb
144
147
  - lib/lederhosen/tasks/split.rb
148
+ - lib/lederhosen/tasks/squish.rb
145
149
  - lib/lederhosen/tasks/trim.rb
146
150
  - lib/lederhosen/tasks/uc_filter.rb
147
151
  - lib/version.rb
148
- - pipeline.sh
149
152
  - readme.md
150
153
  - spec/data/ILT_L_9_B_001_1.txt
151
154
  - spec/data/ILT_L_9_B_001_3.txt