lederhosen 0.0.8 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -76,56 +76,54 @@ module Lederhosen
76
76
  def load_uc_file(input)
77
77
  clusters = Hash.new
78
78
 
79
- # store a list of samples
79
+ # keep track of samples
80
+ samples = Set.new
81
+
82
+ # store a list of all the sample IDs
80
83
  clusters[:samples] = Set.new
81
84
 
82
85
  # data for each cluster
83
- # - total size
84
- # - size by sample
85
- # - seed sequence
86
- clusters[:count_data] = Hash.new
86
+ # clstr_counts[:clstr][:sample] = number_of_reads
87
+ clstr_counts = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } }
87
88
 
89
+ # seed_to_clstrnr[seed_sequence_id] = clstr_nr
90
+ seed_to_clstrnr = Hash.new
88
91
  File.open(input) do |handle|
89
92
  handle.each do |line|
90
93
 
91
- # skip comments
92
- next if line =~ /^#/
94
+ next if line =~ /^#/ # skip comments
95
+
93
96
  line = line.strip.split
94
97
 
95
98
  # things we want to know
96
99
  type = line[0]
97
- clusternr = line[1]
100
+ clusternr = line[1].to_i
98
101
  querylabel = line[8]
99
102
  targetlabel = line[9]
100
103
  sample = line[8].split(':')[2]
104
+
105
+ # keep track of samples
106
+ samples.add sample
101
107
 
102
108
  # keep track of all samples
103
- clusters[:samples] << sample
109
+ clusters[:samples].add sample
104
110
 
105
111
  if type == 'S' # = Seed Sequence
106
- clusters[:count_data][clusternr] = { :seed => querylabel, :total => 1, :counts => Hash.new{ |h, k| h[k] = 0 } }
112
+ clstr_counts[clusternr][sample] += 1
113
+ seed_to_clstrnr[querylabel] = clusternr
107
114
  elsif type == 'H' # = Seed Member
108
- clusters[:count_data][clusternr][:total] += 1
109
- clusters[:count_data][clusternr][:counts][sample] += 1
115
+ clstr_counts[clusternr][sample] += 1
110
116
  end
111
117
 
112
118
  end
113
119
  end
114
- clusters
120
+ return {
121
+ :clstr_counts => clstr_counts,
122
+ :seed_to_clstrnr => seed_to_clstrnr,
123
+ :samples => samples
124
+ }
115
125
  end
116
126
 
117
- def cluster_data_as_csv(data)
118
- samples = data[:samples].to_a
119
- counts = data[:count_data]
120
-
121
- sep = ","
122
- csv = []
123
- csv << ['-'] + samples
124
- counts.keys.each do |cluster|
125
- csv << ["cluster-#{cluster}"] + samples.collect { |x| "#{counts[cluster][:counts][x]}" }
126
- end
127
- csv.collect { |x| x.join("\t")}.join("\n")
128
- end
129
127
 
130
128
  end # class << self
131
129
  end # class Helpers
@@ -8,9 +8,9 @@ module Lederhosen
8
8
  desc "cluster fasta file",
9
9
  "--input=sorted.fasta --identity=0.80 --output=clusters.uc"
10
10
 
11
- method_option :input, :type => :string, :default => 'sorted.fasta'
12
- method_option :output, :type => :string, :default => 'clusters.uc'
13
- method_option :identity, :type => :numeric, :default => 0.8
11
+ method_option :input, :type => :string, :required => true
12
+ method_option :output, :type => :string, :required => true
13
+ method_option :identity, :type => :numeric, :required => true
14
14
 
15
15
  def cluster
16
16
  identity = options[:identity]
@@ -7,8 +7,8 @@ module Lederhosen
7
7
  desc "join reads end-to-end",
8
8
  "--trimmed=trimmed/*.fasta --output=joined.fasta"
9
9
 
10
- method_option :trimmed, :type => :string, :default => 'trimmed/*.fasta'
11
- method_option :output, :type => :string, :default => 'joined.fasta'
10
+ method_option :trimmed, :type => :string, :required => true
11
+ method_option :output, :type => :string, :required => true
12
12
 
13
13
  def join
14
14
 
@@ -0,0 +1,29 @@
1
+ ##
2
+ # IDENTIFY CLUSTERS IN A TAXCOLLECTOR DATABASE
3
+ #
4
+
5
+ module Lederhosen
6
+ class CLI
7
+
8
+ desc "name identify clusters in a taxcollector database",
9
+ "--reps representative_reads.fasta --database taxcollector.fa --output blast_like_output.txt"
10
+
11
+ method_option :reps, :type => :string, :required => true
12
+ method_option :database, :type => :string, :required => true
13
+ method_option :output, :type => :string, :required => true
14
+
15
+ def name
16
+ reps = options[:reps]
17
+ database = options[:database]
18
+ output = options[:output]
19
+
20
+ # run blat/blast
21
+ cmd = [
22
+ 'blat',
23
+
24
+ ]
25
+
26
+ end
27
+
28
+ end
29
+ end
@@ -2,58 +2,76 @@
2
2
  # MAKE TABLES
3
3
  #
4
4
 
5
+ SEP = ','
6
+
5
7
  module Lederhosen
6
8
  class CLI
7
9
 
8
- desc "otu_tables generates otu tables & representative reads",
9
- "--clusters=clusters.uc --output=otu_prefix --joined=joined.fasta"
10
+ desc "otu_tables generates otu tables",
11
+ "--clusters=clusters.uc --output=otu_prefix"
10
12
 
11
- method_option :clusters, :type => :string, :default => 'clusters.uc'
12
- method_option :output, :type => :string, :default => 'otus'
13
- method_option :joined, :type => :string, :default => 'joined.fasta'
13
+ method_option :clusters, :type => :string, :required => true
14
+ method_option :output, :type => :string, :required => true
14
15
 
15
16
  def otu_table
16
- input = options[:clusters]
17
- output = options[:output]
17
+ input = options[:clusters]
18
+ output = options[:output]
18
19
  joined_reads = options[:joined]
19
-
20
- clusters = Hash.new
20
+
21
21
 
22
22
  # Load cluster table!
23
- clusters = Helpers.load_uc_file(input)
23
+ clstr_info = Helpers.load_uc_file input
24
+ clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
25
+ clstrnr_to_seed = clstr_info[:clstrnr_to_seed]
26
+ samples = clstr_info[:samples]
24
27
 
25
- clusters_total = clusters[:count_data].values.collect{ |x| x[:total] }.inject(:+)
26
-
27
- # Get representative sequences!
28
- reads_total = 0
29
- representatives = {}
30
- clusters[:count_data].each{ |k, x| representatives[x[:seed]] = k }
28
+ # print OTU abundancy matrix
29
+
30
+ File.open("#{output}.csv", 'w') do |h|
31
+ samples = samples.sort
32
+ clusters = clstr_counts.keys
31
33
 
32
- out_handle = File.open("#{output}.fasta", 'w')
34
+ # print header
35
+ head = samples.join(SEP)
36
+ h.puts "-" + SEP + head
33
37
 
34
- File.open(joined_reads) do |handle|
35
- records = Dna.new handle
36
- records.each do |dna|
37
- reads_total += 1
38
- if !representatives[dna.name].nil?
39
- dna.name = "#{dna.name}:cluster_#{representatives[dna.name]}"
40
- out_handle.puts dna
38
+ # start printing clusters
39
+ clusters.each do |cluster|
40
+ h.print "cluster-#{cluster}"
41
+ samples.each do |sample|
42
+ h.print "#{SEP}#{clstr_counts[cluster][sample]}"
41
43
  end
44
+ h.print "\n"
42
45
  end
46
+
43
47
  end
44
48
 
45
- out_handle.close
49
+ # # Get representative sequences!
50
+ # reads_total = 0
51
+ # representatives = {}
52
+ # clusters[:count_data].each{ |k, x| representatives[x[:seed]] = k }
53
+ #
54
+ # out_handle = File.open("#{output}.fasta", 'w')
55
+ #
56
+ # File.open(joined_reads) do |handle|
57
+ # records = Dna.new handle
58
+ # records.each do |dna|
59
+ # reads_total += 1
60
+ # if !representatives[dna.name].nil?
61
+ # dna.name = "#{dna.name}:cluster_#{representatives[dna.name]}"
62
+ # out_handle.puts dna
63
+ # end
64
+ # end
65
+ # end
66
+ #
67
+ # out_handle.close
68
+ #
69
+ # # Print some statistics
70
+ # ohai "reads in clusters: #{clusters_total}"
71
+ # ohai "number of reads: #{reads_total}"
72
+ # ohai "unique clusters: #{clusters.keys.length}"
46
73
 
47
- # Print some statistics
48
- ohai "reads in clusters: #{clusters_total}"
49
- ohai "number of reads: #{reads_total}"
50
- ohai "unique clusters: #{clusters.keys.length}"
51
74
 
52
- # print OTU abundancy matrix
53
- csv = Helpers.cluster_data_as_csv(clusters)
54
- File.open("#{output}.csv", 'w') do |h|
55
- h.puts csv
56
- end
57
75
 
58
76
  end
59
77
 
@@ -0,0 +1,44 @@
1
+ ##
2
+ # GET REPRESENTATIVE READS
3
+ #
4
+
5
+ module Lederhosen
6
+ class CLI
7
+
8
+ desc "rep_reads extract representative reads for each cluster to a fasta file",
9
+ "--clusters=clusters.uc --joined=joined.fasta --output=representative_reads.fasta"
10
+
11
+ method_option :clusters, :type => :string, :required => true
12
+ method_option :output, :type => :string, :required => true
13
+ method_option :joined, :type => :string, :required => true
14
+
15
+ def rep_reads
16
+ input = options[:clusters]
17
+ output = options[:output]
18
+ joined_reads = options[:joined]
19
+
20
+
21
+ # Load cluster table!
22
+ clstr_info = Helpers.load_uc_file input
23
+ clstr_counts = clstr_info[:clstr_counts] # clstr_counts[:clstr][sample.to_i] = reads
24
+ seed_to_clstrnr = clstr_info[:seed_to_clstrnr]
25
+ samples = clstr_info[:samples]
26
+
27
+ out_handle = File.open("#{output}", 'w')
28
+
29
+ File.open(joined_reads) do |handle|
30
+ records = Dna.new handle
31
+ records.each do |dna|
32
+ clstrnr = seed_to_clstrnr[dna.name]
33
+ unless clstrnr.nil?
34
+ dna.name = "#{dna.name}:cluster-#{clstrnr}"
35
+ out_handle.puts dna
36
+ end
37
+ end
38
+ end
39
+
40
+ out_handle.close
41
+ end
42
+
43
+ end
44
+ end
@@ -8,9 +8,8 @@ module Lederhosen
8
8
  desc "sort fasta file by length",
9
9
  "--input=joined.fasta --output=sorted.fasta"
10
10
 
11
- method_options :input => :string, :output => :string
12
- method_option :input, :type => :string, :default => 'joined.fasta'
13
- method_option :output, :type => :string, :default => 'sorted.fasta'
11
+ method_option :input, :type => :string, :required => true
12
+ method_option :output, :type => :string, :required => true
14
13
 
15
14
  def sort
16
15
  input = options[:input]
@@ -8,9 +8,9 @@ module Lederhosen
8
8
  desc "output separate fasta file containing sequences belonging to each cluster",
9
9
  "--clusters=clusters.uc --reads=joined.fasta --min-clst-size=100"
10
10
 
11
- method_option :clusters, :type => :string, :default => 'clusters.uc'
12
- method_option :reads, :type => :string, :default => 'joined.fasta'
13
- method_option :out_dir, :type => :string, :default => 'clusters_split'
11
+ method_option :clusters, :type => :string, :required => true
12
+ method_option :reads, :type => :string, :required => true
13
+ method_option :out_dir, :type => :string, :required => true
14
14
  method_option :buffer_size, :type => :numeric, :default => 1000
15
15
  method_option :min_clst_size, :type => :numeric, :default => 100
16
16
 
@@ -9,7 +9,7 @@ module Lederhosen
9
9
  "--reads_dir=reads/* --out_dir=trimmed.fasta"
10
10
 
11
11
  method_option :reads_dir, :type => :string, :required => true
12
- method_option :out_dir, :type => :string, :default => 'trimmed/'
12
+ method_option :out_dir, :type => :string, :required => true
13
13
 
14
14
  def trim
15
15
 
data/lib/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Lederhosen
2
- VERSION = '0.0.8'
2
+ VERSION = '0.0.9'
3
3
  end
data/readme.md CHANGED
@@ -1,11 +1,16 @@
1
1
  # Lederhosen
2
2
 
3
- Cluster raw Illumina 16S rRNA amplicon data to generate OTUs.
3
+ Cluster raw Illumina 16S rRNA amplicon data to generate OTUs. Use at your own risk.
4
4
 
5
5
  ## How do I get Lederhosen?
6
6
 
7
- 0. Obtain & Install uclust (64-bit)
8
- 1. `sudo gem install lederhosen`
7
+ 0. Obtain & Install [UCLUST](http://www.drive5.com/) (64-bit)
8
+ 1. Obtain & Install [BLAT](http://genome.ucsc.edu/FAQ/FAQblat.html#blat3)
9
+ 2. Get a copy of [TaxCollector](http://github.com/audy/taxcollector)
10
+ 3. Install Lederhosen by typing:
11
+
12
+ `sudo gem install lederhosen`
13
+ 4. Check installation by typing `lederhosen`. You should see some help text.
9
14
 
10
15
  ## How do I use Lederhosen?
11
16
 
@@ -13,26 +18,42 @@ Type `lederhosen help` for complete instructions
13
18
 
14
19
  ### 1. Trim raw reads
15
20
 
16
- `$ lederhosen trim --reads-dir=reads-dir/*.txt`
21
+ `$ lederhosen trim --reads-dir=reads-dir/*.txt --out-dir=trimmed`
17
22
 
18
23
  ### 2. Join trimmed reads
19
24
 
20
- `$ lederhosen join`
25
+ `$ lederhosen join --trimmed=trimmed/*.fasta --output=joined.fasta`
21
26
 
22
27
  ### 3. Sort trimmed reads
23
28
 
24
- `$ lederhosen sort`
29
+ `$ lederhosen sort --input=joined.fasta --output=sorted.fasta`
25
30
 
26
31
  ### 4. Cluster sorted reads
27
32
 
28
- `$ lederhosen cluster --idenity=0.975`
33
+ `$ lederhosen cluster --identity=0.975 --input=sorted.fasta --output=clusters`
34
+
35
+ ### 5. Make OTU tables
36
+
37
+ `% lederhosen otu_table --clusters=clusters.uc --output=clusters_975`
38
+
39
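+ This will output a csv file containing the OTU abundance table. `--output` is treated as a prefix, so the table is written to `<output>.csv`. To get a fasta file of representative reads, which can be used to identify clusters in a 16S rRNA database using BLAST or something, use `rep_reads` (below).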
40
+
41
+ ### 6. Get representative reads from each cluster
29
42
 
30
- ### 5. Make tables & Get representative sequences
43
+ `% lederhosen rep_reads --clusters=clusters.uc --joined=joined.fasta --output=representatives.fasta`
31
44
 
32
- `% lederhosen otu_table --clusters=clusters.uc --output=clusters9.75.txt`
45
+ ### 7. Get a fasta file containing all reads for each cluster
33
46
 
34
- ### 6. Get fasta files with reads for each cluster
47
+ (time consuming and probably not necessary)
35
48
 
36
49
  `% lederhosen split --clusters=clusters_97.5.txt --reads=joined.fasta --min-clst-size=100`
37
50
 
38
- `--min-clst-size` is the minimum reads a cluster must have in order to for a fasta file containing its reads to be created. The reason for needing this because it is computationally prohibitive to randomly write millions of files or store all reads in memory, sort, and output non-randomly.
51
+ `--min-clst-size` is the minimum number of reads a cluster must have in order for a fasta file containing its reads to be created. This is needed because it is computationally prohibitive to randomly write millions of files or store all reads in memory, sort, and output non-randomly.
52
+
53
+ ### 8. Identifying Clusters
54
+
55
+ (Still under development)
56
+
57
+ You need BLAT (in your `$PATH`) & TaxCollector.
58
+
59
+ `$ lederhosen name --reps=representatives.fasta --database=taxcollector.fa --output=output_prefix`
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lederhosen
3
3
  version: !ruby/object:Gem::Version
4
- hash: 15
4
+ hash: 13
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 8
10
- version: 0.0.8
9
+ - 9
10
+ version: 0.0.9
11
11
  platform: ruby
12
12
  authors:
13
13
  - Austin G. Davis-Richardson
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-04-13 00:00:00 Z
18
+ date: 2012-05-01 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: dna
@@ -122,7 +122,9 @@ files:
122
122
  - lib/lederhosen/helpers.rb
123
123
  - lib/lederhosen/tasks/cluster.rb
124
124
  - lib/lederhosen/tasks/join.rb
125
+ - lib/lederhosen/tasks/name.rb
125
126
  - lib/lederhosen/tasks/otu_table.rb
127
+ - lib/lederhosen/tasks/rep_reads.rb
126
128
  - lib/lederhosen/tasks/sort.rb
127
129
  - lib/lederhosen/tasks/split.rb
128
130
  - lib/lederhosen/tasks/trim.rb
@@ -164,7 +166,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
164
166
  requirements: []
165
167
 
166
168
  rubyforge_project: lederhosen
167
- rubygems_version: 1.8.21
169
+ rubygems_version: 1.8.24
168
170
  signing_key:
169
171
  specification_version: 3
170
172
  summary: 16S rRNA clustering for paired-end Illumina