lederhosen 1.3.8 → 1.3.10

Sign up to get free protection for your applications and to get access to all the features.
data/lederhosen.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "lederhosen"
8
- s.version = "1.3.8"
8
+ s.version = "1.3.10"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Austin G. Davis-Richardson"]
12
- s.date = "2012-12-07"
12
+ s.date = "2012-12-13"
13
13
  s.description = "Various tools for OTU clustering"
14
14
  s.email = "harekrishna@gmail.com"
15
15
  s.executables = ["lederhosen"]
@@ -1,3 +1,4 @@
1
+
1
2
  module Lederhosen
2
3
  class CLI
3
4
 
@@ -7,7 +8,7 @@ module Lederhosen
7
8
 
8
9
  # parse a line of usearch prefix
9
10
  # return a hash in the form:
10
- # { :taxonomy => '', :identity => 0.00, ... }
11
+ # { :taxonomy => '', :identity => '0.00', ... }
11
12
  # unless the line is not a "hit" in which case
12
13
  # the function returns nil
13
14
  def parse_usearch_line(str)
@@ -18,7 +19,7 @@ module Lederhosen
18
19
  str = str.split
19
20
 
20
21
  taxonomic_description = str[9]
21
- identity = str[3].to_f
22
+ identity = str[3]
22
23
 
23
24
  # parse taxonomic_description
24
25
  taxonomies = parse_taxonomy(taxonomic_description) rescue { 'original' => str[9] }
@@ -59,9 +60,13 @@ module Lederhosen
59
60
  end
60
61
  end
61
62
 
63
+ RE_TAXCOLLECTOR = /^\[0\](.*);\[1\](.*);\[2\](.*);\[3\](.*);\[4\](.*);\[5\](.*);\[6\](.*);\[7\](.*);\[8\](.*)/
64
+ RE_GREENGENES = /k__(.*); ?p__(.*); ?c__(.*); ?o__(.*); ?f__(.*); ?g__(.*); ?(.*);/
65
+ RE_QIIME = /k__(.*);p__(.*);c__(.*);o__(.*);f__(.*);g__(.*);s__(.*)/
66
+
62
67
  def parse_taxonomy_qiime(taxonomy)
63
68
  levels = %w{kingdom phylum class order family genus species}
64
- match_data = taxonomy.match(/k__(\w*);p__(\w*);c__(\w*);o__(\w*);f__(\w*);g__(\w*);s__(\w*)/)
69
+ match_data = taxonomy.match(RE_QIIME)
65
70
  match_data = match_data[1..-1]
66
71
 
67
72
  names = Hash.new
@@ -74,7 +79,7 @@ module Lederhosen
74
79
 
75
80
  def parse_taxonomy_greengenes(taxonomy)
76
81
  levels = %w{kingdom phylum class order family genus species}
77
- match_data = taxonomy.match(/k__(\w*); ?p__(\w*); ?c__(\w*); ?o__(\w*); ?f__(\w*); ?g__(\w*); ?(\w*);/)
82
+ match_data = taxonomy.match(RE_GREENGENES)
78
83
  match_data = match_data[1..-1]
79
84
 
80
85
  names = Hash.new
@@ -100,9 +105,8 @@ module Lederhosen
100
105
 
101
106
  match_data =
102
107
  begin
103
- taxonomy.match(/\[0\](.*);\[1\](.*);\[2\](.*);\[3\](.*);\[4\](.*);\[5\](.*);\[6\](.*);\[7\](.*);\[8\](.*)/)[1..-1]
108
+ taxonomy.match(RE_TAXCOLLECTOR)[1..-1]
104
109
  rescue
105
- $stderr.puts taxonomy.inspect
106
110
  return nil
107
111
  end
108
112
 
@@ -2,8 +2,6 @@
2
2
  # MAKE TABLES
3
3
  #
4
4
 
5
- require 'set'
6
-
7
5
  module Lederhosen
8
6
  class CLI
9
7
 
@@ -30,16 +28,19 @@ module Lederhosen
30
28
  fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom original}.include? level
31
29
  end
32
30
 
33
- level_sample_cluster_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } } }
34
-
35
- all_names = Hash.new { |h, k| h[k] = Set.new }
31
+ # there has to be a more efficient way of doing this
32
+ level_sample_cluster_count =
33
+ Hash.new do |h, k|
34
+ h[k] = Hash.new do |h, k|
35
+ h[k] = Hash.new(0)
36
+ end
37
+ end
36
38
 
37
39
  # create a progress bar with the total number of bytes of
38
40
  # the files we're slurping up
39
41
  pbar = ProgressBar.new "loading", input.size
40
42
 
41
43
  # Load cluster table
42
-
43
44
  input.each do |input_file|
44
45
  pbar.inc
45
46
  File.open(input_file) do |handle|
@@ -54,8 +55,8 @@ module Lederhosen
54
55
  dat[level] || 'unparsed_name'
55
56
  end
56
57
 
58
+ # the next line is what is slow
57
59
  level_sample_cluster_count[level][input_file][name] += 1
58
- all_names[level] << name
59
60
  end
60
61
 
61
62
  end
@@ -64,6 +65,14 @@ module Lederhosen
64
65
 
65
66
  pbar.finish
66
67
 
68
+ # get all taxonomic names at each level
69
+ all_names = Hash.new.tap do |bar|
70
+ level_sample_cluster_count.each_pair.map do |k, v|
71
+ names = v.each_value.map(&:keys).flatten.uniq
72
+ bar[k] = names
73
+ end
74
+ end
75
+
67
76
  # save to csv(s)
68
77
  levels.each do |level|
69
78
 
@@ -2,6 +2,8 @@
2
2
  # QUALITY TRIMMING
3
3
  #
4
4
 
5
+ # This should probably be broken into its own module or command-line utility.
6
+
5
7
  module Lederhosen
6
8
  class CLI
7
9
 
@@ -10,10 +12,12 @@ module Lederhosen
10
12
 
11
13
  method_option :reads_dir, :type => :string, :required => true
12
14
  method_option :out_dir, :type => :string, :required => true
15
+ method_option :pretrim, :type => :numeric, :default => 11
13
16
 
14
17
  def trim
15
18
  raw_reads = options[:reads_dir]
16
19
  out_dir = options[:out_dir]
20
+ pretrim = options[:pretrim]
17
21
 
18
22
  ohai "trimming #{File.dirname(raw_reads)} and saving to #{out_dir}"
19
23
 
@@ -92,16 +96,21 @@ module Lederhosen
92
96
  # returns just the sequence
93
97
  def trim_seq(dna, args={})
94
98
 
99
+
100
+ pretrim = args[:pretrim] || false
95
101
  # trim primers off of sequence
96
- # (THIS IS EXPERIMENT-SPECIFIC)
97
- dna.sequence = dna.sequence[11..-1]
98
- dna.quality = dna.quality[11..-1]
102
+ # XXX the primer length is experiment-specific; it is now read from
103
+ # args[:pretrim], but the final slice below still hard-codes 11
104
+ if pretrim
105
+ dna.sequence = dna.sequence[pretrim..-1]
106
+ dna.quality = dna.quality[pretrim..-1]
107
+ end
99
108
 
100
109
  # throw away any read with an ambiguous primer
101
110
  return nil if dna.sequence =~ /N/
102
111
 
103
- min = args[:min] || 20
104
- offset = args[:cutoff] || 64
112
+ min = args[:min] || 20 # what is this constant?
113
+ offset = args[:cutoff] || 64 # XXX depends on sequencing tech.
105
114
 
106
115
  _sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
107
116
 
@@ -116,6 +125,8 @@ module Lederhosen
116
125
  first = a
117
126
  end
118
127
  end
128
+
129
+ # XXX why is this rescue statement here?
119
130
  dna.sequence[start + 11, _end - start].gsub('.', 'N') rescue nil
120
131
  end
121
132
  end
@@ -3,7 +3,7 @@ module Lederhosen
3
3
  MAJOR = 1
4
4
  MINOR = 3
5
5
  CODENAME = 'Dirndl' # changes for minor versions
6
- PATCH = 8
6
+ PATCH = 10
7
7
 
8
8
  STRING = [MAJOR, MINOR, PATCH].join('.')
9
9
  end
data/lib/lederhosen.rb CHANGED
@@ -1,7 +1,9 @@
1
1
  require 'rubygems'
2
2
  require 'bundler'
3
- Bundler.require :default
4
3
  require 'set'
4
+ require 'dna'
5
+ require 'progressbar'
6
+ require 'thor'
5
7
 
6
8
  Dir.glob(File.join(File.dirname(__FILE__), 'lederhosen', '*.rb')).each { |f| require f }
7
9
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lederhosen
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.8
4
+ version: 1.3.10
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-12-07 00:00:00.000000000 Z
12
+ date: 2012-12-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: dna
@@ -176,7 +176,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
176
176
  version: '0'
177
177
  segments:
178
178
  - 0
179
- hash: 1569227273029021963
179
+ hash: 2999187278262571353
180
180
  required_rubygems_version: !ruby/object:Gem::Requirement
181
181
  none: false
182
182
  requirements: