lederhosen 1.3.8 → 1.3.10

This diff shows the differences between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in their respective public registries.
data/lederhosen.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "lederhosen"
8
- s.version = "1.3.8"
8
+ s.version = "1.3.10"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Austin G. Davis-Richardson"]
12
- s.date = "2012-12-07"
12
+ s.date = "2012-12-13"
13
13
  s.description = "Various tools for OTU clustering"
14
14
  s.email = "harekrishna@gmail.com"
15
15
  s.executables = ["lederhosen"]
@@ -1,3 +1,4 @@
1
+
1
2
  module Lederhosen
2
3
  class CLI
3
4
 
@@ -7,7 +8,7 @@ module Lederhosen
7
8
 
8
9
  # parse a line of usearch prefix
9
10
  # return a hash in the form:
10
- # { :taxonomy => '', :identity => 0.00, ... }
11
+ # { :taxonomy => '', :identity => '0.00', ... }
11
12
  # unless the line is not a "hit" in which case
12
13
  # the function returns nil
13
14
  def parse_usearch_line(str)
@@ -18,7 +19,7 @@ module Lederhosen
18
19
  str = str.split
19
20
 
20
21
  taxonomic_description = str[9]
21
- identity = str[3].to_f
22
+ identity = str[3]
22
23
 
23
24
  # parse taxonomic_description
24
25
  taxonomies = parse_taxonomy(taxonomic_description) rescue { 'original' => str[9] }
@@ -59,9 +60,13 @@ module Lederhosen
59
60
  end
60
61
  end
61
62
 
63
+ RE_TAXCOLLECTOR = /^\[0\](.*);\[1\](.*);\[2\](.*);\[3\](.*);\[4\](.*);\[5\](.*);\[6\](.*);\[7\](.*);\[8\](.*)/
64
+ RE_GREENGENES = /k__(.*); ?p__(.*); ?c__(.*); ?o__(.*); ?f__(.*); ?g__(.*); ?(.*);/
65
+ RE_QIIME = /k__(.*);p__(.*);c__(.*);o__(.*);f__(.*);g__(.*);s__(.*)/
66
+
62
67
  def parse_taxonomy_qiime(taxonomy)
63
68
  levels = %w{kingdom phylum class order family genus species}
64
- match_data = taxonomy.match(/k__(\w*);p__(\w*);c__(\w*);o__(\w*);f__(\w*);g__(\w*);s__(\w*)/)
69
+ match_data = taxonomy.match(RE_QIIME)
65
70
  match_data = match_data[1..-1]
66
71
 
67
72
  names = Hash.new
@@ -74,7 +79,7 @@ module Lederhosen
74
79
 
75
80
  def parse_taxonomy_greengenes(taxonomy)
76
81
  levels = %w{kingdom phylum class order family genus species}
77
- match_data = taxonomy.match(/k__(\w*); ?p__(\w*); ?c__(\w*); ?o__(\w*); ?f__(\w*); ?g__(\w*); ?(\w*);/)
82
+ match_data = taxonomy.match(RE_GREENGENES)
78
83
  match_data = match_data[1..-1]
79
84
 
80
85
  names = Hash.new
@@ -100,9 +105,8 @@ module Lederhosen
100
105
 
101
106
  match_data =
102
107
  begin
103
- taxonomy.match(/\[0\](.*);\[1\](.*);\[2\](.*);\[3\](.*);\[4\](.*);\[5\](.*);\[6\](.*);\[7\](.*);\[8\](.*)/)[1..-1]
108
+ taxonomy.match(RE_TAXCOLLECTOR)[1..-1]
104
109
  rescue
105
- $stderr.puts taxonomy.inspect
106
110
  return nil
107
111
  end
108
112
 
@@ -2,8 +2,6 @@
2
2
  # MAKE TABLES
3
3
  #
4
4
 
5
- require 'set'
6
-
7
5
  module Lederhosen
8
6
  class CLI
9
7
 
@@ -30,16 +28,19 @@ module Lederhosen
30
28
  fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom original}.include? level
31
29
  end
32
30
 
33
- level_sample_cluster_count = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = Hash.new { |h, k| h[k] = 0 } } }
34
-
35
- all_names = Hash.new { |h, k| h[k] = Set.new }
31
+ # there has to be a more efficient way of doing this
32
+ level_sample_cluster_count =
33
+ Hash.new do |h, k|
34
+ h[k] = Hash.new do |h, k|
35
+ h[k] = Hash.new(0)
36
+ end
37
+ end
36
38
 
37
39
  # create a progress bar with the total number of bytes of
38
40
  # the files we're slurping up
39
41
  pbar = ProgressBar.new "loading", input.size
40
42
 
41
43
  # Load cluster table
42
-
43
44
  input.each do |input_file|
44
45
  pbar.inc
45
46
  File.open(input_file) do |handle|
@@ -54,8 +55,8 @@ module Lederhosen
54
55
  dat[level] || 'unparsed_name'
55
56
  end
56
57
 
58
+ # the next two lines are what is slow
57
59
  level_sample_cluster_count[level][input_file][name] += 1
58
- all_names[level] << name
59
60
  end
60
61
 
61
62
  end
@@ -64,6 +65,14 @@ module Lederhosen
64
65
 
65
66
  pbar.finish
66
67
 
68
+ # get all taxonomic names at each level
69
+ all_names = Hash.new.tap do |bar|
70
+ level_sample_cluster_count.each_pair.map do |k, v|
71
+ names = v.each_value.map(&:keys).flatten.uniq
72
+ bar[k] = names
73
+ end
74
+ end
75
+
67
76
  # save to csv(s)
68
77
  levels.each do |level|
69
78
 
@@ -2,6 +2,8 @@
2
2
  # QUALITY TRIMMING
3
3
  #
4
4
 
5
+ # This should probably be broken into its own module or command-line utility.
6
+
5
7
  module Lederhosen
6
8
  class CLI
7
9
 
@@ -10,10 +12,12 @@ module Lederhosen
10
12
 
11
13
  method_option :reads_dir, :type => :string, :required => true
12
14
  method_option :out_dir, :type => :string, :required => true
15
+ method_option :pretrim, :type => :numeric, :default => 11
13
16
 
14
17
  def trim
15
18
  raw_reads = options[:reads_dir]
16
19
  out_dir = options[:out_dir]
20
+ pretrim = options[:pretrim]
17
21
 
18
22
  ohai "trimming #{File.dirname(raw_reads)} and saving to #{out_dir}"
19
23
 
@@ -92,16 +96,21 @@ module Lederhosen
92
96
  # returns just the sequence
93
97
  def trim_seq(dna, args={})
94
98
 
99
+
100
+ pretrim = args[:pretrim] || false
95
101
  # trim primers off of sequence
96
- # (THIS IS EXPERIMENT-SPECIFIC)
97
- dna.sequence = dna.sequence[11..-1]
98
- dna.quality = dna.quality[11..-1]
102
+ # XXX this is experiment-specific and needs to be made
103
+ # into a parameter
104
+ if pretrim
105
+ dna.sequence = dna.sequence[pretrim..-1]
106
+ dna.quality = dna.quality[pretrim..-1]
107
+ end
99
108
 
100
109
  # throw away any read with an ambiguous primer
101
110
  return nil if dna.sequence =~ /N/
102
111
 
103
- min = args[:min] || 20
104
- offset = args[:cutoff] || 64
112
+ min = args[:min] || 20 # what is this constant?
113
+ offset = args[:cutoff] || 64 # XXX depends on sequencing tech.
105
114
 
106
115
  _sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
107
116
 
@@ -116,6 +125,8 @@ module Lederhosen
116
125
  first = a
117
126
  end
118
127
  end
128
+
129
+ # XXX why is this rescue statement here?
119
130
  dna.sequence[start + 11, _end - start].gsub('.', 'N') rescue nil
120
131
  end
121
132
  end
@@ -3,7 +3,7 @@ module Lederhosen
3
3
  MAJOR = 1
4
4
  MINOR = 3
5
5
  CODENAME = 'Dirndl' # changes for minor versions
6
- PATCH = 8
6
+ PATCH = 10
7
7
 
8
8
  STRING = [MAJOR, MINOR, PATCH].join('.')
9
9
  end
data/lib/lederhosen.rb CHANGED
@@ -1,7 +1,9 @@
1
1
  require 'rubygems'
2
2
  require 'bundler'
3
- Bundler.require :default
4
3
  require 'set'
4
+ require 'dna'
5
+ require 'progressbar'
6
+ require 'thor'
5
7
 
6
8
  Dir.glob(File.join(File.dirname(__FILE__), 'lederhosen', '*.rb')).each { |f| require f }
7
9
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lederhosen
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.8
4
+ version: 1.3.10
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-12-07 00:00:00.000000000 Z
12
+ date: 2012-12-13 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: dna
@@ -176,7 +176,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
176
176
  version: '0'
177
177
  segments:
178
178
  - 0
179
- hash: 1569227273029021963
179
+ hash: 2999187278262571353
180
180
  required_rubygems_version: !ruby/object:Gem::Requirement
181
181
  none: false
182
182
  requirements: