lederhosen 1.3.8 → 1.3.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lederhosen.gemspec +2 -2
- data/lib/lederhosen/no_tasks.rb +10 -6
- data/lib/lederhosen/tasks/otu_table.rb +16 -7
- data/lib/lederhosen/tasks/trim.rb +16 -5
- data/lib/lederhosen/version.rb +1 -1
- data/lib/lederhosen.rb +3 -1
- metadata +3 -3
data/lederhosen.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "lederhosen"
|
8
|
-
s.version = "1.3.8"
|
8
|
+
s.version = "1.3.10"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Austin G. Davis-Richardson"]
|
12
|
-
s.date = "2012-12-
|
12
|
+
s.date = "2012-12-13"
|
13
13
|
s.description = "Various tools for OTU clustering"
|
14
14
|
s.email = "harekrishna@gmail.com"
|
15
15
|
s.executables = ["lederhosen"]
|
data/lib/lederhosen/no_tasks.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
|
1
2
|
module Lederhosen
|
2
3
|
class CLI
|
3
4
|
|
@@ -7,7 +8,7 @@ module Lederhosen
|
|
7
8
|
|
8
9
|
# parse a line of usearch prefix
|
9
10
|
# return a hash in the form:
|
10
|
-
# { :taxonomy => '', :identity => 0.00, ... }
|
11
|
+
# { :taxonomy => '', :identity => '0.00', ... }
|
11
12
|
# unless the line is not a "hit" in which case
|
12
13
|
# the function returns nil
|
13
14
|
def parse_usearch_line(str)
|
@@ -18,7 +19,7 @@ module Lederhosen
|
|
18
19
|
str = str.split
|
19
20
|
|
20
21
|
taxonomic_description = str[9]
|
21
|
-
identity = str[3]
|
22
|
+
identity = str[3]
|
22
23
|
|
23
24
|
# parse taxonomic_description
|
24
25
|
taxonomies = parse_taxonomy(taxonomic_description) rescue { 'original' => str[9] }
|
@@ -59,9 +60,13 @@ module Lederhosen
|
|
59
60
|
end
|
60
61
|
end
|
61
62
|
|
63
|
+
RE_TAXCOLLECTOR = /^\[0\](.*);\[1\](.*);\[2\](.*);\[3\](.*);\[4\](.*);\[5\](.*);\[6\](.*);\[7\](.*);\[8\](.*)/
|
64
|
+
RE_GREENGENES = /k__(.*); ?p__(.*); ?c__(.*); ?o__(.*); ?f__(.*); ?g__(.*); ?(.*);/
|
65
|
+
RE_QIIME = /k__(.*);p__(.*);c__(.*);o__(.*);f__(.*);g__(.*);s__(.*)/
|
66
|
+
|
62
67
|
def parse_taxonomy_qiime(taxonomy)
|
63
68
|
levels = %w{kingdom phylum class order family genus species}
|
64
|
-
match_data = taxonomy.match(
|
69
|
+
match_data = taxonomy.match(RE_QIIME)
|
65
70
|
match_data = match_data[1..-1]
|
66
71
|
|
67
72
|
names = Hash.new
|
@@ -74,7 +79,7 @@ module Lederhosen
|
|
74
79
|
|
75
80
|
def parse_taxonomy_greengenes(taxonomy)
|
76
81
|
levels = %w{kingdom phylum class order family genus species}
|
77
|
-
match_data = taxonomy.match(
|
82
|
+
match_data = taxonomy.match(RE_GREENGENES)
|
78
83
|
match_data = match_data[1..-1]
|
79
84
|
|
80
85
|
names = Hash.new
|
@@ -100,9 +105,8 @@ module Lederhosen
|
|
100
105
|
|
101
106
|
match_data =
|
102
107
|
begin
|
103
|
-
taxonomy.match(
|
108
|
+
taxonomy.match(RE_TAXCOLLECTOR)[1..-1]
|
104
109
|
rescue
|
105
|
-
$stderr.puts taxonomy.inspect
|
106
110
|
return nil
|
107
111
|
end
|
108
112
|
|
data/lib/lederhosen/tasks/otu_table.rb
CHANGED
@@ -2,8 +2,6 @@
|
|
2
2
|
# MAKE TABLES
|
3
3
|
#
|
4
4
|
|
5
|
-
require 'set'
|
6
|
-
|
7
5
|
module Lederhosen
|
8
6
|
class CLI
|
9
7
|
|
@@ -30,16 +28,19 @@ module Lederhosen
|
|
30
28
|
fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom original}.include? level
|
31
29
|
end
|
32
30
|
|
33
|
-
|
34
|
-
|
35
|
-
|
31
|
+
# there has to be a more efficient way of doing this
|
32
|
+
level_sample_cluster_count =
|
33
|
+
Hash.new do |h, k|
|
34
|
+
h[k] = Hash.new do |h, k|
|
35
|
+
h[k] = Hash.new(0)
|
36
|
+
end
|
37
|
+
end
|
36
38
|
|
37
39
|
# create a progress bar with the total number of bytes of
|
38
40
|
# the files we're slurping up
|
39
41
|
pbar = ProgressBar.new "loading", input.size
|
40
42
|
|
41
43
|
# Load cluster table
|
42
|
-
|
43
44
|
input.each do |input_file|
|
44
45
|
pbar.inc
|
45
46
|
File.open(input_file) do |handle|
|
@@ -54,8 +55,8 @@ module Lederhosen
|
|
54
55
|
dat[level] || 'unparsed_name'
|
55
56
|
end
|
56
57
|
|
58
|
+
# the next two lines are what is slow
|
57
59
|
level_sample_cluster_count[level][input_file][name] += 1
|
58
|
-
all_names[level] << name
|
59
60
|
end
|
60
61
|
|
61
62
|
end
|
@@ -64,6 +65,14 @@ module Lederhosen
|
|
64
65
|
|
65
66
|
pbar.finish
|
66
67
|
|
68
|
+
# get all taxonomic names at each level
|
69
|
+
all_names = Hash.new.tap do |bar|
|
70
|
+
level_sample_cluster_count.each_pair.map do |k, v|
|
71
|
+
names = v.each_value.map(&:keys).flatten.uniq
|
72
|
+
bar[k] = names
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
67
76
|
# save to csv(s)
|
68
77
|
levels.each do |level|
|
69
78
|
|
data/lib/lederhosen/tasks/trim.rb
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
# QUALITY TRIMMING
|
3
3
|
#
|
4
4
|
|
5
|
+
# This should probably be broken into its own module or command-line utility.
|
6
|
+
|
5
7
|
module Lederhosen
|
6
8
|
class CLI
|
7
9
|
|
@@ -10,10 +12,12 @@ module Lederhosen
|
|
10
12
|
|
11
13
|
method_option :reads_dir, :type => :string, :required => true
|
12
14
|
method_option :out_dir, :type => :string, :required => true
|
15
|
+
method_option :pretrim, :type => :numeric, :default => 11
|
13
16
|
|
14
17
|
def trim
|
15
18
|
raw_reads = options[:reads_dir]
|
16
19
|
out_dir = options[:out_dir]
|
20
|
+
pretrim = options[:pretrim]
|
17
21
|
|
18
22
|
ohai "trimming #{File.dirname(raw_reads)} and saving to #{out_dir}"
|
19
23
|
|
@@ -92,16 +96,21 @@ module Lederhosen
|
|
92
96
|
# returns just the sequence
|
93
97
|
def trim_seq(dna, args={})
|
94
98
|
|
99
|
+
|
100
|
+
pretrim = args[:pretrim] || false
|
95
101
|
# trim primers off of sequence
|
96
|
-
#
|
97
|
-
|
98
|
-
|
102
|
+
# XXX this is experiment-specific and needs to be made
|
103
|
+
# into a parameter
|
104
|
+
if pretrim
|
105
|
+
dna.sequence = dna.sequence[pretrim..-1]
|
106
|
+
dna.quality = dna.quality[pretrim..-1]
|
107
|
+
end
|
99
108
|
|
100
109
|
# throw away any read with an ambiguous primer
|
101
110
|
return nil if dna.sequence =~ /N/
|
102
111
|
|
103
|
-
min = args[:min] || 20
|
104
|
-
offset = args[:cutoff] || 64
|
112
|
+
min = args[:min] || 20 # what is this constant?
|
113
|
+
offset = args[:cutoff] || 64 # XXX depends on sequencing tech.
|
105
114
|
|
106
115
|
_sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
|
107
116
|
|
@@ -116,6 +125,8 @@ module Lederhosen
|
|
116
125
|
first = a
|
117
126
|
end
|
118
127
|
end
|
128
|
+
|
129
|
+
# XXX why is this rescue statement here?
|
119
130
|
dna.sequence[start + 11, _end - start].gsub('.', 'N') rescue nil
|
120
131
|
end
|
121
132
|
end
|
data/lib/lederhosen/version.rb
CHANGED
data/lib/lederhosen.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lederhosen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.8
|
4
|
+
version: 1.3.10
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-12-
|
12
|
+
date: 2012-12-13 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: dna
|
@@ -176,7 +176,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
176
176
|
version: '0'
|
177
177
|
segments:
|
178
178
|
- 0
|
179
|
-
hash:
|
179
|
+
hash: 2999187278262571353
|
180
180
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
181
181
|
none: false
|
182
182
|
requirements:
|