lederhosen 1.3.8 → 1.3.10
Sign up to get free protection for your applications and to get access to all the features.
- data/lederhosen.gemspec +2 -2
- data/lib/lederhosen/no_tasks.rb +10 -6
- data/lib/lederhosen/tasks/otu_table.rb +16 -7
- data/lib/lederhosen/tasks/trim.rb +16 -5
- data/lib/lederhosen/version.rb +1 -1
- data/lib/lederhosen.rb +3 -1
- metadata +3 -3
data/lederhosen.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "lederhosen"
|
8
|
-
s.version = "1.3.8"
|
8
|
+
s.version = "1.3.10"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Austin G. Davis-Richardson"]
|
12
|
-
s.date = "2012-12-
|
12
|
+
s.date = "2012-12-13"
|
13
13
|
s.description = "Various tools for OTU clustering"
|
14
14
|
s.email = "harekrishna@gmail.com"
|
15
15
|
s.executables = ["lederhosen"]
|
data/lib/lederhosen/no_tasks.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
|
1
2
|
module Lederhosen
|
2
3
|
class CLI
|
3
4
|
|
@@ -7,7 +8,7 @@ module Lederhosen
|
|
7
8
|
|
8
9
|
# parse a line of usearch prefix
|
9
10
|
# return a hash in the form:
|
10
|
-
# { :taxonomy => '', :identity => 0.00, ... }
|
11
|
+
# { :taxonomy => '', :identity => '0.00', ... }
|
11
12
|
# unless the line is not a "hit" in which case
|
12
13
|
# the function returns nil
|
13
14
|
def parse_usearch_line(str)
|
@@ -18,7 +19,7 @@ module Lederhosen
|
|
18
19
|
str = str.split
|
19
20
|
|
20
21
|
taxonomic_description = str[9]
|
21
|
-
identity = str[3]
|
22
|
+
identity = str[3]
|
22
23
|
|
23
24
|
# parse taxonomic_description
|
24
25
|
taxonomies = parse_taxonomy(taxonomic_description) rescue { 'original' => str[9] }
|
@@ -59,9 +60,13 @@ module Lederhosen
|
|
59
60
|
end
|
60
61
|
end
|
61
62
|
|
63
|
+
RE_TAXCOLLECTOR = /^\[0\](.*);\[1\](.*);\[2\](.*);\[3\](.*);\[4\](.*);\[5\](.*);\[6\](.*);\[7\](.*);\[8\](.*)/
|
64
|
+
RE_GREENGENES = /k__(.*); ?p__(.*); ?c__(.*); ?o__(.*); ?f__(.*); ?g__(.*); ?(.*);/
|
65
|
+
RE_QIIME = /k__(.*);p__(.*);c__(.*);o__(.*);f__(.*);g__(.*);s__(.*)/
|
66
|
+
|
62
67
|
def parse_taxonomy_qiime(taxonomy)
|
63
68
|
levels = %w{kingdom phylum class order family genus species}
|
64
|
-
match_data = taxonomy.match(
|
69
|
+
match_data = taxonomy.match(RE_QIIME)
|
65
70
|
match_data = match_data[1..-1]
|
66
71
|
|
67
72
|
names = Hash.new
|
@@ -74,7 +79,7 @@ module Lederhosen
|
|
74
79
|
|
75
80
|
def parse_taxonomy_greengenes(taxonomy)
|
76
81
|
levels = %w{kingdom phylum class order family genus species}
|
77
|
-
match_data = taxonomy.match(
|
82
|
+
match_data = taxonomy.match(RE_GREENGENES)
|
78
83
|
match_data = match_data[1..-1]
|
79
84
|
|
80
85
|
names = Hash.new
|
@@ -100,9 +105,8 @@ module Lederhosen
|
|
100
105
|
|
101
106
|
match_data =
|
102
107
|
begin
|
103
|
-
taxonomy.match(
|
108
|
+
taxonomy.match(RE_TAXCOLLECTOR)[1..-1]
|
104
109
|
rescue
|
105
|
-
$stderr.puts taxonomy.inspect
|
106
110
|
return nil
|
107
111
|
end
|
108
112
|
|
data/lib/lederhosen/tasks/otu_table.rb
CHANGED
@@ -2,8 +2,6 @@
|
|
2
2
|
# MAKE TABLES
|
3
3
|
#
|
4
4
|
|
5
|
-
require 'set'
|
6
|
-
|
7
5
|
module Lederhosen
|
8
6
|
class CLI
|
9
7
|
|
@@ -30,16 +28,19 @@ module Lederhosen
|
|
30
28
|
fail "bad level: #{level}" unless %w{domain phylum class order family genus species kingdom original}.include? level
|
31
29
|
end
|
32
30
|
|
33
|
-
|
34
|
-
|
35
|
-
|
31
|
+
# there has to be a more efficient way of doing this
|
32
|
+
level_sample_cluster_count =
|
33
|
+
Hash.new do |h, k|
|
34
|
+
h[k] = Hash.new do |h, k|
|
35
|
+
h[k] = Hash.new(0)
|
36
|
+
end
|
37
|
+
end
|
36
38
|
|
37
39
|
# create a progress bar with the total number of bytes of
|
38
40
|
# the files we're slurping up
|
39
41
|
pbar = ProgressBar.new "loading", input.size
|
40
42
|
|
41
43
|
# Load cluster table
|
42
|
-
|
43
44
|
input.each do |input_file|
|
44
45
|
pbar.inc
|
45
46
|
File.open(input_file) do |handle|
|
@@ -54,8 +55,8 @@ module Lederhosen
|
|
54
55
|
dat[level] || 'unparsed_name'
|
55
56
|
end
|
56
57
|
|
58
|
+
# the next two lines are what is slow
|
57
59
|
level_sample_cluster_count[level][input_file][name] += 1
|
58
|
-
all_names[level] << name
|
59
60
|
end
|
60
61
|
|
61
62
|
end
|
@@ -64,6 +65,14 @@ module Lederhosen
|
|
64
65
|
|
65
66
|
pbar.finish
|
66
67
|
|
68
|
+
# get all taxonomic names at each level
|
69
|
+
all_names = Hash.new.tap do |bar|
|
70
|
+
level_sample_cluster_count.each_pair.map do |k, v|
|
71
|
+
names = v.each_value.map(&:keys).flatten.uniq
|
72
|
+
bar[k] = names
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
67
76
|
# save to csv(s)
|
68
77
|
levels.each do |level|
|
69
78
|
|
data/lib/lederhosen/tasks/trim.rb
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
# QUALITY TRIMMING
|
3
3
|
#
|
4
4
|
|
5
|
+
# This should probably be broken into its own module or command-line utility.
|
6
|
+
|
5
7
|
module Lederhosen
|
6
8
|
class CLI
|
7
9
|
|
@@ -10,10 +12,12 @@ module Lederhosen
|
|
10
12
|
|
11
13
|
method_option :reads_dir, :type => :string, :required => true
|
12
14
|
method_option :out_dir, :type => :string, :required => true
|
15
|
+
method_option :pretrim, :type => :numeric, :default => 11
|
13
16
|
|
14
17
|
def trim
|
15
18
|
raw_reads = options[:reads_dir]
|
16
19
|
out_dir = options[:out_dir]
|
20
|
+
pretrim = options[:pretrim]
|
17
21
|
|
18
22
|
ohai "trimming #{File.dirname(raw_reads)} and saving to #{out_dir}"
|
19
23
|
|
@@ -92,16 +96,21 @@ module Lederhosen
|
|
92
96
|
# returns just the sequence
|
93
97
|
def trim_seq(dna, args={})
|
94
98
|
|
99
|
+
|
100
|
+
pretrim = args[:pretrim] || false
|
95
101
|
# trim primers off of sequence
|
96
|
-
#
|
97
|
-
|
98
|
-
|
102
|
+
# XXX this is experiment-specific and needs to be made
|
103
|
+
# into a parameter
|
104
|
+
if pretrim
|
105
|
+
dna.sequence = dna.sequence[pretrim..-1]
|
106
|
+
dna.quality = dna.quality[pretrim..-1]
|
107
|
+
end
|
99
108
|
|
100
109
|
# throw away any read with an ambiguous primer
|
101
110
|
return nil if dna.sequence =~ /N/
|
102
111
|
|
103
|
-
min = args[:min] || 20
|
104
|
-
offset = args[:cutoff] || 64
|
112
|
+
min = args[:min] || 20 # what is this constant?
|
113
|
+
offset = args[:cutoff] || 64 # XXX depends on sequencing tech.
|
105
114
|
|
106
115
|
_sum, _max, first, last, start, _end = 0, 0, 0, 0, 0
|
107
116
|
|
@@ -116,6 +125,8 @@ module Lederhosen
|
|
116
125
|
first = a
|
117
126
|
end
|
118
127
|
end
|
128
|
+
|
129
|
+
# XXX why is this rescue statement here?
|
119
130
|
dna.sequence[start + 11, _end - start].gsub('.', 'N') rescue nil
|
120
131
|
end
|
121
132
|
end
|
data/lib/lederhosen/version.rb
CHANGED
data/lib/lederhosen.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lederhosen
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.8
|
4
|
+
version: 1.3.10
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-12-
|
12
|
+
date: 2012-12-13 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: dna
|
@@ -176,7 +176,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
176
176
|
version: '0'
|
177
177
|
segments:
|
178
178
|
- 0
|
179
|
-
hash:
|
179
|
+
hash: 2999187278262571353
|
180
180
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
181
181
|
none: false
|
182
182
|
requirements:
|