tree_clusters 0.8.2 → 0.8.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/exe/key_cols +32 -12
- data/lib/tree_clusters.rb +45 -31
- data/lib/tree_clusters/version.rb +1 -1
- data/test_files/key_cols/expected_output/small2/ARST.tree_clusters.annotated_tree.txt +1 -0
- data/test_files/key_cols/expected_output/small2/ARST.tree_clusters.clade_members.txt +5 -0
- data/test_files/key_cols/expected_output/small2/ARST.tree_clusters.key_cols.txt +5 -0
- metadata +4 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: be727e6384cf510290ddab3bf5aabd5075505e77
|
4
|
+
data.tar.gz: 69e94311ac35dcb26579edada3d6555b56048cd1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5aacaa9b69741eff4f23b62b7be230befc444847cbf869ed0a5394e1fd768b34fc150483a71f079aef6fd683a9c58fe170e8ffdfc6f229b677862ce7b851c875
|
7
|
+
data.tar.gz: c3e639c1bced1b5c480e99f0d9e19ecb65a27dc3cf09e81d429714951d47b7b66905b82b9ed20d14ac05d36be0067b9b3de51796447acfd91a814ce73ed07fdc
|
data/.gitignore
CHANGED
data/exe/key_cols
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
# TODO the block variables called "clade" aren't actually instances of the TreeClusters::Clade class. It's kind of confusing.
|
4
|
+
|
3
5
|
Signal.trap("PIPE", "EXIT")
|
4
6
|
|
5
7
|
require "tree_clusters"
|
@@ -54,6 +56,8 @@ The key_cols program
|
|
54
56
|
alignment file (fasta format), I will tell you key columns for all
|
55
57
|
clades/clusters that have them.
|
56
58
|
|
59
|
+
Version: v#{TreeClusters::VERSION}
|
60
|
+
|
57
61
|
Overview
|
58
62
|
--------
|
59
63
|
|
@@ -147,9 +151,10 @@ abort_unless opts[:clade_size_cutoff] >= 1,
|
|
147
151
|
|
148
152
|
FileUtils.mkdir_p opts[:outdir]
|
149
153
|
|
150
|
-
AbortIf.logger.info { "
|
154
|
+
AbortIf.logger.info { "Reading tree" }
|
155
|
+
tree = NewickTree.fromFile opts[:tree]
|
151
156
|
|
152
|
-
|
157
|
+
AbortIf.logger.info { "Reading alignment" }
|
153
158
|
leaf2attrs = TreeClusters.read_alignment opts[:aln]
|
154
159
|
|
155
160
|
members_fname =
|
@@ -169,22 +174,37 @@ key_cols_f =
|
|
169
174
|
annotated_tree_f =
|
170
175
|
File.open(annotated_tree_fname, "w")
|
171
176
|
|
172
|
-
key_col_sets
|
173
|
-
clade_sizes
|
174
|
-
|
177
|
+
key_col_sets = {}
|
178
|
+
clade_sizes = {}
|
179
|
+
# AbortIf.logger.info { "Counting clades" }
|
180
|
+
# clade_count = TreeClusters.all_clades(tree).count
|
175
181
|
change_these_names = Set.new
|
176
182
|
|
177
183
|
def all_clades_helper tree
|
178
|
-
TreeClusters.all_clades(tree).sort_by { |cl| cl.all_leaves.count }.reverse
|
184
|
+
# TreeClusters.all_clades(tree).sort_by { |cl| cl.all_leaves.count }.reverse
|
185
|
+
tree.clade_nodes.
|
186
|
+
reverse.
|
187
|
+
sort_by { |node| node.all_leaves.count }.
|
188
|
+
reverse
|
179
189
|
end
|
180
190
|
|
181
|
-
AbortIf.logger.info { "
|
191
|
+
AbortIf.logger.info { "Getting all clades" }
|
192
|
+
clades = all_clades_helper(tree)
|
193
|
+
clade_count = clades.count
|
194
|
+
|
195
|
+
AbortIf.logger.info { "Processing clades (The first few clades go reaaally slowly, but then I speed up!)" }
|
182
196
|
begin
|
183
|
-
|
184
|
-
|
197
|
+
clades.each_with_index do |clade, idx|
|
198
|
+
# It starts off really slowly, then speeds up a lot.
|
199
|
+
if (idx+1) < 100 ||
|
200
|
+
((idx+1) < 1000 && ((idx + 1) % 10).zero?) ||
|
201
|
+
((idx+1) < 10000 && ((idx + 1) % 100).zero?) ||
|
202
|
+
((idx+1) < 100000 && ((idx + 1) % 1000).zero?) ||
|
203
|
+
((idx + 1) % 10000).zero?
|
204
|
+
|
185
205
|
perc = ((idx + 1) / clade_count.to_f * 100).round 2
|
186
206
|
|
187
|
-
STDERR.printf("Processing clades: #{perc}
|
207
|
+
STDERR.printf("Processing clades: #{idx + 1} of #{clade_count} (#{perc}%%)\r")
|
188
208
|
end
|
189
209
|
|
190
210
|
clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
|
@@ -224,12 +244,12 @@ begin
|
|
224
244
|
|
225
245
|
AbortIf.logger.info { "Annotating tree" }
|
226
246
|
|
227
|
-
|
247
|
+
clades.each_with_index do |clade, idx|
|
228
248
|
clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
|
229
249
|
|
230
250
|
if change_these_names.include? clade_id
|
231
251
|
# This will change the node in the original NewickTree
|
232
|
-
clade.
|
252
|
+
clade.name = "'#{clade_id}'"
|
233
253
|
end
|
234
254
|
end
|
235
255
|
|
data/lib/tree_clusters.rb
CHANGED
@@ -27,12 +27,20 @@ class NewickTree
|
|
27
27
|
end
|
28
28
|
|
29
29
|
def unquoted_taxa
|
30
|
+
# @note self.taxa calls taxa method on the root of the tree.
|
30
31
|
self.taxa.map { |str| str.tr %q{"'}, "" }
|
31
32
|
end
|
32
33
|
end
|
33
34
|
|
35
|
+
class NewickNode
|
36
|
+
def all_leaves
|
37
|
+
self.leaves.map { |n| n.name.tr %q{"'}, "" }
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
34
41
|
# Top level namespace of the Gem.
|
35
42
|
module TreeClusters
|
43
|
+
PROJ_ROOT = File.join __dir__, ".."
|
36
44
|
|
37
45
|
# Given an ary of strings, find the most common string in the ary.
|
38
46
|
#
|
@@ -48,18 +56,24 @@ module TreeClusters
|
|
48
56
|
# @note Each string is upcase'd before frequencies are calculated.
|
49
57
|
def consensus bases
|
50
58
|
bases.
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
59
|
+
map(&:upcase).
|
60
|
+
group_by(&:itself).
|
61
|
+
sort_by { |_, bases| bases.count }.
|
62
|
+
reverse.
|
63
|
+
first.
|
64
|
+
first
|
57
65
|
end
|
58
66
|
|
59
67
|
def read_alignment aln_fname
|
60
68
|
leaf2attrs = TreeClusters::Attrs.new
|
61
|
-
aln_len
|
69
|
+
aln_len = nil
|
70
|
+
seq_num = 0
|
62
71
|
ParseFasta::SeqFile.open(aln_fname).each_record do |rec|
|
72
|
+
seq_num += 1
|
73
|
+
if ((seq_num + 1) % 1000).zero?
|
74
|
+
STDERR.printf("Reading alignment: #{seq_num + 1}\r")
|
75
|
+
end
|
76
|
+
|
63
77
|
leaf2attrs[rec.id] = { aln: rec.seq.chars }
|
64
78
|
|
65
79
|
aln_len ||= rec.seq.length
|
@@ -73,13 +87,13 @@ module TreeClusters
|
|
73
87
|
|
74
88
|
def low_ent_cols leaves, leaf2attrs, entropy_cutoff
|
75
89
|
low_ent_cols = []
|
76
|
-
alns
|
77
|
-
aln_cols
|
90
|
+
alns = leaf2attrs.attrs leaves, :aln
|
91
|
+
aln_cols = alns.transpose
|
78
92
|
|
79
93
|
aln_cols.each_with_index do |aln_col, aln_col_idx|
|
80
|
-
has_gaps
|
94
|
+
has_gaps = aln_col.any? { |aa| aa == "-" }
|
81
95
|
low_entropy =
|
82
|
-
|
96
|
+
Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
|
83
97
|
|
84
98
|
if !has_gaps && low_entropy
|
85
99
|
low_ent_cols << (aln_col_idx + 1)
|
@@ -92,11 +106,11 @@ module TreeClusters
|
|
92
106
|
# Like low_ent_cols method but also returns the bases at the positions.
|
93
107
|
def low_ent_cols_with_bases leaves, leaf2attrs, entropy_cutoff
|
94
108
|
low_ent_cols = []
|
95
|
-
alns
|
96
|
-
aln_cols
|
109
|
+
alns = leaf2attrs.attrs leaves, :aln
|
110
|
+
aln_cols = alns.transpose
|
97
111
|
|
98
112
|
aln_cols.each_with_index do |aln_col, aln_col_idx|
|
99
|
-
has_gaps
|
113
|
+
has_gaps = aln_col.any? { |aa| aa == "-" }
|
100
114
|
low_entropy =
|
101
115
|
Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
|
102
116
|
|
@@ -130,9 +144,9 @@ module TreeClusters
|
|
130
144
|
if !(tree_ids == mapping_ids && mapping_ids == aln_ids)
|
131
145
|
AbortIf::logger.error { "Seq IDs did not match in all input files" }
|
132
146
|
|
133
|
-
tree_ids
|
147
|
+
tree_ids = tree_ids.to_a.sort
|
134
148
|
mapping_ids = mapping_ids.to_a.sort
|
135
|
-
aln_ids
|
149
|
+
aln_ids = aln_ids.to_a.sort
|
136
150
|
|
137
151
|
AbortIf::logger.debug { ["tree_ids", tree_ids].join "\t" }
|
138
152
|
AbortIf::logger.debug { ["mapping_ids", mapping_ids].join "\t" }
|
@@ -152,7 +166,7 @@ module TreeClusters
|
|
152
166
|
# @yieldparam clade [Clade] a clade of the tree
|
153
167
|
#
|
154
168
|
# @return [Enumerator<Clade>] enumerator of Clade objects
|
155
|
-
def all_clades tree, metadata=nil
|
169
|
+
def all_clades tree, metadata = nil
|
156
170
|
return enum_for(:all_clades, tree, metadata) unless block_given?
|
157
171
|
|
158
172
|
tree.clade_nodes.reverse.each do |node|
|
@@ -164,12 +178,12 @@ module TreeClusters
|
|
164
178
|
snazzy_clades = {}
|
165
179
|
|
166
180
|
clades = self.
|
167
|
-
|
168
|
-
|
169
|
-
|
181
|
+
all_clades(tree, metadata).
|
182
|
+
sort_by { |clade| clade.all_leaves.count }.
|
183
|
+
reverse
|
170
184
|
|
171
185
|
metadata.each do |md_cat, leaf2mdtag|
|
172
|
-
already_checked
|
186
|
+
already_checked = Set.new
|
173
187
|
single_tag_clades = {}
|
174
188
|
|
175
189
|
clades.each do |clade|
|
@@ -177,8 +191,8 @@ module TreeClusters
|
|
177
191
|
"A clade cannot also be a leaf"
|
178
192
|
|
179
193
|
unless clade.all_leaves.all? do |leaf|
|
180
|
-
|
181
|
-
|
194
|
+
already_checked.include? leaf
|
195
|
+
end
|
182
196
|
md_tags = clade.all_leaves.map do |leaf|
|
183
197
|
assert leaf2mdtag.has_key?(leaf),
|
184
198
|
"leaf #{leaf} is missing from leaf2mdtag ht"
|
@@ -224,9 +238,9 @@ module TreeClusters
|
|
224
238
|
snazzy_info = {}
|
225
239
|
|
226
240
|
clades = self.
|
227
|
-
|
228
|
-
|
229
|
-
|
241
|
+
all_clades(tree, metadata).
|
242
|
+
sort_by { |clade| clade.all_leaves.count }.
|
243
|
+
reverse
|
230
244
|
|
231
245
|
# Non snazzy clades have a value of nil, so set all to nil and the
|
232
246
|
# snazzy ones will be overwritten.
|
@@ -235,7 +249,7 @@ module TreeClusters
|
|
235
249
|
end
|
236
250
|
|
237
251
|
metadata.each do |md_cat, leaf2mdtag|
|
238
|
-
already_checked
|
252
|
+
already_checked = Set.new
|
239
253
|
single_tag_clades = {}
|
240
254
|
|
241
255
|
clades.each do |clade|
|
@@ -243,8 +257,8 @@ module TreeClusters
|
|
243
257
|
"A clade cannot also be a leaf"
|
244
258
|
|
245
259
|
unless clade.all_leaves.all? do |leaf|
|
246
|
-
|
247
|
-
|
260
|
+
already_checked.include? leaf
|
261
|
+
end
|
248
262
|
md_tags = clade.all_leaves.map do |leaf|
|
249
263
|
assert leaf2mdtag.has_key?(leaf),
|
250
264
|
"leaf #{leaf} is missing from leaf2mdtag ht"
|
@@ -289,7 +303,7 @@ module TreeClusters
|
|
289
303
|
|
290
304
|
def read_mapping_file fname
|
291
305
|
md_cat_names = nil
|
292
|
-
metadata
|
306
|
+
metadata = TreeClusters::Attrs.new
|
293
307
|
|
294
308
|
File.open(fname, "rt").each_line.with_index do |line, idx|
|
295
309
|
leaf_name, *metadata_vals = line.chomp.split "\t"
|
@@ -342,6 +356,6 @@ module TreeClusters
|
|
342
356
|
end
|
343
357
|
end
|
344
358
|
|
345
|
-
|
359
|
+
[attr_names, attrs]
|
346
360
|
end
|
347
361
|
end
|
@@ -0,0 +1 @@
|
|
1
|
+
(('a-1','a-2')'clade_3___cluster_A',(("b-1",b-2)'clade_4___cluster_B1',(bb-1,(bbb-1,bbb-2)'clade_5___cluster_B3')'clade_2___cluster_B2')'clade_1___cluster_B')cluster_C;
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tree_clusters
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.
|
4
|
+
version: 0.8.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
@@ -201,6 +201,9 @@ files:
|
|
201
201
|
- lib/tree_clusters/clade.rb
|
202
202
|
- lib/tree_clusters/version.rb
|
203
203
|
- test_files/bad.aln
|
204
|
+
- test_files/key_cols/expected_output/small2/ARST.tree_clusters.annotated_tree.txt
|
205
|
+
- test_files/key_cols/expected_output/small2/ARST.tree_clusters.clade_members.txt
|
206
|
+
- test_files/key_cols/expected_output/small2/ARST.tree_clusters.key_cols.txt
|
204
207
|
- test_files/non_bifurcating.aln
|
205
208
|
- test_files/non_bifurcating.tre
|
206
209
|
- test_files/small.aln
|