tree_clusters 0.8.2 → 0.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/exe/key_cols +32 -12
- data/lib/tree_clusters.rb +45 -31
- data/lib/tree_clusters/version.rb +1 -1
- data/test_files/key_cols/expected_output/small2/ARST.tree_clusters.annotated_tree.txt +1 -0
- data/test_files/key_cols/expected_output/small2/ARST.tree_clusters.clade_members.txt +5 -0
- data/test_files/key_cols/expected_output/small2/ARST.tree_clusters.key_cols.txt +5 -0
- metadata +4 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: be727e6384cf510290ddab3bf5aabd5075505e77
|
4
|
+
data.tar.gz: 69e94311ac35dcb26579edada3d6555b56048cd1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5aacaa9b69741eff4f23b62b7be230befc444847cbf869ed0a5394e1fd768b34fc150483a71f079aef6fd683a9c58fe170e8ffdfc6f229b677862ce7b851c875
|
7
|
+
data.tar.gz: c3e639c1bced1b5c480e99f0d9e19ecb65a27dc3cf09e81d429714951d47b7b66905b82b9ed20d14ac05d36be0067b9b3de51796447acfd91a814ce73ed07fdc
|
data/.gitignore
CHANGED
data/exe/key_cols
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
+
# TODO the block variables called "clade" aren't actually instances of the TreeClusters::Clade class. It's kind of confusing.
|
4
|
+
|
3
5
|
Signal.trap("PIPE", "EXIT")
|
4
6
|
|
5
7
|
require "tree_clusters"
|
@@ -54,6 +56,8 @@ The key_cols program
|
|
54
56
|
alignment file (fasta format), I will tell you key columns for all
|
55
57
|
clades/clusters that have them.
|
56
58
|
|
59
|
+
Version: v#{TreeClusters::VERSION}
|
60
|
+
|
57
61
|
Overview
|
58
62
|
--------
|
59
63
|
|
@@ -147,9 +151,10 @@ abort_unless opts[:clade_size_cutoff] >= 1,
|
|
147
151
|
|
148
152
|
FileUtils.mkdir_p opts[:outdir]
|
149
153
|
|
150
|
-
AbortIf.logger.info { "
|
154
|
+
AbortIf.logger.info { "Reading tree" }
|
155
|
+
tree = NewickTree.fromFile opts[:tree]
|
151
156
|
|
152
|
-
|
157
|
+
AbortIf.logger.info { "Reading alignment" }
|
153
158
|
leaf2attrs = TreeClusters.read_alignment opts[:aln]
|
154
159
|
|
155
160
|
members_fname =
|
@@ -169,22 +174,37 @@ key_cols_f =
|
|
169
174
|
annotated_tree_f =
|
170
175
|
File.open(annotated_tree_fname, "w")
|
171
176
|
|
172
|
-
key_col_sets
|
173
|
-
clade_sizes
|
174
|
-
|
177
|
+
key_col_sets = {}
|
178
|
+
clade_sizes = {}
|
179
|
+
# AbortIf.logger.info { "Counting clades" }
|
180
|
+
# clade_count = TreeClusters.all_clades(tree).count
|
175
181
|
change_these_names = Set.new
|
176
182
|
|
177
183
|
def all_clades_helper tree
|
178
|
-
TreeClusters.all_clades(tree).sort_by { |cl| cl.all_leaves.count }.reverse
|
184
|
+
# TreeClusters.all_clades(tree).sort_by { |cl| cl.all_leaves.count }.reverse
|
185
|
+
tree.clade_nodes.
|
186
|
+
reverse.
|
187
|
+
sort_by { |node| node.all_leaves.count }.
|
188
|
+
reverse
|
179
189
|
end
|
180
190
|
|
181
|
-
AbortIf.logger.info { "
|
191
|
+
AbortIf.logger.info { "Getting all clades" }
|
192
|
+
clades = all_clades_helper(tree)
|
193
|
+
clade_count = clades.count
|
194
|
+
|
195
|
+
AbortIf.logger.info { "Processing clades (The first few clades go reaaally slowly, but then I speed up!)" }
|
182
196
|
begin
|
183
|
-
|
184
|
-
|
197
|
+
clades.each_with_index do |clade, idx|
|
198
|
+
# It starts off really slowly, then speeds up a lot.
|
199
|
+
if (idx+1) < 100 ||
|
200
|
+
((idx+1) < 1000 && ((idx + 1) % 10).zero?) ||
|
201
|
+
((idx+1) < 10000 && ((idx + 1) % 100).zero?) ||
|
202
|
+
((idx+1) < 100000 && ((idx + 1) % 1000).zero?) ||
|
203
|
+
((idx + 1) % 10000).zero?
|
204
|
+
|
185
205
|
perc = ((idx + 1) / clade_count.to_f * 100).round 2
|
186
206
|
|
187
|
-
STDERR.printf("Processing clades: #{perc}
|
207
|
+
STDERR.printf("Processing clades: #{idx + 1} of #{clade_count} (#{perc}%%)\r")
|
188
208
|
end
|
189
209
|
|
190
210
|
clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
|
@@ -224,12 +244,12 @@ begin
|
|
224
244
|
|
225
245
|
AbortIf.logger.info { "Annotating tree" }
|
226
246
|
|
227
|
-
|
247
|
+
clades.each_with_index do |clade, idx|
|
228
248
|
clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
|
229
249
|
|
230
250
|
if change_these_names.include? clade_id
|
231
251
|
# This will change the node in the original NewickTree
|
232
|
-
clade.
|
252
|
+
clade.name = "'#{clade_id}'"
|
233
253
|
end
|
234
254
|
end
|
235
255
|
|
data/lib/tree_clusters.rb
CHANGED
@@ -27,12 +27,20 @@ class NewickTree
|
|
27
27
|
end
|
28
28
|
|
29
29
|
def unquoted_taxa
|
30
|
+
# @note self.taxa calls taxa method on the root of the tree.
|
30
31
|
self.taxa.map { |str| str.tr %q{"'}, "" }
|
31
32
|
end
|
32
33
|
end
|
33
34
|
|
35
|
+
class NewickNode
|
36
|
+
def all_leaves
|
37
|
+
self.leaves.map { |n| n.name.tr %q{"'}, "" }
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
34
41
|
# Top level namespace of the Gem.
|
35
42
|
module TreeClusters
|
43
|
+
PROJ_ROOT = File.join __dir__, ".."
|
36
44
|
|
37
45
|
# Given an ary of strings, find the most common string in the ary.
|
38
46
|
#
|
@@ -48,18 +56,24 @@ module TreeClusters
|
|
48
56
|
# @note Each string is upcase'd before frequencies are calculated.
|
49
57
|
def consensus bases
|
50
58
|
bases.
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
59
|
+
map(&:upcase).
|
60
|
+
group_by(&:itself).
|
61
|
+
sort_by { |_, bases| bases.count }.
|
62
|
+
reverse.
|
63
|
+
first.
|
64
|
+
first
|
57
65
|
end
|
58
66
|
|
59
67
|
def read_alignment aln_fname
|
60
68
|
leaf2attrs = TreeClusters::Attrs.new
|
61
|
-
aln_len
|
69
|
+
aln_len = nil
|
70
|
+
seq_num = 0
|
62
71
|
ParseFasta::SeqFile.open(aln_fname).each_record do |rec|
|
72
|
+
seq_num += 1
|
73
|
+
if ((seq_num + 1) % 1000).zero?
|
74
|
+
STDERR.printf("Reading alignment: #{seq_num + 1}\r")
|
75
|
+
end
|
76
|
+
|
63
77
|
leaf2attrs[rec.id] = { aln: rec.seq.chars }
|
64
78
|
|
65
79
|
aln_len ||= rec.seq.length
|
@@ -73,13 +87,13 @@ module TreeClusters
|
|
73
87
|
|
74
88
|
def low_ent_cols leaves, leaf2attrs, entropy_cutoff
|
75
89
|
low_ent_cols = []
|
76
|
-
alns
|
77
|
-
aln_cols
|
90
|
+
alns = leaf2attrs.attrs leaves, :aln
|
91
|
+
aln_cols = alns.transpose
|
78
92
|
|
79
93
|
aln_cols.each_with_index do |aln_col, aln_col_idx|
|
80
|
-
has_gaps
|
94
|
+
has_gaps = aln_col.any? { |aa| aa == "-" }
|
81
95
|
low_entropy =
|
82
|
-
|
96
|
+
Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
|
83
97
|
|
84
98
|
if !has_gaps && low_entropy
|
85
99
|
low_ent_cols << (aln_col_idx + 1)
|
@@ -92,11 +106,11 @@ module TreeClusters
|
|
92
106
|
# Like low_ent_cols method but also returns the bases at the positions.
|
93
107
|
def low_ent_cols_with_bases leaves, leaf2attrs, entropy_cutoff
|
94
108
|
low_ent_cols = []
|
95
|
-
alns
|
96
|
-
aln_cols
|
109
|
+
alns = leaf2attrs.attrs leaves, :aln
|
110
|
+
aln_cols = alns.transpose
|
97
111
|
|
98
112
|
aln_cols.each_with_index do |aln_col, aln_col_idx|
|
99
|
-
has_gaps
|
113
|
+
has_gaps = aln_col.any? { |aa| aa == "-" }
|
100
114
|
low_entropy =
|
101
115
|
Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
|
102
116
|
|
@@ -130,9 +144,9 @@ module TreeClusters
|
|
130
144
|
if !(tree_ids == mapping_ids && mapping_ids == aln_ids)
|
131
145
|
AbortIf::logger.error { "Seq IDs did not match in all input files" }
|
132
146
|
|
133
|
-
tree_ids
|
147
|
+
tree_ids = tree_ids.to_a.sort
|
134
148
|
mapping_ids = mapping_ids.to_a.sort
|
135
|
-
aln_ids
|
149
|
+
aln_ids = aln_ids.to_a.sort
|
136
150
|
|
137
151
|
AbortIf::logger.debug { ["tree_ids", tree_ids].join "\t" }
|
138
152
|
AbortIf::logger.debug { ["mapping_ids", mapping_ids].join "\t" }
|
@@ -152,7 +166,7 @@ module TreeClusters
|
|
152
166
|
# @yieldparam clade [Clade] a clade of the tree
|
153
167
|
#
|
154
168
|
# @return [Enumerator<Clade>] enumerator of Clade objects
|
155
|
-
def all_clades tree, metadata=nil
|
169
|
+
def all_clades tree, metadata = nil
|
156
170
|
return enum_for(:all_clades, tree, metadata) unless block_given?
|
157
171
|
|
158
172
|
tree.clade_nodes.reverse.each do |node|
|
@@ -164,12 +178,12 @@ module TreeClusters
|
|
164
178
|
snazzy_clades = {}
|
165
179
|
|
166
180
|
clades = self.
|
167
|
-
|
168
|
-
|
169
|
-
|
181
|
+
all_clades(tree, metadata).
|
182
|
+
sort_by { |clade| clade.all_leaves.count }.
|
183
|
+
reverse
|
170
184
|
|
171
185
|
metadata.each do |md_cat, leaf2mdtag|
|
172
|
-
already_checked
|
186
|
+
already_checked = Set.new
|
173
187
|
single_tag_clades = {}
|
174
188
|
|
175
189
|
clades.each do |clade|
|
@@ -177,8 +191,8 @@ module TreeClusters
|
|
177
191
|
"A clade cannot also be a leaf"
|
178
192
|
|
179
193
|
unless clade.all_leaves.all? do |leaf|
|
180
|
-
|
181
|
-
|
194
|
+
already_checked.include? leaf
|
195
|
+
end
|
182
196
|
md_tags = clade.all_leaves.map do |leaf|
|
183
197
|
assert leaf2mdtag.has_key?(leaf),
|
184
198
|
"leaf #{leaf} is missing from leaf2mdtag ht"
|
@@ -224,9 +238,9 @@ module TreeClusters
|
|
224
238
|
snazzy_info = {}
|
225
239
|
|
226
240
|
clades = self.
|
227
|
-
|
228
|
-
|
229
|
-
|
241
|
+
all_clades(tree, metadata).
|
242
|
+
sort_by { |clade| clade.all_leaves.count }.
|
243
|
+
reverse
|
230
244
|
|
231
245
|
# Non snazzy clades have a value of nil, so set all to nil and the
|
232
246
|
# snazzy ones will be overwritten.
|
@@ -235,7 +249,7 @@ module TreeClusters
|
|
235
249
|
end
|
236
250
|
|
237
251
|
metadata.each do |md_cat, leaf2mdtag|
|
238
|
-
already_checked
|
252
|
+
already_checked = Set.new
|
239
253
|
single_tag_clades = {}
|
240
254
|
|
241
255
|
clades.each do |clade|
|
@@ -243,8 +257,8 @@ module TreeClusters
|
|
243
257
|
"A clade cannot also be a leaf"
|
244
258
|
|
245
259
|
unless clade.all_leaves.all? do |leaf|
|
246
|
-
|
247
|
-
|
260
|
+
already_checked.include? leaf
|
261
|
+
end
|
248
262
|
md_tags = clade.all_leaves.map do |leaf|
|
249
263
|
assert leaf2mdtag.has_key?(leaf),
|
250
264
|
"leaf #{leaf} is missing from leaf2mdtag ht"
|
@@ -289,7 +303,7 @@ module TreeClusters
|
|
289
303
|
|
290
304
|
def read_mapping_file fname
|
291
305
|
md_cat_names = nil
|
292
|
-
metadata
|
306
|
+
metadata = TreeClusters::Attrs.new
|
293
307
|
|
294
308
|
File.open(fname, "rt").each_line.with_index do |line, idx|
|
295
309
|
leaf_name, *metadata_vals = line.chomp.split "\t"
|
@@ -342,6 +356,6 @@ module TreeClusters
|
|
342
356
|
end
|
343
357
|
end
|
344
358
|
|
345
|
-
|
359
|
+
[attr_names, attrs]
|
346
360
|
end
|
347
361
|
end
|
@@ -0,0 +1 @@
|
|
1
|
+
(('a-1','a-2')'clade_3___cluster_A',(("b-1",b-2)'clade_4___cluster_B1',(bb-1,(bbb-1,bbb-2)'clade_5___cluster_B3')'clade_2___cluster_B2')'clade_1___cluster_B')cluster_C;
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tree_clusters
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.
|
4
|
+
version: 0.8.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
@@ -201,6 +201,9 @@ files:
|
|
201
201
|
- lib/tree_clusters/clade.rb
|
202
202
|
- lib/tree_clusters/version.rb
|
203
203
|
- test_files/bad.aln
|
204
|
+
- test_files/key_cols/expected_output/small2/ARST.tree_clusters.annotated_tree.txt
|
205
|
+
- test_files/key_cols/expected_output/small2/ARST.tree_clusters.clade_members.txt
|
206
|
+
- test_files/key_cols/expected_output/small2/ARST.tree_clusters.key_cols.txt
|
204
207
|
- test_files/non_bifurcating.aln
|
205
208
|
- test_files/non_bifurcating.tre
|
206
209
|
- test_files/small.aln
|