tree_clusters 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/exe/clade_attrs +229 -0
- data/lib/tree_clusters/attr_array.rb +18 -0
- data/lib/tree_clusters/attrs.rb +47 -0
- data/lib/tree_clusters/clade.rb +128 -0
- data/lib/tree_clusters/version.rb +1 -1
- data/lib/tree_clusters.rb +70 -190
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e6f0523128e1d9efede01f9af494d99e7e51a48c
|
4
|
+
data.tar.gz: 58bac03495f585ad2ea755abaac5ceefdb90e2d3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 73a82096be4f5be8199be28bf3e12b0ac63c08b9da1cfea1fded6cd4da7c448ffc7cdc8a285c681b88493b39fa32fcace02be0fff23190a314050d107db4b2f5
|
7
|
+
data.tar.gz: a5b00bec9a4f567efc098e664e6479e691f96e761a5c57ac5e4f198159c9d1cfd320dbb2033a1a458597460f621815e03ee6ecef351f31db4597f4a0a7aac838
|
data/exe/clade_attrs
ADDED
@@ -0,0 +1,229 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
Signal.trap("PIPE", "EXIT")
|
4
|
+
|
5
|
+
require "tree_clusters"
|
6
|
+
require "trollop"
|
7
|
+
require "parse_fasta"
|
8
|
+
require "shannon"
|
9
|
+
require "fileutils"
|
10
|
+
|
11
|
+
TreeClusters.extend TreeClusters
|
12
|
+
|
13
|
+
# Writes one tab-separated record to +outf+: the clade ID, the
# attribute category, then every member of +attr_set+.
#
# The attribute values are kept as a nested array on purpose:
# Array#join flattens it with the same separator, and an empty set
# still yields a trailing tab, so every record has at least three
# columns.
#
# @param outf [#puts] the open output stream
# @param clade_id [String] identifier of the clade
# @param attr_cat [Object] attribute category name
# @param attr_set [#to_a] attribute values (nil is treated as empty)
def puts_info(outf, clade_id, attr_cat, attr_set)
  record = [clade_id, attr_cat, attr_set.to_a]
  outf.puts record.join("\t")
end
|
16
|
+
|
17
|
+
opts = Trollop.options do
|
18
|
+
version TreeClusters::VERSION
|
19
|
+
|
20
|
+
banner <<-EOS
|
21
|
+
|
22
|
+
|
23
|
+
Checking IDs
|
24
|
+
------------
|
25
|
+
|
26
|
+
IDs for the sequences must match between the three input files.
|
27
|
+
|
28
|
+
The tree file is allowed to have quoted taxa names, but the mapping
|
29
|
+
file and alignment file are not.
|
30
|
+
|
31
|
+
If your alignment file has spaces in the name, the ID part of the
|
32
|
+
header (i.e., the part up until the space) must match with the
|
33
|
+
sequence IDs in the tree and the mapping file.
|
34
|
+
|
35
|
+
Example: This would be okay.
|
36
|
+
|
37
|
+
tree file:
|
38
|
+
('genome_A', 'genome_B');
|
39
|
+
|
40
|
+
aln file:
|
41
|
+
>genome_A apple pie
|
42
|
+
AAAAA
|
43
|
+
>genome_B brown sugar
|
44
|
+
AATTA
|
45
|
+
|
46
|
+
mapping file:
|
47
|
+
name coolness
|
48
|
+
genome_A cool
|
49
|
+
genome_B notcool
|
50
|
+
|
51
|
+
|
52
|
+
Subtracting parent nodes
|
53
|
+
------------------------
|
54
|
+
|
55
|
+
If a clade's parent would be the root of the tree, no columns will
|
56
|
+
be subtracted when removing the parent columns as it would be the
|
57
|
+
entire alignment.
|
58
|
+
|
59
|
+
Options:
|
60
|
+
EOS
|
61
|
+
|
62
|
+
opt(:tree,
|
63
|
+
"Newick tree file",
|
64
|
+
type: :string)
|
65
|
+
opt(:mapping,
|
66
|
+
"Mapping file",
|
67
|
+
type: :string)
|
68
|
+
opt(:attrs,
|
69
|
+
"Attributes file",
|
70
|
+
type: :string)
|
71
|
+
|
72
|
+
opt(:clade_size_cutoff,
|
73
|
+
"Consider only clades with at least this many leaves",
|
74
|
+
default: 1)
|
75
|
+
|
76
|
+
opt(:outdir,
|
77
|
+
"Output directory",
|
78
|
+
default: ".")
|
79
|
+
opt(:base,
|
80
|
+
"Basename for output",
|
81
|
+
default: "clade_attrs")
|
82
|
+
end
|
83
|
+
|
84
|
+
abort_if opts[:tree].nil?,
|
85
|
+
"--tree is a required arg"
|
86
|
+
abort_if opts[:mapping].nil?,
|
87
|
+
"--mapping is a required arg"
|
88
|
+
abort_if opts[:attrs].nil?,
|
89
|
+
"--attrs is a required arg"
|
90
|
+
|
91
|
+
abort_unless_file_exists opts[:tree]
|
92
|
+
abort_unless_file_exists opts[:mapping]
|
93
|
+
abort_unless_file_exists opts[:attrs]
|
94
|
+
|
95
|
+
# TODO check IDs when attrs is not a fasta file
|
96
|
+
# TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:attrs]
|
97
|
+
|
98
|
+
abort_unless opts[:clade_size_cutoff] >= 1,
|
99
|
+
"--clade-size-cutoff must be >= 1"
|
100
|
+
|
101
|
+
FileUtils.mkdir_p opts[:outdir]
|
102
|
+
|
103
|
+
tree = NewickTree.fromFile opts[:tree]
|
104
|
+
metadata = TreeClusters.read_mapping_file opts[:mapping]
|
105
|
+
snazzy_info = TreeClusters.snazzy_info tree, metadata
|
106
|
+
attr_names, leaf2attrs = TreeClusters.read_attrs_file opts[:attrs]
|
107
|
+
|
108
|
+
ext_base = "clade_attrs"
|
109
|
+
|
110
|
+
clades_fname =
|
111
|
+
File.join opts[:outdir],
|
112
|
+
"#{opts[:base]}.#{ext_base}.txt"
|
113
|
+
members_fname =
|
114
|
+
File.join opts[:outdir],
|
115
|
+
"#{opts[:base]}.#{ext_base}_clade_members.txt"
|
116
|
+
attrs_fname =
|
117
|
+
File.join opts[:outdir],
|
118
|
+
"#{opts[:base]}.#{ext_base}_attrs_union.txt"
|
119
|
+
attrs_intersection_fname =
|
120
|
+
File.join opts[:outdir],
|
121
|
+
"#{opts[:base]}.#{ext_base}_attrs_intersection.txt"
|
122
|
+
attrs_minus_parent_attrs_fname =
|
123
|
+
File.join opts[:outdir],
|
124
|
+
"#{opts[:base]}.#{ext_base}_attrs_minus_parent_attrs.txt"
|
125
|
+
attrs_minus_sibling_attrs_fname =
|
126
|
+
File.join opts[:outdir],
|
127
|
+
"#{opts[:base]}.#{ext_base}_attrs_minus_sibling_attrs.txt"
|
128
|
+
attrs_minus_other_attrs_fname =
|
129
|
+
File.join opts[:outdir],
|
130
|
+
"#{opts[:base]}.#{ext_base}_attrs_minus_other_attrs.txt"
|
131
|
+
|
132
|
+
|
133
|
+
# Every output file that has been opened so far.  Files are registered
# here as they are opened so that the ensure clause below closes all of
# them (including the intersection file), even when a later open or
# the write loop fails part-way through.
out_files = []
open_out = lambda do |fname|
  File.open(fname, "w").tap { |f| out_files << f }
end

# AttrArray#union may give nil when none of the requested leaves have
# attributes; treat that as the empty Set so set subtraction is safe.
union_or_empty = lambda { |attr_array| attr_array.union || Set.new }

begin
  info_f = open_out.call clades_fname
  clade_members_f = open_out.call members_fname
  attrs_f = open_out.call attrs_fname
  attrs_intersection_f = open_out.call attrs_intersection_fname
  attrs_minus_parent_attrs_f = open_out.call attrs_minus_parent_attrs_fname
  attrs_minus_sibling_attrs_f = open_out.call attrs_minus_sibling_attrs_fname
  attrs_minus_other_attrs_f = open_out.call attrs_minus_other_attrs_fname

  # info is { metadata_category => metadata_tag , ... } for snazzy
  # clades and nil for every other clade.
  snazzy_info.each_with_index do |(clade, info), idx|
    assert clade.all_leaves.all? { |leaf| leaf2attrs.has_key? leaf },
           "Not all leaves are present in the leaf2attrs hash table"

    clade_id = "clade_#{idx+1}___#{clade.name}"

    # One line per clade: ID, number of snazzy categories, then the
    # category|tag pairs (or the literal "not_snazzy").
    if info
      info_f.puts [clade_id,
                   info.count,
                   info.map { |pair| pair.join("|") }].join "\t"
    else
      info_f.puts [clade_id,
                   0,
                   "not_snazzy"].join "\t"
    end

    clade_members_f.puts [clade_id,
                          clade.all_leaves.count,
                          clade.all_leaves].join "\t"

    attr_names.each do |attr_category|
      attrs_all_leaves =
        leaf2attrs.attrs clade.all_leaves, attr_category

      clade_union = union_or_empty.call attrs_all_leaves

      sibling_union = union_or_empty.call(
        leaf2attrs.attrs(clade.all_sibling_leaves, attr_category)
      )
      parent_union = union_or_empty.call(
        leaf2attrs.attrs(clade.parent_leaves, attr_category)
      )
      other_union = union_or_empty.call(
        leaf2attrs.attrs(clade.other_leaves, attr_category)
      )

      puts_info attrs_f,
                clade_id,
                attr_category,
                clade_union

      puts_info attrs_intersection_f,
                clade_id,
                attr_category,
                attrs_all_leaves.intersection

      puts_info attrs_minus_parent_attrs_f,
                clade_id,
                attr_category,
                clade_union - parent_union

      puts_info attrs_minus_sibling_attrs_f,
                clade_id,
                attr_category,
                clade_union - sibling_union

      puts_info attrs_minus_other_attrs_f,
                clade_id,
                attr_category,
                clade_union - other_union
    end
  end
ensure
  out_files.each(&:close)
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require "set"

module TreeClusters
  # Provides convenience methods for working with Arrays of Sets.
  #
  # @note This subclasses Array, so the zero-argument #union and
  #   #intersection defined here take the place of any Array methods
  #   with the same names.
  class AttrArray < Object::Array
    # Takes the union of all sets in the AttrArray.
    #
    # @return [Set] every item that occurs in at least one member Set;
    #   an empty Set when the AttrArray has no members
    def union
      # reduce with no seed returns nil for an empty receiver, which
      # would break callers that go on to do Set arithmetic.
      return Object::Set.new if empty?

      reduce(&:union)
    end

    # Takes the intersection of all sets in the AttrArray.
    #
    # @return [Set] the items common to every member Set; an empty Set
    #   when the AttrArray has no members
    def intersection
      return Object::Set.new if empty?

      reduce(&:intersection)
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module TreeClusters
  # A Hash table for genome/leaf/taxa attributes, laid out as
  # { leaf_name => { attr_name => value } }.
  class Attrs < Hash

    # Returns an AttrArray with the value of the given attribute for
    # each of the given genomes.
    #
    # @note If a genome is in leaves, but is not in the hash table, NO
    #   error will be raised.  Rather that genome will be skipped.
    #   This is for cases in which not all genomes have attributes.
    #   Values that are nil are dropped as well.
    #
    # @param leaves [Enumerable<String>] names of the leaves for which
    #   you need attributes (an Array or a Set)
    # @param attr [Symbol] the attribute you are interested in eg,
    #   :genes
    #
    # @return [AttrArray<Set>] an AttrArray of Sets of attributes
    #
    # @raise [AbortIf::Exit] if the leaf is present but doesn't have
    #   the requested attr
    def attrs leaves, attr
      known_leaves = leaves.select { |leaf| key? leaf }

      vals = known_leaves.map do |leaf|
        abort_unless self[leaf].key?(attr),
                     "Missing attr #{attr.inspect} for leaf '#{leaf}'"

        self[leaf][attr]
      end

      TreeClusters::AttrArray.new vals.compact
    end

    # Stores val under attr for the given leaf, creating the leaf's
    # inner hash the first time the leaf is seen.
    #
    # @param leaf [String] name of the leaf
    # @param attr [Symbol] name of the attribute
    # @param val [Object] value to store
    #
    # @return [Object] val when the leaf already existed, otherwise
    #   the newly created inner hash
    def add leaf, attr, val
      if key? leaf
        self[leaf][attr] = val
      else
        self[leaf] = { attr => val }
      end
    end
  end
end
|
@@ -0,0 +1,128 @@
|
|
1
|
+
module TreeClusters
  # Represents a clade in a NewickTree: the leaves under one node plus
  # the leaves of its children, siblings and parent and, optionally,
  # the metadata tags carried by its leaves.
  class Clade
    # The readers that #==, #eql? and #hash are built from.
    # non_parent_leaves is not part of the comparison.
    COMPARED_FIELDS = %i[
      name
      all_leaves
      left_leaves
      right_leaves
      all_sibling_leaves
      each_sibling_leaf_set
      parent_leaves
      other_leaves
      single_tag_info
      all_tags
    ].freeze

    attr_accessor :name,
                  :all_leaves,
                  :left_leaves,
                  :right_leaves,
                  :all_sibling_leaves,
                  :each_sibling_leaf_set,
                  :parent_leaves,
                  :non_parent_leaves,
                  :other_leaves,
                  :single_tag_info,
                  :all_tags

    # @note If a node name is quoted, then those quotes are removed
    #   first.
    #
    # @param node [NewickNode] a NewickNode from a NewickTree
    # @param tree [NewickTree] a NewickTree
    # @param metadata [Hash, nil] { category => { leaf name => tag } };
    #   when nil, all_tags and single_tag_info stay nil
    def initialize node, tree, metadata=nil
      tree_taxa = tree.unquoted_taxa

      @name = unquote node.name
      @all_leaves = descendant_leaves node

      # left/right leaves are only set for nodes with exactly two
      # children; otherwise they stay nil.
      kids = node.children
      if kids.count == 2
        @left_leaves, @right_leaves =
          kids.map { |kid| descendant_leaves kid }
      end

      @each_sibling_leaf_set =
        node.siblings.map { |sibling| descendant_leaves sibling }

      @all_sibling_leaves = @each_sibling_leaf_set.flatten.uniq

      parent = node.parent
      assert parent,
             "Node #{node.name} has no parent. Is it the root?"
      @parent_leaves = descendant_leaves parent

      @other_leaves =
        Object::Set.new(tree_taxa) - Object::Set.new(all_leaves)

      @non_parent_leaves =
        Object::Set.new(tree_taxa) - Object::Set.new(parent_leaves)

      if metadata
        @metadata = metadata
        @all_tags = get_all_tags
        @single_tag_info = get_single_tag_info
      else
        @single_tag_info = nil
      end
    end

    # Compares two Clades field by field (see COMPARED_FIELDS).
    #
    # @param clade [Object] the object to compare with
    #
    # @return [Boolean] true when clade exposes every compared field
    #   and each one is == to this clade's; false otherwise, including
    #   for objects that are not clade-like
    def == clade
      COMPARED_FIELDS.all? do |field|
        clade.respond_to?(field) &&
          public_send(field) == clade.public_send(field)
      end
    end

    # Alias for ==
    def eql? clade
      self == clade
    end

    # Hash code built from the same fields as #==, so that Clades that
    # are eql? also land on the same key when used in a Hash or Set.
    #
    # @note The fields are writable; changing one while the Clade is a
    #   Hash key changes its hash code.
    #
    # @return [Integer]
    def hash
      COMPARED_FIELDS.map { |field| public_send field }.hash
    end

    private

    # Maps each metadata category to its tag when every leaf of the
    # clade shares one tag, and to nil otherwise.
    def get_single_tag_info
      @all_tags.map do |md_cat, tags|
        [md_cat, tags.count == 1 ? tags.first : nil]
      end.to_h
    end

    # Maps each metadata category to the Set of tags found on this
    # clade's leaves.
    def get_all_tags
      # name2tag has leaf names => metadata tag and is an Attrs
      @metadata.map do |md_cat, name2tag|
        tags = all_leaves.map do |leaf|
          assert name2tag.has_key?(leaf),
                 "leaf #{leaf} is not present in name2tag ht for " +
                 "md_cat #{md_cat}"

          name2tag[leaf]
        end

        [md_cat, Object::Set.new(tags)]
      end.to_h
    end

    # Unquoted names of the leaves at or below node.  A leaf node
    # yields just its own name.
    def descendant_leaves node
      return [unquote(node.name)] if node.leaf?

      node.
        descendants.
        flatten.
        uniq.
        select(&:leaf?).
        map { |descendant| unquote descendant.name }
    end

    # Strips every single and double quote character from str.
    def unquote str
      str.delete %q{"'}
    end
  end
end
|
data/lib/tree_clusters.rb
CHANGED
@@ -3,6 +3,9 @@ require "Newick"
|
|
3
3
|
require "set"
|
4
4
|
require "parse_fasta"
|
5
5
|
require "shannon"
|
6
|
+
require "tree_clusters/attrs"
|
7
|
+
require "tree_clusters/attr_array"
|
8
|
+
require "tree_clusters/clade"
|
6
9
|
require "tree_clusters/version"
|
7
10
|
|
8
11
|
include AbortIf
|
@@ -198,6 +201,73 @@ module TreeClusters
|
|
198
201
|
snazzy_clades
|
199
202
|
end
|
200
203
|
|
204
|
+
# Works out, for every clade of the tree, the metadata categories for
# which that clade is "snazzy": all of its leaves share one tag for
# the category and no leaf outside the clade carries that tag.
#
# Clades are visited from most to fewest leaves.  Within a category, a
# clade whose leaves were all already covered by a single-tag clade is
# skipped.
#
# @param tree [NewickTree] the tree; only unquoted_taxa is read here,
#   and it is assumed to return the same taxa on every call
# @param metadata [Hash] { category => { leaf name => tag } }
#
# @return [Hash] { clade => { category => tag } } for snazzy clades
#   and { clade => nil } for all other clades, keyed in visiting order
def snazzy_info tree, metadata
  result = {}

  clades =
    all_clades(tree, metadata).
    sort_by { |clade| clade.all_leaves.count }.
    reverse

  # The taxa list does not depend on the clade or the category, so it
  # is fetched once rather than inside the loops.
  all_taxa = tree.unquoted_taxa

  # Non snazzy clades have a value of nil, so set all to nil and the
  # snazzy ones will be overwritten.
  clades.each { |clade| result[clade] = nil }

  metadata.each do |md_cat, leaf2mdtag|
    already_checked = Set.new
    single_tag_clades = {}

    clades.each do |clade|
      leaves = clade.all_leaves

      assert leaves.count > 1,
             "A clade cannot also be a leaf"

      next if leaves.all? { |leaf| already_checked.include? leaf }

      md_tags = leaves.map do |leaf|
        assert leaf2mdtag.has_key?(leaf),
               "leaf #{leaf} is missing from leaf2mdtag ht"

        leaf2mdtag[leaf]
      end

      # Only clades that are mono-phyletic w.r.t. this metadata
      # category go any further.
      next unless md_tags.uniq.count == 1

      already_checked.merge leaves

      assert !single_tag_clades.has_key?(clade),
             "clade #{clade.name} is repeated in single_tag_clades for #{md_cat}"

      single_tag_clades[clade] = md_tags.first
    end

    single_tag_clades.each do |clade, md_tag|
      non_clade_leaves = all_taxa - clade.all_leaves

      # The tag must not show up anywhere outside the clade.
      next if non_clade_leaves.any? { |leaf| leaf2mdtag[leaf] == md_tag }

      (result[clade] ||= {})[md_cat] = md_tag
    end
  end

  result
end
|
270
|
+
|
201
271
|
def read_mapping_file fname
|
202
272
|
md_cat_names = nil
|
203
273
|
metadata = TreeClusters::Attrs.new
|
@@ -255,194 +325,4 @@ module TreeClusters
|
|
255
325
|
|
256
326
|
[attr_names, attrs]
|
257
327
|
end
|
258
|
-
|
259
|
-
# A Hash table for genome/leaf/taxa attributes
|
260
|
-
class Attrs < Hash
|
261
|
-
|
262
|
-
# Returns the an AttrArray of Sets for the given genomes and
|
263
|
-
# attribute.
|
264
|
-
#
|
265
|
-
# @note If a genome is in the leaves array, but is not in the hash
|
266
|
-
# table, NO error will be raised. Rather that genome will be
|
267
|
-
# skipped. This is for cases in which not all genomes have
|
268
|
-
# attributes.
|
269
|
-
#
|
270
|
-
# @param leaves [Array<String>] names of the leaves for which you
|
271
|
-
# need attributes
|
272
|
-
# @param attr [Symbol] the attribute you are interested in eg,
|
273
|
-
# :genes
|
274
|
-
#
|
275
|
-
# @return [AttrArray<Set>] an AttrArray of Sets of
|
276
|
-
# attributes
|
277
|
-
#
|
278
|
-
# @raise [AbortIf::Exit] if they leaf is present but doesn't have
|
279
|
-
# the requested attr
|
280
|
-
def attrs leaves, attr
|
281
|
-
ary = leaves.map do |leaf|
|
282
|
-
|
283
|
-
if self.has_key? leaf
|
284
|
-
abort_unless self[leaf].has_key?(attr),
|
285
|
-
"Missing attr #{attr.inspect} for leaf '#{leaf}'"
|
286
|
-
|
287
|
-
self[leaf][attr]
|
288
|
-
else
|
289
|
-
nil
|
290
|
-
end
|
291
|
-
end.compact
|
292
|
-
|
293
|
-
TreeClusters::AttrArray.new ary
|
294
|
-
end
|
295
|
-
|
296
|
-
def add leaf, attr, val
|
297
|
-
if self.has_key? leaf
|
298
|
-
self[leaf][attr] = val
|
299
|
-
else
|
300
|
-
self[leaf] = { attr => val }
|
301
|
-
end
|
302
|
-
end
|
303
|
-
end
|
304
|
-
|
305
|
-
# Provides convenience methods for working with Arrays of Sets
|
306
|
-
class AttrArray < Object::Array
|
307
|
-
# Takes the union of all sets in the AttrArray
|
308
|
-
#
|
309
|
-
# @return [Set]
|
310
|
-
def union
|
311
|
-
self.reduce(&:union)
|
312
|
-
end
|
313
|
-
|
314
|
-
# Takes the intersection of all sets in the AttrArray
|
315
|
-
#
|
316
|
-
# @return [Set]
|
317
|
-
def intersection
|
318
|
-
self.reduce(&:intersection)
|
319
|
-
end
|
320
|
-
end
|
321
|
-
|
322
|
-
# Represents a clade in a NewickTree
|
323
|
-
class Clade
|
324
|
-
attr_accessor :name,
|
325
|
-
:all_leaves,
|
326
|
-
:left_leaves,
|
327
|
-
:right_leaves,
|
328
|
-
:all_sibling_leaves,
|
329
|
-
:each_sibling_leaf_set,
|
330
|
-
:parent_leaves,
|
331
|
-
:non_parent_leaves,
|
332
|
-
:other_leaves,
|
333
|
-
:single_tag_info,
|
334
|
-
:all_tags
|
335
|
-
|
336
|
-
# @note If a node name is quoted, then those quotes are removed
|
337
|
-
# first.
|
338
|
-
#
|
339
|
-
# @param node [NewickNode] a NewickNode from a NewickTree
|
340
|
-
# @param tree [NewickTree] a NewickTree
|
341
|
-
def initialize node, tree, metadata=nil
|
342
|
-
tree_taxa = tree.unquoted_taxa
|
343
|
-
|
344
|
-
@name = unquote node.name
|
345
|
-
@all_leaves = descendant_leaves node
|
346
|
-
|
347
|
-
if (children = node.children).count == 2
|
348
|
-
lchild, rchild = node.children
|
349
|
-
|
350
|
-
@left_leaves = descendant_leaves lchild
|
351
|
-
|
352
|
-
@right_leaves = descendant_leaves rchild
|
353
|
-
end
|
354
|
-
|
355
|
-
siblings = node.siblings
|
356
|
-
# assert siblings.count == 1,
|
357
|
-
# "Node #{node.name} has more than one sibling."
|
358
|
-
|
359
|
-
@each_sibling_leaf_set = siblings.
|
360
|
-
map { |node| descendant_leaves node }
|
361
|
-
|
362
|
-
@all_sibling_leaves = @each_sibling_leaf_set.flatten.uniq
|
363
|
-
|
364
|
-
parent = node.parent
|
365
|
-
assert parent,
|
366
|
-
"Noge #{node.name} has no parent. Is it the root?"
|
367
|
-
@parent_leaves = descendant_leaves parent
|
368
|
-
|
369
|
-
@other_leaves =
|
370
|
-
Object::Set.new(tree_taxa) - Object::Set.new(all_leaves)
|
371
|
-
|
372
|
-
@non_parent_leaves =
|
373
|
-
Object::Set.new(tree_taxa) - Object::Set.new(parent_leaves)
|
374
|
-
|
375
|
-
if metadata
|
376
|
-
@metadata = metadata
|
377
|
-
@all_tags ||= get_all_tags
|
378
|
-
@single_tag_info ||= get_single_tag_info
|
379
|
-
else
|
380
|
-
@single_tag_info = nil
|
381
|
-
end
|
382
|
-
end
|
383
|
-
|
384
|
-
# Compares two Clades field by field.
|
385
|
-
#
|
386
|
-
# If all instance variables are == than the two clades are == as
|
387
|
-
# well.
|
388
|
-
def == clade
|
389
|
-
(
|
390
|
-
self.name == clade.name &&
|
391
|
-
self.all_leaves == clade.all_leaves &&
|
392
|
-
self.left_leaves == clade.left_leaves &&
|
393
|
-
self.right_leaves == clade.right_leaves &&
|
394
|
-
self.all_sibling_leaves == clade.all_sibling_leaves &&
|
395
|
-
self.each_sibling_leaf_set == clade.each_sibling_leaf_set &&
|
396
|
-
self.parent_leaves == clade.parent_leaves &&
|
397
|
-
self.other_leaves == clade.other_leaves &&
|
398
|
-
self.single_tag_info == clade.single_tag_info &&
|
399
|
-
self.all_tags == clade.all_tags
|
400
|
-
)
|
401
|
-
end
|
402
|
-
|
403
|
-
# Alias for ==
|
404
|
-
def eql? clade
|
405
|
-
self == clade
|
406
|
-
end
|
407
|
-
|
408
|
-
private
|
409
|
-
|
410
|
-
def get_single_tag_info
|
411
|
-
@all_tags.map do |md_cat, set|
|
412
|
-
[md_cat, set.count == 1 ? set.to_a.first : nil]
|
413
|
-
end.to_h
|
414
|
-
end
|
415
|
-
|
416
|
-
def get_all_tags
|
417
|
-
# name2tag has leaf names => metadata tag and is an Attrs
|
418
|
-
@metadata.map do |md_cat, name2tag|
|
419
|
-
tag_info = self.all_leaves.map do |leaf|
|
420
|
-
assert name2tag.has_key?(leaf),
|
421
|
-
"leaf #{leaf} is not present in name2tag ht for " +
|
422
|
-
"md_cat #{md_cat}"
|
423
|
-
|
424
|
-
name2tag[leaf]
|
425
|
-
end
|
426
|
-
|
427
|
-
[md_cat, Set.new(tag_info)]
|
428
|
-
end.to_h
|
429
|
-
end
|
430
|
-
|
431
|
-
def descendant_leaves node
|
432
|
-
if node.leaf?
|
433
|
-
[unquote(node.name)]
|
434
|
-
else
|
435
|
-
node.
|
436
|
-
descendants.
|
437
|
-
flatten.
|
438
|
-
uniq.
|
439
|
-
select { |node| node.leaf? }.
|
440
|
-
map { |node| unquote(node.name) }
|
441
|
-
end
|
442
|
-
end
|
443
|
-
|
444
|
-
def unquote str
|
445
|
-
str.tr %q{"'}, ""
|
446
|
-
end
|
447
|
-
end
|
448
328
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tree_clusters
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-11-
|
11
|
+
date: 2017-11-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -172,6 +172,7 @@ description: Snazzy code for working with each cluster in a tree.
|
|
172
172
|
email:
|
173
173
|
- moorer@udel.edu
|
174
174
|
executables:
|
175
|
+
- clade_attrs
|
175
176
|
- snazzy_clades
|
176
177
|
- snazzy_clades_attrs
|
177
178
|
- snazzy_clades_key_cols
|
@@ -188,10 +189,14 @@ files:
|
|
188
189
|
- Rakefile
|
189
190
|
- bin/console
|
190
191
|
- bin/setup
|
192
|
+
- exe/clade_attrs
|
191
193
|
- exe/snazzy_clades
|
192
194
|
- exe/snazzy_clades_attrs
|
193
195
|
- exe/snazzy_clades_key_cols
|
194
196
|
- lib/tree_clusters.rb
|
197
|
+
- lib/tree_clusters/attr_array.rb
|
198
|
+
- lib/tree_clusters/attrs.rb
|
199
|
+
- lib/tree_clusters/clade.rb
|
195
200
|
- lib/tree_clusters/version.rb
|
196
201
|
- test_files/bad.aln
|
197
202
|
- test_files/non_bifurcating.aln
|