tree_clusters 0.6.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/exe/clade_attrs +229 -0
- data/lib/tree_clusters/attr_array.rb +18 -0
- data/lib/tree_clusters/attrs.rb +47 -0
- data/lib/tree_clusters/clade.rb +128 -0
- data/lib/tree_clusters/version.rb +1 -1
- data/lib/tree_clusters.rb +70 -190
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e6f0523128e1d9efede01f9af494d99e7e51a48c
|
4
|
+
data.tar.gz: 58bac03495f585ad2ea755abaac5ceefdb90e2d3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 73a82096be4f5be8199be28bf3e12b0ac63c08b9da1cfea1fded6cd4da7c448ffc7cdc8a285c681b88493b39fa32fcace02be0fff23190a314050d107db4b2f5
|
7
|
+
data.tar.gz: a5b00bec9a4f567efc098e664e6479e691f96e761a5c57ac5e4f198159c9d1cfd320dbb2033a1a458597460f621815e03ee6ecef351f31db4597f4a0a7aac838
|
data/exe/clade_attrs
ADDED
@@ -0,0 +1,229 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
Signal.trap("PIPE", "EXIT")
|
4
|
+
|
5
|
+
require "tree_clusters"
|
6
|
+
require "trollop"
|
7
|
+
require "parse_fasta"
|
8
|
+
require "shannon"
|
9
|
+
require "fileutils"
|
10
|
+
|
11
|
+
TreeClusters.extend TreeClusters
|
12
|
+
|
13
|
+
# Writes one tab-separated record to +outf+: the clade ID, the
# attribute category, and then the members of +attr_set+.
#
# @param outf [#puts] the stream to write to
# @param clade_id [String] identifier of the clade
# @param attr_cat [Object] name of the attribute category
# @param attr_set [#to_a] the attributes to list (nil is written as
#   an empty trailing field, since nil.to_a is [])
def puts_info outf, clade_id, attr_cat, attr_set
  record = [clade_id, attr_cat, attr_set.to_a]

  # Array#join descends into the nested array, so each attribute ends
  # up as its own tab-separated field.
  outf.puts record.join("\t")
end
|
16
|
+
|
17
|
+
# Parse the command line options.  The banner doubles as the help text.
opts = Trollop.options do
  version TreeClusters::VERSION

  banner <<-EOS


Checking IDs
------------

IDs for the sequences must match between the three input files.

The tree file is allowed to have quoted taxa names, but the mapping
file and alignment file are not.

If your alignment file has spaces in the name, the ID part of the
header (i.e., the part up until the space) must match with the
sequence IDs in the tree and the mapping file.

Example: This would be okay.

tree file:
('genome_A', 'genome_B');

aln file:
>genome_A apple pie
AAAAA
>genome_B brown sugar
AATTA

mapping file:
name coolness
genome_A cool
genome_B notcool


Subtracting parent nodes
------------------------

If a clade's parent would be the root of the tree, no columns will
be subtracted when removing the parent columns as it would be the
entire alignment.

Options:
  EOS

  # Input files (all three are required; checked below).
  opt :tree, "Newick tree file", type: :string
  opt :mapping, "Mapping file", type: :string
  opt :attrs, "Attributes file", type: :string

  opt :clade_size_cutoff,
      "Consider only clades with at least this many leaves",
      default: 1

  # Where the output goes and what the files are called.
  opt :outdir, "Output directory", default: "."
  opt :base, "Basename for output", default: "clade_attrs"
end

input_opts = [:tree, :mapping, :attrs]

# First make sure every input option was given at all...
input_opts.each do |name|
  abort_if opts[name].nil?, "--#{name} is a required arg"
end

# ...and then that each one names an existing file.
input_opts.each do |name|
  abort_unless_file_exists opts[name]
end

# TODO check IDs when attrs is not a fasta file
# TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:attrs]

abort_unless opts[:clade_size_cutoff] >= 1,
             "--clade-size-cutoff must be >= 1"

FileUtils.mkdir_p opts[:outdir]
|
102
|
+
|
103
|
+
# Read the inputs.
tree = NewickTree.fromFile opts[:tree]
metadata = TreeClusters.read_mapping_file opts[:mapping]
snazzy_info = TreeClusters.snazzy_info tree, metadata
attr_names, leaf2attrs = TreeClusters.read_attrs_file opts[:attrs]

ext_base = "clade_attrs"

# Every output file is registered here as soon as it is opened, so the
# ensure clause below closes all of them -- including the intersection
# file -- even if a later open or the main loop raises.
out_files = []

# Opens "<outdir>/<base>.clade_attrs<suffix>.txt" for writing.
open_out = lambda do |suffix|
  fname = File.join opts[:outdir],
                    "#{opts[:base]}.#{ext_base}#{suffix}.txt"

  out_files << File.open(fname, "w")
  out_files.last
end

begin
  info_f                      = open_out.call ""
  clade_members_f             = open_out.call "_clade_members"
  attrs_f                     = open_out.call "_attrs_union"
  attrs_intersection_f        = open_out.call "_attrs_intersection"
  attrs_minus_parent_attrs_f  = open_out.call "_attrs_minus_parent_attrs"
  attrs_minus_sibling_attrs_f = open_out.call "_attrs_minus_sibling_attrs"
  attrs_minus_other_attrs_f   = open_out.call "_attrs_minus_other_attrs"

  # info is { metadata_category => metadata_tag , ... }, or nil for
  # clades that are not snazzy.
  snazzy_info.each_with_index do |(clade, info), idx|
    assert clade.all_leaves.all? { |leaf| leaf2attrs.has_key? leaf },
           "Not all leaves are present in the leaf2attrs hash table"

    clade_id = "clade_#{idx+1}___#{clade.name}"

    # Clade summary: ID, number of snazzy categories, then either the
    # "category|tag" pairs or the literal not_snazzy.
    info_fields =
      if info.nil?
        [clade_id, 0, "not_snazzy"]
      else
        [clade_id, info.count, info.map { |pair| pair.join("|") }]
      end
    info_f.puts info_fields.join("\t")

    clade_members_f.puts [clade_id,
                          clade.all_leaves.count,
                          clade.all_leaves].join("\t")

    attr_names.each do |attr_category|
      attrs_all_leaves =
        leaf2attrs.attrs clade.all_leaves, attr_category
      attrs_all_sibling_leaves =
        leaf2attrs.attrs clade.all_sibling_leaves, attr_category
      attrs_parent_leaves =
        leaf2attrs.attrs clade.parent_leaves, attr_category
      attrs_other_leaves =
        leaf2attrs.attrs clade.other_leaves, attr_category

      # All set arithmetic happens before anything is written for this
      # category, so a failure here leaves no partial records behind.
      attrs_union = attrs_all_leaves.union

      attrs_all_minus_parent  = attrs_union - attrs_parent_leaves.union
      attrs_all_minus_sibling = attrs_union - attrs_all_sibling_leaves.union
      attrs_all_minus_other   = attrs_union - attrs_other_leaves.union

      puts_info attrs_f,
                clade_id,
                attr_category,
                attrs_union

      puts_info attrs_intersection_f,
                clade_id,
                attr_category,
                attrs_all_leaves.intersection

      puts_info attrs_minus_parent_attrs_f,
                clade_id,
                attr_category,
                attrs_all_minus_parent

      puts_info attrs_minus_sibling_attrs_f,
                clade_id,
                attr_category,
                attrs_all_minus_sibling

      puts_info attrs_minus_other_attrs_f,
                clade_id,
                attr_category,
                attrs_all_minus_other
    end
  end
ensure
  out_files.each(&:close)
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require "set"

module TreeClusters
  # Provides convenience methods for working with Arrays of Sets
  class AttrArray < Object::Array
    # Takes the union of all sets in the AttrArray
    #
    # @note An empty AttrArray gives an empty Set rather than nil, so
    #   the result can always be used in further set arithmetic.
    #
    # @return [Set]
    def union
      return Object::Set.new if self.empty?

      self.reduce(&:union)
    end

    # Takes the intersection of all sets in the AttrArray
    #
    # @note An empty AttrArray gives an empty Set rather than nil.
    #
    # @return [Set]
    def intersection
      return Object::Set.new if self.empty?

      self.reduce(&:intersection)
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module TreeClusters
  # Hash table of attributes keyed by genome/leaf/taxa name.  Each
  # value is itself a hash of attribute name => attribute value.
  class Attrs < Hash

    # Collects the values of one attribute for the given leaves.
    #
    # @note Leaves that are not in the hash table are skipped without
    #   raising an error, so that not every genome needs to have
    #   attributes.  Values that are nil are dropped as well.
    #
    # @param leaves [Array<String>] names of the leaves to look up
    # @param attr [Symbol] the attribute of interest, e.g., :genes
    #
    # @return [AttrArray<Set>] one entry per leaf that was found
    #
    # @raise [AbortIf::Exit] if a leaf is present but does not have
    #   the requested attr
    def attrs leaves, attr
      found = []

      leaves.each do |leaf|
        next unless self.key? leaf

        leaf_attrs = self[leaf]

        abort_unless leaf_attrs.has_key?(attr),
                     "Missing attr #{attr.inspect} for leaf '#{leaf}'"

        found << leaf_attrs[attr]
      end

      TreeClusters::AttrArray.new found.compact
    end

    # Records +val+ as attribute +attr+ of +leaf+, creating the entry
    # for the leaf if this is its first attribute.
    def add leaf, attr, val
      self.key?(leaf) ? (self[leaf][attr] = val) : (self[leaf] = { attr => val })
    end
  end
end
|
@@ -0,0 +1,128 @@
|
|
1
|
+
module TreeClusters
  # Represents a clade in a NewickTree
  class Clade
    attr_accessor :name,
                  :all_leaves,
                  :left_leaves,
                  :right_leaves,
                  :all_sibling_leaves,
                  :each_sibling_leaf_set,
                  :parent_leaves,
                  :non_parent_leaves,
                  :other_leaves,
                  :single_tag_info,
                  :all_tags

    # @note If a node name is quoted, then those quotes are removed
    #   first.
    #
    # @note left_leaves and right_leaves are only set when the node
    #   has exactly two children; otherwise they stay nil.
    #
    # @param node [NewickNode] a NewickNode from a NewickTree
    # @param tree [NewickTree] a NewickTree
    # @param metadata [Hash, nil] metadata category => (leaf name =>
    #   tag).  When given, all_tags and single_tag_info are filled in;
    #   every leaf of the clade must then have a tag in every category.
    def initialize node, tree, metadata=nil
      tree_taxa = tree.unquoted_taxa

      @name       = unquote node.name
      @all_leaves = descendant_leaves node

      children = node.children
      if children.count == 2
        lchild, rchild = children

        @left_leaves  = descendant_leaves lchild
        @right_leaves = descendant_leaves rchild
      end

      siblings = node.siblings
      # assert siblings.count == 1,
      #        "Node #{node.name} has more than one sibling."

      @each_sibling_leaf_set =
        siblings.map { |sibling| descendant_leaves sibling }

      @all_sibling_leaves = @each_sibling_leaf_set.flatten.uniq

      parent = node.parent
      assert parent,
             "Node #{node.name} has no parent. Is it the root?"
      @parent_leaves = descendant_leaves parent

      @other_leaves =
        Object::Set.new(tree_taxa) - Object::Set.new(all_leaves)

      @non_parent_leaves =
        Object::Set.new(tree_taxa) - Object::Set.new(parent_leaves)

      if metadata
        @metadata        = metadata
        @all_tags        = get_all_tags
        @single_tag_info = get_single_tag_info
      else
        @single_tag_info = nil
      end
    end

    # Compares two Clades field by field.
    #
    # If all the fields listed in #comparison_fields are == then the
    # two clades are == as well.  Anything that is not a Clade is
    # never equal to a Clade.
    def == clade
      return false unless clade.is_a?(Clade)

      comparison_fields == clade.comparison_fields
    end

    # Alias for ==
    def eql? clade
      self == clade
    end

    # Hash code built from the same fields that == compares, so that
    # Clades that are eql? land on the same key when used in a Hash.
    #
    # @return [Integer]
    def hash
      comparison_fields.hash
    end

    protected

    # The fields that take part in ==, eql? and hash.
    def comparison_fields
      [name,
       all_leaves,
       left_leaves,
       right_leaves,
       all_sibling_leaves,
       each_sibling_leaf_set,
       parent_leaves,
       other_leaves,
       single_tag_info,
       all_tags]
    end

    private

    # For each metadata category, the single tag shared by all leaves
    # of the clade, or nil if the leaves carry more than one tag.
    def get_single_tag_info
      @all_tags.map do |md_cat, set|
        [md_cat, set.count == 1 ? set.to_a.first : nil]
      end.to_h
    end

    # For each metadata category, the Set of tags found on the leaves
    # of the clade.
    def get_all_tags
      # name2tag has leaf names => metadata tag and is an Attrs
      @metadata.map do |md_cat, name2tag|
        tag_info = self.all_leaves.map do |leaf|
          assert name2tag.has_key?(leaf),
                 "leaf #{leaf} is not present in name2tag ht for " +
                 "md_cat #{md_cat}"

          name2tag[leaf]
        end

        [md_cat, Set.new(tag_info)]
      end.to_h
    end

    # Unquoted names of the leaves below +node+, or of +node+ itself
    # when it is a leaf.
    def descendant_leaves node
      return [unquote(node.name)] if node.leaf?

      node.
        descendants.
        flatten.
        uniq.
        select { |descendant| descendant.leaf? }.
        map { |leaf| unquote(leaf.name) }
    end

    # Strips every single and double quote character from +str+.
    def unquote str
      str.delete %q{"'}
    end
  end
end
|
data/lib/tree_clusters.rb
CHANGED
@@ -3,6 +3,9 @@ require "Newick"
|
|
3
3
|
require "set"
|
4
4
|
require "parse_fasta"
|
5
5
|
require "shannon"
|
6
|
+
require "tree_clusters/attrs"
|
7
|
+
require "tree_clusters/attr_array"
|
8
|
+
require "tree_clusters/clade"
|
6
9
|
require "tree_clusters/version"
|
7
10
|
|
8
11
|
include AbortIf
|
@@ -198,6 +201,73 @@ module TreeClusters
|
|
198
201
|
snazzy_clades
|
199
202
|
end
|
200
203
|
|
204
|
+
# Maps every clade of the tree to its snazzy info.
#
# A clade is snazzy for a metadata category when all of its leaves
# carry one and the same tag and no leaf outside the clade carries
# that tag.  Clades are visited from most leaves to fewest, and a
# clade whose leaves were all claimed by an earlier single-tag clade
# is not considered again for that category.
#
# @param tree [NewickTree] the tree
# @param metadata [Hash] metadata category => (leaf name => tag)
#
# @return [Hash] clade => { metadata category => tag }, or clade =>
#   nil for clades that are not snazzy for any category
def snazzy_info tree, metadata
  clades = self.all_clades(tree, metadata)
  clades = clades.sort_by { |clade| clade.all_leaves.count }.reverse

  # Every clade starts out as nil (not snazzy); the snazzy ones get
  # overwritten below.
  result = {}
  clades.each { |clade| result[clade] = nil }

  metadata.each do |md_cat, leaf2mdtag|
    claimed_leaves = Set.new
    single_tag_clades = {}

    clades.each do |clade|
      leaves = clade.all_leaves

      assert leaves.count > 1,
             "A clade cannot also be a leaf"

      next if leaves.all? { |leaf| claimed_leaves.include? leaf }

      md_tags = leaves.map do |leaf|
        assert leaf2mdtag.has_key?(leaf),
               "leaf #{leaf} is missing from leaf2mdtag ht"

        leaf2mdtag[leaf]
      end

      # Only clades that are mono-phyletic w.r.t. this metadata
      # category go any further.
      next unless md_tags.uniq.count == 1

      leaves.each { |leaf| claimed_leaves << leaf }

      assert !single_tag_clades.has_key?(clade),
             "clade #{clade.name} is repeated in single_tag_clades for #{md_cat}"

      single_tag_clades[clade] = md_tags.first
    end

    single_tag_clades.each do |clade, md_tag|
      outside_leaves = tree.unquoted_taxa - clade.all_leaves

      # The tag must not show up anywhere outside of the clade.
      next if outside_leaves.any? { |leaf| leaf2mdtag[leaf] == md_tag }

      if result[clade].nil?
        result[clade] = { md_cat => md_tag }
      else
        result[clade][md_cat] = md_tag
      end
    end
  end

  result
end
|
270
|
+
|
201
271
|
def read_mapping_file fname
|
202
272
|
md_cat_names = nil
|
203
273
|
metadata = TreeClusters::Attrs.new
|
@@ -255,194 +325,4 @@ module TreeClusters
|
|
255
325
|
|
256
326
|
[attr_names, attrs]
|
257
327
|
end
|
258
|
-
|
259
|
-
# A Hash table for genome/leaf/taxa attributes
|
260
|
-
class Attrs < Hash
|
261
|
-
|
262
|
-
# Returns the an AttrArray of Sets for the given genomes and
|
263
|
-
# attribute.
|
264
|
-
#
|
265
|
-
# @note If a genome is in the leaves array, but is not in the hash
|
266
|
-
# table, NO error will be raised. Rather that genome will be
|
267
|
-
# skipped. This is for cases in which not all genomes have
|
268
|
-
# attributes.
|
269
|
-
#
|
270
|
-
# @param leaves [Array<String>] names of the leaves for which you
|
271
|
-
# need attributes
|
272
|
-
# @param attr [Symbol] the attribute you are interested in eg,
|
273
|
-
# :genes
|
274
|
-
#
|
275
|
-
# @return [AttrArray<Set>] an AttrArray of Sets of
|
276
|
-
# attributes
|
277
|
-
#
|
278
|
-
# @raise [AbortIf::Exit] if they leaf is present but doesn't have
|
279
|
-
# the requested attr
|
280
|
-
def attrs leaves, attr
|
281
|
-
ary = leaves.map do |leaf|
|
282
|
-
|
283
|
-
if self.has_key? leaf
|
284
|
-
abort_unless self[leaf].has_key?(attr),
|
285
|
-
"Missing attr #{attr.inspect} for leaf '#{leaf}'"
|
286
|
-
|
287
|
-
self[leaf][attr]
|
288
|
-
else
|
289
|
-
nil
|
290
|
-
end
|
291
|
-
end.compact
|
292
|
-
|
293
|
-
TreeClusters::AttrArray.new ary
|
294
|
-
end
|
295
|
-
|
296
|
-
def add leaf, attr, val
|
297
|
-
if self.has_key? leaf
|
298
|
-
self[leaf][attr] = val
|
299
|
-
else
|
300
|
-
self[leaf] = { attr => val }
|
301
|
-
end
|
302
|
-
end
|
303
|
-
end
|
304
|
-
|
305
|
-
# Provides convenience methods for working with Arrays of Sets
|
306
|
-
class AttrArray < Object::Array
|
307
|
-
# Takes the union of all sets in the AttrArray
|
308
|
-
#
|
309
|
-
# @return [Set]
|
310
|
-
def union
|
311
|
-
self.reduce(&:union)
|
312
|
-
end
|
313
|
-
|
314
|
-
# Takes the intersection of all sets in the AttrArray
|
315
|
-
#
|
316
|
-
# @return [Set]
|
317
|
-
def intersection
|
318
|
-
self.reduce(&:intersection)
|
319
|
-
end
|
320
|
-
end
|
321
|
-
|
322
|
-
# Represents a clade in a NewickTree
|
323
|
-
class Clade
|
324
|
-
attr_accessor :name,
|
325
|
-
:all_leaves,
|
326
|
-
:left_leaves,
|
327
|
-
:right_leaves,
|
328
|
-
:all_sibling_leaves,
|
329
|
-
:each_sibling_leaf_set,
|
330
|
-
:parent_leaves,
|
331
|
-
:non_parent_leaves,
|
332
|
-
:other_leaves,
|
333
|
-
:single_tag_info,
|
334
|
-
:all_tags
|
335
|
-
|
336
|
-
# @note If a node name is quoted, then those quotes are removed
|
337
|
-
# first.
|
338
|
-
#
|
339
|
-
# @param node [NewickNode] a NewickNode from a NewickTree
|
340
|
-
# @param tree [NewickTree] a NewickTree
|
341
|
-
def initialize node, tree, metadata=nil
|
342
|
-
tree_taxa = tree.unquoted_taxa
|
343
|
-
|
344
|
-
@name = unquote node.name
|
345
|
-
@all_leaves = descendant_leaves node
|
346
|
-
|
347
|
-
if (children = node.children).count == 2
|
348
|
-
lchild, rchild = node.children
|
349
|
-
|
350
|
-
@left_leaves = descendant_leaves lchild
|
351
|
-
|
352
|
-
@right_leaves = descendant_leaves rchild
|
353
|
-
end
|
354
|
-
|
355
|
-
siblings = node.siblings
|
356
|
-
# assert siblings.count == 1,
|
357
|
-
# "Node #{node.name} has more than one sibling."
|
358
|
-
|
359
|
-
@each_sibling_leaf_set = siblings.
|
360
|
-
map { |node| descendant_leaves node }
|
361
|
-
|
362
|
-
@all_sibling_leaves = @each_sibling_leaf_set.flatten.uniq
|
363
|
-
|
364
|
-
parent = node.parent
|
365
|
-
assert parent,
|
366
|
-
"Noge #{node.name} has no parent. Is it the root?"
|
367
|
-
@parent_leaves = descendant_leaves parent
|
368
|
-
|
369
|
-
@other_leaves =
|
370
|
-
Object::Set.new(tree_taxa) - Object::Set.new(all_leaves)
|
371
|
-
|
372
|
-
@non_parent_leaves =
|
373
|
-
Object::Set.new(tree_taxa) - Object::Set.new(parent_leaves)
|
374
|
-
|
375
|
-
if metadata
|
376
|
-
@metadata = metadata
|
377
|
-
@all_tags ||= get_all_tags
|
378
|
-
@single_tag_info ||= get_single_tag_info
|
379
|
-
else
|
380
|
-
@single_tag_info = nil
|
381
|
-
end
|
382
|
-
end
|
383
|
-
|
384
|
-
# Compares two Clades field by field.
|
385
|
-
#
|
386
|
-
# If all instance variables are == than the two clades are == as
|
387
|
-
# well.
|
388
|
-
def == clade
|
389
|
-
(
|
390
|
-
self.name == clade.name &&
|
391
|
-
self.all_leaves == clade.all_leaves &&
|
392
|
-
self.left_leaves == clade.left_leaves &&
|
393
|
-
self.right_leaves == clade.right_leaves &&
|
394
|
-
self.all_sibling_leaves == clade.all_sibling_leaves &&
|
395
|
-
self.each_sibling_leaf_set == clade.each_sibling_leaf_set &&
|
396
|
-
self.parent_leaves == clade.parent_leaves &&
|
397
|
-
self.other_leaves == clade.other_leaves &&
|
398
|
-
self.single_tag_info == clade.single_tag_info &&
|
399
|
-
self.all_tags == clade.all_tags
|
400
|
-
)
|
401
|
-
end
|
402
|
-
|
403
|
-
# Alias for ==
|
404
|
-
def eql? clade
|
405
|
-
self == clade
|
406
|
-
end
|
407
|
-
|
408
|
-
private
|
409
|
-
|
410
|
-
def get_single_tag_info
|
411
|
-
@all_tags.map do |md_cat, set|
|
412
|
-
[md_cat, set.count == 1 ? set.to_a.first : nil]
|
413
|
-
end.to_h
|
414
|
-
end
|
415
|
-
|
416
|
-
def get_all_tags
|
417
|
-
# name2tag has leaf names => metadata tag and is an Attrs
|
418
|
-
@metadata.map do |md_cat, name2tag|
|
419
|
-
tag_info = self.all_leaves.map do |leaf|
|
420
|
-
assert name2tag.has_key?(leaf),
|
421
|
-
"leaf #{leaf} is not present in name2tag ht for " +
|
422
|
-
"md_cat #{md_cat}"
|
423
|
-
|
424
|
-
name2tag[leaf]
|
425
|
-
end
|
426
|
-
|
427
|
-
[md_cat, Set.new(tag_info)]
|
428
|
-
end.to_h
|
429
|
-
end
|
430
|
-
|
431
|
-
def descendant_leaves node
|
432
|
-
if node.leaf?
|
433
|
-
[unquote(node.name)]
|
434
|
-
else
|
435
|
-
node.
|
436
|
-
descendants.
|
437
|
-
flatten.
|
438
|
-
uniq.
|
439
|
-
select { |node| node.leaf? }.
|
440
|
-
map { |node| unquote(node.name) }
|
441
|
-
end
|
442
|
-
end
|
443
|
-
|
444
|
-
def unquote str
|
445
|
-
str.tr %q{"'}, ""
|
446
|
-
end
|
447
|
-
end
|
448
328
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tree_clusters
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.0
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-11-
|
11
|
+
date: 2017-11-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -172,6 +172,7 @@ description: Snazzy code for working with each cluster in a tree.
|
|
172
172
|
email:
|
173
173
|
- moorer@udel.edu
|
174
174
|
executables:
|
175
|
+
- clade_attrs
|
175
176
|
- snazzy_clades
|
176
177
|
- snazzy_clades_attrs
|
177
178
|
- snazzy_clades_key_cols
|
@@ -188,10 +189,14 @@ files:
|
|
188
189
|
- Rakefile
|
189
190
|
- bin/console
|
190
191
|
- bin/setup
|
192
|
+
- exe/clade_attrs
|
191
193
|
- exe/snazzy_clades
|
192
194
|
- exe/snazzy_clades_attrs
|
193
195
|
- exe/snazzy_clades_key_cols
|
194
196
|
- lib/tree_clusters.rb
|
197
|
+
- lib/tree_clusters/attr_array.rb
|
198
|
+
- lib/tree_clusters/attrs.rb
|
199
|
+
- lib/tree_clusters/clade.rb
|
195
200
|
- lib/tree_clusters/version.rb
|
196
201
|
- test_files/bad.aln
|
197
202
|
- test_files/non_bifurcating.aln
|