tree_clusters 0.7.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +4 -0
- data/exe/key_cols +206 -0
- data/lib/tree_clusters.rb +19 -0
- data/lib/tree_clusters/clade.rb +26 -24
- data/lib/tree_clusters/version.rb +1 -1
- data/test_files/small2.aln +14 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 588e0cf7da9cd87056e339593b2de594aa7f7e34
|
4
|
+
data.tar.gz: a786a20ee672a2372ca3f1a1d1e8d7259acf4b16
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dbf4875a27201c9cb1b406427e3c1fde6b195f71493b62116bcf676f97bcaf590fec7a8cfb4433a14a861093d7106a6cdacf2b190d03b7487c40bcac4726ef3a
|
7
|
+
data.tar.gz: 2bc0115fbeda56da3e8455cde72d859895e0e7b7670ceeeeac57ecc8434c656eedda846a653fbcfe69fdf33867c6b940a0ddaad55d0aaf8788606dfe261a702e
|
data/.gitignore
CHANGED
data/exe/key_cols
ADDED
@@ -0,0 +1,206 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
Signal.trap("PIPE", "EXIT")
|
4
|
+
|
5
|
+
require "tree_clusters"
|
6
|
+
require "trollop"
|
7
|
+
require "parse_fasta"
|
8
|
+
require "fileutils"
|
9
|
+
|
10
|
+
TreeClusters.extend TreeClusters
|
11
|
+
|
12
|
+
# Heading printed at the top of the --help banner, and a row of "="
# characters of the same width to underline it.
GREETING = "The '#{__FILE__}' program"
UNDERLINE = "=" * GREETING.size

# Command line options. Neither --tree nor --aln has a default value.
opts = Trollop.options do
  version TreeClusters::VERSION

  # Disabled banner text, kept for reference:
  #
  # banner <<-EOS
  #
  # Checking IDs
  # ------------
  #
  # IDs for the sequences must match between the three input files.
  #
  # The tree file is allowed to have quoted taxa names, but the
  # mapping file and alignment file are not.
  #
  # If your alignment file has spaces in the name, the ID part of the
  # header (i.e., the part up until the space) must match with the
  # sequence IDs in the tree and the mapping file.
  #
  # Example: This would be okay.
  #
  # tree file:
  # ('genome_A', 'genome_B');
  #
  # aln file:
  # >genome_A apple pie
  # AAAAA
  # >genome_B brown sugar
  # AATTA
  #
  # Options:
  # EOS

  banner <<-EOS

#{GREETING}
#{UNDERLINE}

Hi. My name is #{__FILE__}. If you give me a Newick tree file and
an alignment file (fasta format), I will tell you key columns for
all clades/clusters that have them.

Overview
--------

A clade has key columns if you can use the residue/nucleotide at
those columns to tell sequences in the clade from sequences outside
of the clade.

Here's an example....

After you run me (#{__FILE__} is my name), you'll get an output file
with the extension, '*.tree_clusters.key_cols.txt'. It may look
something like this:

cluster_A 4 1-A 2-A 3-A 5-G
cluster_B 4 1-C 2-C 3-C 5-A

This file has the clade name, the number of key columns for that
clade, and then the rest of the columns tell you the position
(1-based) and the nucleotide or residue in that column in all
sequences of that clade.

In this case we have only two clades. The key columns for both are
1, 2, 3, and 5. So you can use columns 1, 2, 3, and 5 to classify a
sequence as belonging to one of these clades. If it has A, A, A,
and G in those positions, it'll be in cluster_A, and if it has C, C,
C, and A in those positions, it'll be in cluster_B. If it has any
other combination in those 4 columns of the alignment, it won't be
in either clade.

This is just a silly example and most of the time you'll get
different key columns for different clades. Note that every clade
may not have key columns listed depending on your data and the
options you select.

Notes & Gotchas
--------------

- I ignore columns with gap chars (currently just '-') regardless of
column entropy.

Option info
-----------

--entropy-cutoff: A cutoff of 0 means that you allow no variation at
any column.

--clade-size-cutoff: Use this option to ignore tiny clades.

Options:
  EOS

  # Input files.
  opt :tree, "Newick tree file", type: :string
  opt :aln, "Alignment file", type: :string

  # Filtering thresholds.
  opt :entropy_cutoff,
      "Cutoff to consider a column low entropy",
      default: 0.0
  opt :clade_size_cutoff,
      "Consider only clades with at least this many leaves",
      default: 1

  # Where the output goes and what it is called.
  opt :outdir, "Output directory", default: "."
  opt :base, "Basename for output", default: "snazzy_clades"
end
|
128
|
+
|
129
|
+
# --tree and --aln have no defaults, so both must be given explicitly...
abort_if(opts[:tree].nil?,
         "--tree is a required arg. Try running: #{__FILE__} --help")
abort_if(opts[:aln].nil?,
         "--aln is a required arg. Try running: #{__FILE__} --help")

# ...and both must name existing files.
abort_unless_file_exists(opts[:tree])
abort_unless_file_exists(opts[:aln])

# TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:aln]

# Range checks on the numeric options.
abort_unless(opts[:entropy_cutoff] >= 0,
             "--entropy-cutoff must be >= 0")
abort_unless(opts[:clade_size_cutoff] >= 1,
             "--clade-size-cutoff must be >= 1")

FileUtils.mkdir_p(opts[:outdir])

# Load the two inputs.
tree = NewickTree.fromFile(opts[:tree])
leaf2attrs = TreeClusters.read_alignment(opts[:aln])

# All output files live in --outdir and share the --base prefix.
members_fname = File.join(
  opts[:outdir],
  "#{opts[:base]}.tree_clusters.clade_members.txt"
)
key_cols_fname = File.join(
  opts[:outdir],
  "#{opts[:base]}.tree_clusters.key_cols.txt"
)
annotated_tree_fname = File.join(
  opts[:outdir],
  "#{opts[:base]}.tree_clusters.annotated_tree.txt"
)
|
158
|
+
|
159
|
+
# Maps each distinct set of key columns to the ids of every clade that
# has exactly that set.
key_col_sets = Hash.new { |sets, key_cols| sets[key_cols] = Set.new }

# The handles are opened inside the begin block so that the ensure clause
# still closes the ones already opened if a later File.open raises.
clade_members_f = nil
key_cols_f = nil
annotated_tree_f = nil

begin
  clade_members_f = File.open(members_fname, "w")
  key_cols_f = File.open(key_cols_fname, "w")
  annotated_tree_f = File.open(annotated_tree_fname, "w")

  # Largest clades first. The size filter runs after the sort so that the
  # clades that are kept get the same position (and so the same id) they
  # would have had without the filter. With the default cutoff of 1 the
  # filter is not expected to drop anything.
  clades = TreeClusters.all_clades(tree).
    sort_by { |cl| cl.all_leaves.count }.
    reverse.
    select { |cl| cl.all_leaves.count >= opts[:clade_size_cutoff] }

  clades.each_with_index do |clade, idx|
    clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"

    # Array#join flattens the nested leaf list, so this writes one
    # tab-separated line: id, number of leaves, then every leaf.
    clade_members_f.puts [clade_id,
                          clade.all_leaves.count,
                          clade.all_leaves].join "\t"

    key_cols_all_leaves =
      TreeClusters.low_ent_cols_with_bases clade.all_leaves,
                                           leaf2attrs,
                                           opts[:entropy_cutoff]

    key_col_sets[key_cols_all_leaves] << clade_id

    # This will change the node in the original NewickTree
    clade.node.name = "'#{clade_id}'"
  end

  # We only want key column sets that are unique to a single clade. An
  # empty set means the clade has no key columns at all, so it is not
  # reported even if only one clade ended up with it.
  unique_key_col_sets = key_col_sets.select do |kc_set, clade_ids|
    clade_ids.count == 1 && !kc_set.empty?
  end

  unique_key_col_sets.each do |kc_set, clade_ids|
    key_cols_f.puts [
      clade_ids.first,
      kc_set.count,
      kc_set.map { |pos, bases| "#{pos}-#{bases.join}" }
    ].join "\t"
  end

  # Collapse any run of trailing semicolons into a single one.
  annotated_tree_f.puts tree.to_s.sub(/;+$/, ";")
ensure
  [clade_members_f, key_cols_f, annotated_tree_f].each do |fh|
    fh.close if fh
  end
end
|
data/lib/tree_clusters.rb
CHANGED
@@ -89,6 +89,25 @@ module TreeClusters
|
|
89
89
|
Set.new low_ent_cols
|
90
90
|
end
|
91
91
|
|
92
|
+
# Like the low_ent_cols method, but each low entropy column is reported
# together with the bases found at that position.
#
# A column is reported when it contains no gap character ("-") and the
# Shannon entropy of its upcased characters is at most +entropy_cutoff+.
#
# @param leaves the leaves to look at; passed straight through to
#   +leaf2attrs.attrs+
# @param leaf2attrs an object responding to +attrs(leaves, :aln)+.
#   NOTE(review): presumably this returns one array of single-character
#   strings per leaf, all of the same length -- confirm against the
#   alignment reader.
# @param entropy_cutoff [Numeric] the highest entropy a column may have
#   and still be reported
#
# @return [Set<Array>] a set of +[position, bases]+ pairs, where
#   +position+ is the 1-based column number and +bases+ is the sorted
#   array of distinct upcased characters in that column
def low_ent_cols_with_bases leaves, leaf2attrs, entropy_cutoff
  aln_cols = leaf2attrs.attrs(leaves, :aln).transpose

  low_ent_cols = aln_cols.each_with_index.map do |aln_col, aln_col_idx|
    # A gapped column is never reported, so skip it before paying for
    # the entropy calculation.
    next if aln_col.include? "-"

    bases = aln_col.map(&:upcase)
    next unless Shannon::entropy(bases.join) <= entropy_cutoff

    [aln_col_idx + 1, bases.uniq.sort]
  end

  # Skipped columns left a nil behind.
  Set.new low_ent_cols.compact
end
|
110
|
+
|
92
111
|
# @note If there are quoted names in the tree file, they are
|
93
112
|
# unquoted first.
|
94
113
|
def check_ids tree, mapping, aln
|
data/lib/tree_clusters/clade.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
module TreeClusters
|
2
2
|
# Represents a clade in a NewickTree
|
3
3
|
class Clade
|
4
|
-
attr_accessor :
|
4
|
+
attr_accessor :node,
|
5
|
+
:name,
|
5
6
|
:all_leaves,
|
6
7
|
:left_leaves,
|
7
8
|
:right_leaves,
|
@@ -18,10 +19,11 @@ module TreeClusters
|
|
18
19
|
#
|
19
20
|
# @param node [NewickNode] a NewickNode from a NewickTree
|
20
21
|
# @param tree [NewickTree] a NewickTree
|
21
|
-
def initialize node, tree, metadata=nil
|
22
|
+
def initialize node, tree, metadata = nil
|
22
23
|
tree_taxa = tree.unquoted_taxa
|
23
24
|
|
24
|
-
@
|
25
|
+
@node = node
|
26
|
+
@name = unquote node.name
|
25
27
|
@all_leaves = descendant_leaves node
|
26
28
|
|
27
29
|
if (children = node.children).count == 2
|
@@ -37,7 +39,7 @@ module TreeClusters
|
|
37
39
|
# "Node #{node.name} has more than one sibling."
|
38
40
|
|
39
41
|
@each_sibling_leaf_set = siblings.
|
40
|
-
|
42
|
+
map {|node| descendant_leaves node}
|
41
43
|
|
42
44
|
@all_sibling_leaves = @each_sibling_leaf_set.flatten.uniq
|
43
45
|
|
@@ -47,14 +49,14 @@ module TreeClusters
|
|
47
49
|
@parent_leaves = descendant_leaves parent
|
48
50
|
|
49
51
|
@other_leaves =
|
50
|
-
|
52
|
+
Object::Set.new(tree_taxa) - Object::Set.new(all_leaves)
|
51
53
|
|
52
54
|
@non_parent_leaves =
|
53
|
-
|
55
|
+
Object::Set.new(tree_taxa) - Object::Set.new(parent_leaves)
|
54
56
|
|
55
57
|
if metadata
|
56
|
-
@metadata
|
57
|
-
@all_tags
|
58
|
+
@metadata = metadata
|
59
|
+
@all_tags ||= get_all_tags
|
58
60
|
@single_tag_info ||= get_single_tag_info
|
59
61
|
else
|
60
62
|
@single_tag_info = nil
|
@@ -67,16 +69,16 @@ module TreeClusters
|
|
67
69
|
# well.
|
68
70
|
def == clade
|
69
71
|
(
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
72
|
+
self.name == clade.name &&
|
73
|
+
self.all_leaves == clade.all_leaves &&
|
74
|
+
self.left_leaves == clade.left_leaves &&
|
75
|
+
self.right_leaves == clade.right_leaves &&
|
76
|
+
self.all_sibling_leaves == clade.all_sibling_leaves &&
|
77
|
+
self.each_sibling_leaf_set == clade.each_sibling_leaf_set &&
|
78
|
+
self.parent_leaves == clade.parent_leaves &&
|
79
|
+
self.other_leaves == clade.other_leaves &&
|
80
|
+
self.single_tag_info == clade.single_tag_info &&
|
81
|
+
self.all_tags == clade.all_tags
|
80
82
|
)
|
81
83
|
end
|
82
84
|
|
@@ -99,7 +101,7 @@ module TreeClusters
|
|
99
101
|
tag_info = self.all_leaves.map do |leaf|
|
100
102
|
assert name2tag.has_key?(leaf),
|
101
103
|
"leaf #{leaf} is not present in name2tag ht for " +
|
102
|
-
|
104
|
+
"md_cat #{md_cat}"
|
103
105
|
|
104
106
|
name2tag[leaf]
|
105
107
|
end
|
@@ -113,11 +115,11 @@ module TreeClusters
|
|
113
115
|
[unquote(node.name)]
|
114
116
|
else
|
115
117
|
node.
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
118
|
+
descendants.
|
119
|
+
flatten.
|
120
|
+
uniq.
|
121
|
+
select {|node| node.leaf?}.
|
122
|
+
map {|node| unquote(node.name)}
|
121
123
|
end
|
122
124
|
end
|
123
125
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tree_clusters
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-07-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -173,6 +173,7 @@ email:
|
|
173
173
|
- moorer@udel.edu
|
174
174
|
executables:
|
175
175
|
- clade_attrs
|
176
|
+
- key_cols
|
176
177
|
- snazzy_clades
|
177
178
|
- snazzy_clades_attrs
|
178
179
|
- snazzy_clades_key_cols
|
@@ -190,6 +191,7 @@ files:
|
|
190
191
|
- bin/console
|
191
192
|
- bin/setup
|
192
193
|
- exe/clade_attrs
|
194
|
+
- exe/key_cols
|
193
195
|
- exe/snazzy_clades
|
194
196
|
- exe/snazzy_clades_attrs
|
195
197
|
- exe/snazzy_clades_key_cols
|
@@ -205,6 +207,7 @@ files:
|
|
205
207
|
- test_files/small.attrs
|
206
208
|
- test_files/small.mapping
|
207
209
|
- test_files/small.tre
|
210
|
+
- test_files/small2.aln
|
208
211
|
- test_files/small_aln_bad_ids
|
209
212
|
- test_files/small_mapping_bad_ids
|
210
213
|
- test_files/small_tree_bad_ids
|