tree_clusters 0.7.0 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +4 -0
- data/exe/key_cols +206 -0
- data/lib/tree_clusters.rb +19 -0
- data/lib/tree_clusters/clade.rb +26 -24
- data/lib/tree_clusters/version.rb +1 -1
- data/test_files/small2.aln +14 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 588e0cf7da9cd87056e339593b2de594aa7f7e34
|
4
|
+
data.tar.gz: a786a20ee672a2372ca3f1a1d1e8d7259acf4b16
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dbf4875a27201c9cb1b406427e3c1fde6b195f71493b62116bcf676f97bcaf590fec7a8cfb4433a14a861093d7106a6cdacf2b190d03b7487c40bcac4726ef3a
|
7
|
+
data.tar.gz: 2bc0115fbeda56da3e8455cde72d859895e0e7b7670ceeeeac57ecc8434c656eedda846a653fbcfe69fdf33867c6b940a0ddaad55d0aaf8788606dfe261a702e
|
data/.gitignore
CHANGED
data/exe/key_cols
ADDED
@@ -0,0 +1,206 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
Signal.trap("PIPE", "EXIT")
|
4
|
+
|
5
|
+
require "tree_clusters"
|
6
|
+
require "trollop"
|
7
|
+
require "parse_fasta"
|
8
|
+
require "fileutils"
|
9
|
+
|
10
|
+
TreeClusters.extend TreeClusters
|
11
|
+
|
12
|
+
GREETING = "The '#{__FILE__}' program"
|
13
|
+
UNDERLINE = "=" * GREETING.length
|
14
|
+
|
15
|
+
opts = Trollop.options do
|
16
|
+
version TreeClusters::VERSION
|
17
|
+
|
18
|
+
# banner <<-EOS
|
19
|
+
|
20
|
+
|
21
|
+
# Checking IDs
|
22
|
+
# ------------
|
23
|
+
|
24
|
+
# IDs for the sequences must match between the three input files.
|
25
|
+
|
26
|
+
# The tree file is allowed to have quoted taxa names, but the
|
27
|
+
# mapping file and alignment file are not.
|
28
|
+
|
29
|
+
# If your alignment file has spaces in the name, the ID part of the
|
30
|
+
# header (i.e., the part up until the space) must match with the
|
31
|
+
# sequence IDs in the tree and the mapping file.
|
32
|
+
|
33
|
+
# Example: This would be okay.
|
34
|
+
|
35
|
+
# tree file:
|
36
|
+
# ('genome_A', 'genome_B');
|
37
|
+
|
38
|
+
# aln file:
|
39
|
+
# >genome_A apple pie
|
40
|
+
# AAAAA
|
41
|
+
# >genome_B brown sugar
|
42
|
+
# AATTA
|
43
|
+
|
44
|
+
# Options:
|
45
|
+
# EOS
|
46
|
+
|
47
|
+
banner <<-EOS
|
48
|
+
|
49
|
+
#{GREETING}
|
50
|
+
#{UNDERLINE}
|
51
|
+
|
52
|
+
Hi. My name is #{__FILE__}. If you give me a Newick tree file and
|
53
|
+
an alignment file (fasta format), I will tell you key columns for
|
54
|
+
all clades/clusters that have them.
|
55
|
+
|
56
|
+
Overview
|
57
|
+
--------
|
58
|
+
|
59
|
+
A clade has key columns if you can use the residue/nucleotide at
|
60
|
+
those columns to tell sequences in the clade from sequences outside
|
61
|
+
of the clade.
|
62
|
+
|
63
|
+
Here's an example....
|
64
|
+
|
65
|
+
After you run me (#{__FILE__} is my name), you'll get an output file
|
66
|
+
with the extension, '*.tree_clusters.key_cols.txt'. It may look
|
67
|
+
something like this:
|
68
|
+
|
69
|
+
cluster_A 4 1-A 2-A 3-A 5-G
|
70
|
+
cluster_B 4 1-C 2-C 3-C 5-A
|
71
|
+
|
72
|
+
This file has the clade name, the number of key columns for that
|
73
|
+
clade, and then the rest of the columns tell you the position
|
74
|
+
(1-based) and the nucleotide or residue in that column in all
|
75
|
+
sequences of that clade.
|
76
|
+
|
77
|
+
In this case we have only two clades. The key columns for both are
|
78
|
+
1, 2, 3, and 5. So you can use columns 1, 2, 3, and 5 to classify a
|
79
|
+
sequence as belonging to one of these clades. If it has A, A, A,
|
80
|
+
and G in those positions, it'll be in cluster_A, and if it has C, C,
|
81
|
+
C, and A in those positions, it'll be in cluster_B. If it has any
|
82
|
+
other combination in those 4 columns of the alignment, it won't be
|
83
|
+
in either clade.
|
84
|
+
|
85
|
+
This is just a silly example and most of the time you'll get
|
86
|
+
different key columns for different clades. Note that every clade
|
87
|
+
may not have key columns listed depending on your data and the
|
88
|
+
options you select.
|
89
|
+
|
90
|
+
Notes & Gotchas
|
91
|
+
--------------
|
92
|
+
|
93
|
+
- I ignore columns with gap chars (currently just '-') regardless of
|
94
|
+
column entropy.
|
95
|
+
|
96
|
+
Option info
|
97
|
+
-----------
|
98
|
+
|
99
|
+
--entropy-cutoff: A cutoff of 0 means that you allow no variation at
|
100
|
+
any column.
|
101
|
+
|
102
|
+
--clade-size-cutoff: Use this option to ignore tiny clades.
|
103
|
+
|
104
|
+
Options:
|
105
|
+
EOS
|
106
|
+
|
107
|
+
opt(:tree,
|
108
|
+
"Newick tree file",
|
109
|
+
type: :string)
|
110
|
+
opt(:aln,
|
111
|
+
"Alignment file",
|
112
|
+
type: :string)
|
113
|
+
|
114
|
+
opt(:entropy_cutoff,
|
115
|
+
"Cutoff to consider a column low entropy",
|
116
|
+
default: 0.0)
|
117
|
+
opt(:clade_size_cutoff,
|
118
|
+
"Consider only clades with at least this many leaves",
|
119
|
+
default: 1)
|
120
|
+
|
121
|
+
opt(:outdir,
|
122
|
+
"Output directory",
|
123
|
+
default: ".")
|
124
|
+
opt(:base,
|
125
|
+
"Basename for output",
|
126
|
+
default: "snazzy_clades")
|
127
|
+
end
|
128
|
+
|
129
|
+
abort_if opts[:tree].nil?,
|
130
|
+
"--tree is a required arg. Try running: #{__FILE__} --help"
|
131
|
+
abort_if opts[:aln].nil?,
|
132
|
+
"--aln is a required arg. Try running: #{__FILE__} --help"
|
133
|
+
|
134
|
+
abort_unless_file_exists opts[:tree]
|
135
|
+
abort_unless_file_exists opts[:aln]
|
136
|
+
|
137
|
+
# TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:aln]
|
138
|
+
|
139
|
+
abort_unless opts[:entropy_cutoff] >= 0,
|
140
|
+
"--entropy-cutoff must be >= 0"
|
141
|
+
abort_unless opts[:clade_size_cutoff] >= 1,
|
142
|
+
"--clade-size-cutoff must be >= 1"
|
143
|
+
|
144
|
+
FileUtils.mkdir_p opts[:outdir]
|
145
|
+
|
146
|
+
tree = NewickTree.fromFile opts[:tree]
|
147
|
+
leaf2attrs = TreeClusters.read_alignment opts[:aln]
|
148
|
+
|
149
|
+
members_fname =
|
150
|
+
File.join opts[:outdir],
|
151
|
+
"#{opts[:base]}.tree_clusters.clade_members.txt"
|
152
|
+
key_cols_fname =
|
153
|
+
File.join opts[:outdir],
|
154
|
+
"#{opts[:base]}.tree_clusters.key_cols.txt"
|
155
|
+
annotated_tree_fname =
|
156
|
+
File.join opts[:outdir],
|
157
|
+
"#{opts[:base]}.tree_clusters.annotated_tree.txt"
|
158
|
+
|
159
|
+
# Maps each distinct set of key columns (as returned by
# TreeClusters.low_ent_cols_with_bases) to the Set of clade IDs that
# produced exactly that set.
key_col_sets = {}

# Block-form opens guarantee each handle is closed, including when a later
# open or the processing below raises.
File.open(members_fname, "w") do |clade_members_f|
  File.open(key_cols_fname, "w") do |key_cols_f|
    File.open(annotated_tree_fname, "w") do |annotated_tree_f|
      # Largest clades first; the 1-based position in this ordering becomes
      # part of the clade ID.
      clades_by_size =
        TreeClusters.all_clades(tree).sort_by { |cl| cl.all_leaves.count }.reverse

      clades_by_size.each_with_index do |clade, idx|
        clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"

        # One row per clade: ID, leaf count, then every leaf name
        # (Array#join flattens the nested leaf list).
        clade_members_f.puts(
          [clade_id, clade.all_leaves.count, clade.all_leaves].join("\t")
        )

        key_cols_all_leaves =
          TreeClusters.low_ent_cols_with_bases clade.all_leaves,
                                               leaf2attrs,
                                               opts[:entropy_cutoff]

        # Record this clade under its key column set, creating the bucket
        # the first time the set is seen.
        (key_col_sets[key_cols_all_leaves] ||= Set.new) << clade_id

        # This will change the node in the original NewickTree
        clade.node.name = "'#{clade_id}'"
      end

      # We only want key column sets that are unique to a single clade.
      key_col_sets.each do |kc_set, clades|
        next unless clades.count == 1

        key_cols_f.puts(
          [
            clades.first,
            kc_set.count,
            kc_set.map { |pos, bases| "#{pos}-#{bases.join}" }
          ].join("\t")
        )
      end

      # Collapse any run of trailing semicolons into a single one.
      annotated_tree_f.puts tree.to_s.sub(/;+$/, ";")
    end
  end
end
|
data/lib/tree_clusters.rb
CHANGED
@@ -89,6 +89,25 @@ module TreeClusters
|
|
89
89
|
Set.new low_ent_cols
|
90
90
|
end
|
91
91
|
|
92
|
+
# Like the low_ent_cols method, but also returns the bases found at each
# low-entropy position.
#
# Columns containing a gap character ("-") are always skipped, and the
# entropy is not computed for them. Bases are compared case-insensitively
# (everything is upcased first).
#
# @param leaves the leaves whose alignment rows should be examined; passed
#   straight through to +leaf2attrs.attrs+
# @param leaf2attrs an object responding to +attrs(leaves, :aln)+ that
#   returns one array of single-character strings per leaf (presumably all
#   the same length, since the rows are transposed -- verify against
#   read_alignment)
# @param entropy_cutoff [Numeric] a column is kept when its Shannon entropy
#   is <= this value
#
# @return [Set<Array>] one +[position, bases]+ pair per kept column, where
#   +position+ is the 1-based column index and +bases+ is the sorted array
#   of unique upcased characters in that column
def low_ent_cols_with_bases leaves, leaf2attrs, entropy_cutoff
  aln_cols = leaf2attrs.attrs(leaves, :aln).transpose

  low_ent_cols = aln_cols.each_with_index.map do |aln_col, aln_col_idx|
    # Gapped columns can never be key columns, so skip the entropy call.
    next if aln_col.include? "-"

    bases = aln_col.map(&:upcase)
    next unless Shannon::entropy(bases.join) <= entropy_cutoff

    [aln_col_idx + 1, bases.uniq.sort]
  end

  Set.new low_ent_cols.compact
end
|
110
|
+
|
92
111
|
# @note If there are quoted names in the tree file, they are
|
93
112
|
# unquoted first.
|
94
113
|
def check_ids tree, mapping, aln
|
data/lib/tree_clusters/clade.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
module TreeClusters
|
2
2
|
# Represents a clade in a NewickTree
|
3
3
|
class Clade
|
4
|
-
attr_accessor :
|
4
|
+
attr_accessor :node,
|
5
|
+
:name,
|
5
6
|
:all_leaves,
|
6
7
|
:left_leaves,
|
7
8
|
:right_leaves,
|
@@ -18,10 +19,11 @@ module TreeClusters
|
|
18
19
|
#
|
19
20
|
# @param node [NewickNode] a NewickNode from a NewickTree
|
20
21
|
# @param tree [NewickTree] a NewickTree
|
21
|
-
def initialize node, tree, metadata=nil
|
22
|
+
def initialize node, tree, metadata = nil
|
22
23
|
tree_taxa = tree.unquoted_taxa
|
23
24
|
|
24
|
-
@
|
25
|
+
@node = node
|
26
|
+
@name = unquote node.name
|
25
27
|
@all_leaves = descendant_leaves node
|
26
28
|
|
27
29
|
if (children = node.children).count == 2
|
@@ -37,7 +39,7 @@ module TreeClusters
|
|
37
39
|
# "Node #{node.name} has more than one sibling."
|
38
40
|
|
39
41
|
@each_sibling_leaf_set = siblings.
|
40
|
-
|
42
|
+
map {|node| descendant_leaves node}
|
41
43
|
|
42
44
|
@all_sibling_leaves = @each_sibling_leaf_set.flatten.uniq
|
43
45
|
|
@@ -47,14 +49,14 @@ module TreeClusters
|
|
47
49
|
@parent_leaves = descendant_leaves parent
|
48
50
|
|
49
51
|
@other_leaves =
|
50
|
-
|
52
|
+
Object::Set.new(tree_taxa) - Object::Set.new(all_leaves)
|
51
53
|
|
52
54
|
@non_parent_leaves =
|
53
|
-
|
55
|
+
Object::Set.new(tree_taxa) - Object::Set.new(parent_leaves)
|
54
56
|
|
55
57
|
if metadata
|
56
|
-
@metadata
|
57
|
-
@all_tags
|
58
|
+
@metadata = metadata
|
59
|
+
@all_tags ||= get_all_tags
|
58
60
|
@single_tag_info ||= get_single_tag_info
|
59
61
|
else
|
60
62
|
@single_tag_info = nil
|
@@ -67,16 +69,16 @@ module TreeClusters
|
|
67
69
|
# well.
|
68
70
|
def == clade
|
69
71
|
(
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
72
|
+
self.name == clade.name &&
|
73
|
+
self.all_leaves == clade.all_leaves &&
|
74
|
+
self.left_leaves == clade.left_leaves &&
|
75
|
+
self.right_leaves == clade.right_leaves &&
|
76
|
+
self.all_sibling_leaves == clade.all_sibling_leaves &&
|
77
|
+
self.each_sibling_leaf_set == clade.each_sibling_leaf_set &&
|
78
|
+
self.parent_leaves == clade.parent_leaves &&
|
79
|
+
self.other_leaves == clade.other_leaves &&
|
80
|
+
self.single_tag_info == clade.single_tag_info &&
|
81
|
+
self.all_tags == clade.all_tags
|
80
82
|
)
|
81
83
|
end
|
82
84
|
|
@@ -99,7 +101,7 @@ module TreeClusters
|
|
99
101
|
tag_info = self.all_leaves.map do |leaf|
|
100
102
|
assert name2tag.has_key?(leaf),
|
101
103
|
"leaf #{leaf} is not present in name2tag ht for " +
|
102
|
-
|
104
|
+
"md_cat #{md_cat}"
|
103
105
|
|
104
106
|
name2tag[leaf]
|
105
107
|
end
|
@@ -113,11 +115,11 @@ module TreeClusters
|
|
113
115
|
[unquote(node.name)]
|
114
116
|
else
|
115
117
|
node.
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
118
|
+
descendants.
|
119
|
+
flatten.
|
120
|
+
uniq.
|
121
|
+
select {|node| node.leaf?}.
|
122
|
+
map {|node| unquote(node.name)}
|
121
123
|
end
|
122
124
|
end
|
123
125
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tree_clusters
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-07-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -173,6 +173,7 @@ email:
|
|
173
173
|
- moorer@udel.edu
|
174
174
|
executables:
|
175
175
|
- clade_attrs
|
176
|
+
- key_cols
|
176
177
|
- snazzy_clades
|
177
178
|
- snazzy_clades_attrs
|
178
179
|
- snazzy_clades_key_cols
|
@@ -190,6 +191,7 @@ files:
|
|
190
191
|
- bin/console
|
191
192
|
- bin/setup
|
192
193
|
- exe/clade_attrs
|
194
|
+
- exe/key_cols
|
193
195
|
- exe/snazzy_clades
|
194
196
|
- exe/snazzy_clades_attrs
|
195
197
|
- exe/snazzy_clades_key_cols
|
@@ -205,6 +207,7 @@ files:
|
|
205
207
|
- test_files/small.attrs
|
206
208
|
- test_files/small.mapping
|
207
209
|
- test_files/small.tre
|
210
|
+
- test_files/small2.aln
|
208
211
|
- test_files/small_aln_bad_ids
|
209
212
|
- test_files/small_mapping_bad_ids
|
210
213
|
- test_files/small_tree_bad_ids
|