tree_clusters 0.5.2 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/exe/snazzy_clades_attrs +223 -0
- data/lib/tree_clusters.rb +39 -3
- data/lib/tree_clusters/version.rb +1 -1
- data/test_files/small.attrs +17 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 612ebc10d0f23a15ae9d11d35be3f1de197d29fd
|
4
|
+
data.tar.gz: '084ff094f024a062e1c0f1476b9105950f796edf'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 247778fdebedd3213d96ccafc52ce1d5667bdb9e8eefee60e30b3dbf1fa618ea73000d6d2a5802e47e84d9ba6fa4c119fd3acb1d21907c4c268f63e38949ffcd
|
7
|
+
data.tar.gz: e8d5cc7d6aa7ce86f277f42c5a5b8bd7b7d2bebe5ac3fbf5c4201315bc7aa8e411940056c30dce22226864d982083605180bfe7304c824566e107a43a308c910
|
data/.gitignore
CHANGED
(diff body for data/.gitignore is missing from this capture)
data/exe/snazzy_clades_attrs
ADDED
@@ -0,0 +1,223 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
Signal.trap("PIPE", "EXIT")
|
4
|
+
|
5
|
+
require "tree_clusters"
|
6
|
+
require "trollop"
|
7
|
+
require "parse_fasta"
|
8
|
+
require "shannon"
|
9
|
+
require "fileutils"
|
10
|
+
|
11
|
+
TreeClusters.extend TreeClusters
|
12
|
+
|
13
|
+
# Writes one tab-separated record for a clade/attribute-category pair.
#
# The record consists of +clade_id+, +attr_cat+, and then every member
# of +attr_set+, each in its own field.  Array#join flattens the nested
# array, so an empty +attr_set+ still yields a trailing tab after the
# category.
#
# @param outf [#puts] stream the record is written to
# @param clade_id [String] identifier of the clade
# @param attr_cat [String] name of the attribute category
# @param attr_set [#to_a] attribute values to list (a Set in this script)
def puts_info(outf, clade_id, attr_cat, attr_set)
  outf.puts([clade_id, attr_cat, attr_set.to_a].join("\t"))
end
|
16
|
+
|
17
|
+
opts = Trollop.options do
|
18
|
+
version TreeClusters::VERSION
|
19
|
+
|
20
|
+
banner <<-EOS
|
21
|
+
|
22
|
+
|
23
|
+
Checking IDs
|
24
|
+
------------
|
25
|
+
|
26
|
+
IDs for the sequences must match between the three input files.
|
27
|
+
|
28
|
+
The tree file is allowed to have quoted taxa names, but the mapping
|
29
|
+
file and alignment file are not.
|
30
|
+
|
31
|
+
If your alignment file has spaces in the name, the ID part of the
|
32
|
+
header (i.e., the part up until the space) must match with the
|
33
|
+
sequence IDs in the tree and the mapping file.
|
34
|
+
|
35
|
+
Example: This would be okay.
|
36
|
+
|
37
|
+
tree file:
|
38
|
+
('genome_A', 'genome_B');
|
39
|
+
|
40
|
+
aln file:
|
41
|
+
>genome_A apple pie
|
42
|
+
AAAAA
|
43
|
+
>genome_B brown sugar
|
44
|
+
AATTA
|
45
|
+
|
46
|
+
mapping file:
|
47
|
+
name coolness
|
48
|
+
genome_A cool
|
49
|
+
genome_B notcool
|
50
|
+
|
51
|
+
|
52
|
+
Subtracting parent nodes
|
53
|
+
------------------------
|
54
|
+
|
55
|
+
If a clade's parent would be the root of the tree, no columns will
|
56
|
+
be subtracted when removing the parent columns as it would be the
|
57
|
+
entire alignment.
|
58
|
+
|
59
|
+
Options:
|
60
|
+
EOS
|
61
|
+
|
62
|
+
opt(:tree,
|
63
|
+
"Newick tree file",
|
64
|
+
type: :string)
|
65
|
+
opt(:mapping,
|
66
|
+
"Mapping file",
|
67
|
+
type: :string)
|
68
|
+
opt(:attrs,
|
69
|
+
"Attributes file",
|
70
|
+
type: :string)
|
71
|
+
|
72
|
+
opt(:entropy_cutoff,
|
73
|
+
"Cutoff to consider a column low entropy",
|
74
|
+
default: 0.0)
|
75
|
+
opt(:clade_size_cutoff,
|
76
|
+
"Consider only clades with at least this many leaves",
|
77
|
+
default: 1)
|
78
|
+
|
79
|
+
opt(:outdir,
|
80
|
+
"Output directory",
|
81
|
+
default: ".")
|
82
|
+
opt(:base,
|
83
|
+
"Basename for output",
|
84
|
+
default: "snazzy_clades")
|
85
|
+
end
|
86
|
+
|
87
|
+
abort_if opts[:tree].nil?,
|
88
|
+
"--tree is a required arg"
|
89
|
+
abort_if opts[:mapping].nil?,
|
90
|
+
"--mapping is a required arg"
|
91
|
+
abort_if opts[:attrs].nil?,
|
92
|
+
"--attrs is a required arg"
|
93
|
+
|
94
|
+
abort_unless_file_exists opts[:tree]
|
95
|
+
abort_unless_file_exists opts[:mapping]
|
96
|
+
abort_unless_file_exists opts[:attrs]
|
97
|
+
|
98
|
+
# TODO check IDs when attrs is not a fasta file
|
99
|
+
# TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:attrs]
|
100
|
+
|
101
|
+
abort_unless opts[:entropy_cutoff] >= 0,
|
102
|
+
"--entropy-cutoff must be >= 0"
|
103
|
+
abort_unless opts[:clade_size_cutoff] >= 1,
|
104
|
+
"--clade-size-cutoff must be >= 1"
|
105
|
+
|
106
|
+
FileUtils.mkdir_p opts[:outdir]
|
107
|
+
|
108
|
+
tree = NewickTree.fromFile opts[:tree]
|
109
|
+
metadata = TreeClusters.read_mapping_file opts[:mapping]
|
110
|
+
snazzy_clades = TreeClusters.snazzy_clades tree, metadata
|
111
|
+
attr_names, leaf2attrs = TreeClusters.read_attrs_file opts[:attrs]
|
112
|
+
|
113
|
+
clades_fname =
|
114
|
+
File.join opts[:outdir],
|
115
|
+
"#{opts[:base]}.snazzy_clades.txt"
|
116
|
+
members_fname =
|
117
|
+
File.join opts[:outdir],
|
118
|
+
"#{opts[:base]}.snazzy_clades_clade_members.txt"
|
119
|
+
attrs_fname =
|
120
|
+
File.join opts[:outdir],
|
121
|
+
"#{opts[:base]}.snazzy_clades_attrs_union.txt"
|
122
|
+
attrs_intersection_fname =
|
123
|
+
File.join opts[:outdir],
|
124
|
+
"#{opts[:base]}.snazzy_clades_attrs_intersection.txt"
|
125
|
+
attrs_minus_parent_attrs_fname =
|
126
|
+
File.join opts[:outdir],
|
127
|
+
"#{opts[:base]}.snazzy_clades_attrs_minus_parent_attrs.txt"
|
128
|
+
attrs_minus_sibling_attrs_fname =
|
129
|
+
File.join opts[:outdir],
|
130
|
+
"#{opts[:base]}.snazzy_clades_attrs_minus_sibling_attrs.txt"
|
131
|
+
attrs_minus_other_attrs_fname =
|
132
|
+
File.join opts[:outdir],
|
133
|
+
"#{opts[:base]}.snazzy_clades_attrs_minus_other_attrs.txt"
|
134
|
+
|
135
|
+
|
136
|
+
info_f =
|
137
|
+
File.open(clades_fname, "w")
|
138
|
+
clade_members_f =
|
139
|
+
File.open(members_fname, "w")
|
140
|
+
attrs_f =
|
141
|
+
File.open(attrs_fname, "w")
|
142
|
+
attrs_intersection_f =
|
143
|
+
File.open(attrs_intersection_fname, "w")
|
144
|
+
attrs_minus_parent_attrs_f =
|
145
|
+
File.open(attrs_minus_parent_attrs_fname, "w")
|
146
|
+
attrs_minus_sibling_attrs_f =
|
147
|
+
File.open(attrs_minus_sibling_attrs_fname, "w")
|
148
|
+
attrs_minus_other_attrs_f =
|
149
|
+
File.open(attrs_minus_other_attrs_fname, "w")
|
150
|
+
|
151
|
+
|
152
|
+
# Main output loop: for every snazzy clade, write its metadata summary,
# its member leaves, and (per attribute category) several views of the
# attributes carried by its leaves.  All output handles are closed in
# the ensure clause, even if an assertion or write fails part-way.
begin
  # info is { metadata_category => metadata_tag , ... }
  snazzy_clades.each_with_index do |(clade, info), idx|
    assert(clade.all_leaves.all? { |leaf| leaf2attrs.has_key?(leaf) },
           "Not all leaves are present in the leaf2attrs hash table")

    # Clade IDs are 1-based in the output files.
    clade_id = "clade_#{idx+1}___#{clade.name}"

    info_f.puts([clade_id,
                 info.count,
                 info.map { |pair| pair.join("|") }].join("\t"))

    clade_members_f.puts([clade_id,
                          clade.all_leaves.count,
                          clade.all_leaves].join("\t"))

    attr_names.each do |attr_category|
      # NOTE(review): leaf2attrs.attrs is defined outside this script;
      # its result is only used via #union and #intersection here.
      attrs_all_leaves =
        leaf2attrs.attrs(clade.all_leaves, attr_category)

      attrs_all_sibling_leaves =
        leaf2attrs.attrs(clade.all_sibling_leaves, attr_category)
      attrs_parent_leaves =
        leaf2attrs.attrs(clade.parent_leaves, attr_category)
      attrs_other_leaves =
        leaf2attrs.attrs(clade.other_leaves, attr_category)

      # Attributes seen in this clade minus those seen in the
      # parent / sibling / other leaves, respectively.
      attrs_all_minus_parent =
        attrs_all_leaves.union - attrs_parent_leaves.union
      attrs_all_minus_sibling =
        attrs_all_leaves.union - attrs_all_sibling_leaves.union
      attrs_all_minus_other =
        attrs_all_leaves.union - attrs_other_leaves.union

      puts_info(attrs_f,
                clade_id,
                attr_category,
                attrs_all_leaves.union)

      puts_info(attrs_intersection_f,
                clade_id,
                attr_category,
                attrs_all_leaves.intersection)

      puts_info(attrs_minus_parent_attrs_f,
                clade_id,
                attr_category,
                attrs_all_minus_parent)

      puts_info(attrs_minus_sibling_attrs_f,
                clade_id,
                attr_category,
                attrs_all_minus_sibling)

      puts_info(attrs_minus_other_attrs_f,
                clade_id,
                attr_category,
                attrs_all_minus_other)
    end
  end
ensure
  # Close every handle opened above.  attrs_intersection_f was
  # previously missing from this list and was never closed.
  [info_f,
   clade_members_f,
   attrs_f,
   attrs_intersection_f,
   attrs_minus_parent_attrs_f,
   attrs_minus_sibling_attrs_f,
   attrs_minus_other_attrs_f].each(&:close)
end
|
data/lib/tree_clusters.rb
CHANGED
@@ -149,11 +149,8 @@ module TreeClusters
|
|
149
149
|
metadata.each do |md_cat, leaf2mdtag|
|
150
150
|
already_checked = Set.new
|
151
151
|
single_tag_clades = {}
|
152
|
-
p [md_cat, leaf2mdtag]
|
153
152
|
|
154
153
|
clades.each do |clade|
|
155
|
-
p [clade.name, clade.all_leaves]
|
156
|
-
|
157
154
|
assert clade.all_leaves.count > 1,
|
158
155
|
"A clade cannot also be a leaf"
|
159
156
|
|
@@ -220,6 +217,45 @@ module TreeClusters
|
|
220
217
|
metadata
|
221
218
|
end
|
222
219
|
|
220
|
+
# Reads a tab-separated attributes file.
#
# The first line is treated as a header and skipped.  Every other line
# is split on tabs into leaf name, attribute name and attribute value.
# Lines that contain no fields at all (e.g. a trailing blank line) are
# ignored.
#
# @param fname [String] path to the attributes file
# @return [Array] a two element array: the sorted attribute names, and
#   a TreeClusters::Attrs mapping each leaf to a Hash of
#   { attribute name => Set of values }.  Every leaf has an entry for
#   every attribute name; names with no values for that leaf map to an
#   empty Set.
def read_attrs_file fname
  rows = []

  # Block form so the handle is closed even if parsing raises.
  File.open(fname, "rt") do |f|
    f.each_line.with_index do |line, idx|
      next if idx.zero? # header line

      fields = line.chomp.split "\t"
      next if fields.empty? # blank line

      leaf, attr_name, attr_val = fields
      rows << [leaf, attr_name, attr_val]
    end
  end

  attr_names = rows.map { |_, attr_name, _| attr_name }.uniq.sort

  attrs = TreeClusters::Attrs.new

  rows.each do |leaf, attr_name, attr_val|
    unless attrs.has_key? leaf
      # Pre-fill every known attribute name so lookups never miss.
      attrs[leaf] = {}
      attr_names.each { |name| attrs[leaf][name] = Set.new }
    end

    attrs[leaf][attr_name] << attr_val
  end

  [attr_names, attrs]
end
|
258
|
+
|
223
259
|
# A Hash table for genome/leaf/taxa attributes
|
224
260
|
class Attrs < Hash
|
225
261
|
|
data/test_files/small.attrs
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
name attr name attr val
|
2
|
+
a-1 fruta manzana
|
3
|
+
a-1 fruta pera
|
4
|
+
a-1 color rojo
|
5
|
+
a-2 fruta manzana
|
6
|
+
a-2 color azul
|
7
|
+
a-2 fruta pera
|
8
|
+
a-2 color rojo
|
9
|
+
b-1 fruta pera
|
10
|
+
b-1 color blanco
|
11
|
+
b-2 color blanco
|
12
|
+
bb-1 color blanco
|
13
|
+
bbb-1 color blanco
|
14
|
+
bbb-2 color blanco
|
15
|
+
bbb-2 color gris
|
16
|
+
bbb-1 tamaño pequeña
|
17
|
+
bbb-2 tamaño pequeña
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tree_clusters
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.2
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-11-
|
11
|
+
date: 2017-11-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -173,6 +173,7 @@ email:
|
|
173
173
|
- moorer@udel.edu
|
174
174
|
executables:
|
175
175
|
- snazzy_clades
|
176
|
+
- snazzy_clades_attrs
|
176
177
|
- snazzy_clades_key_cols
|
177
178
|
extensions: []
|
178
179
|
extra_rdoc_files: []
|
@@ -188,6 +189,7 @@ files:
|
|
188
189
|
- bin/console
|
189
190
|
- bin/setup
|
190
191
|
- exe/snazzy_clades
|
192
|
+
- exe/snazzy_clades_attrs
|
191
193
|
- exe/snazzy_clades_key_cols
|
192
194
|
- lib/tree_clusters.rb
|
193
195
|
- lib/tree_clusters/version.rb
|
@@ -195,6 +197,7 @@ files:
|
|
195
197
|
- test_files/non_bifurcating.aln
|
196
198
|
- test_files/non_bifurcating.tre
|
197
199
|
- test_files/small.aln
|
200
|
+
- test_files/small.attrs
|
198
201
|
- test_files/small.mapping
|
199
202
|
- test_files/small.tre
|
200
203
|
- test_files/small_aln_bad_ids
|