tree_clusters 0.5.2 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/exe/snazzy_clades_attrs +223 -0
- data/lib/tree_clusters.rb +39 -3
- data/lib/tree_clusters/version.rb +1 -1
- data/test_files/small.attrs +17 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 612ebc10d0f23a15ae9d11d35be3f1de197d29fd
|
4
|
+
data.tar.gz: '084ff094f024a062e1c0f1476b9105950f796edf'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 247778fdebedd3213d96ccafc52ce1d5667bdb9e8eefee60e30b3dbf1fa618ea73000d6d2a5802e47e84d9ba6fa4c119fd3acb1d21907c4c268f63e38949ffcd
|
7
|
+
data.tar.gz: e8d5cc7d6aa7ce86f277f42c5a5b8bd7b7d2bebe5ac3fbf5c4201315bc7aa8e411940056c30dce22226864d982083605180bfe7304c824566e107a43a308c910
|
data/.gitignore
CHANGED
@@ -0,0 +1,223 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
Signal.trap("PIPE", "EXIT")
|
4
|
+
|
5
|
+
require "tree_clusters"
|
6
|
+
require "trollop"
|
7
|
+
require "parse_fasta"
|
8
|
+
require "shannon"
|
9
|
+
require "fileutils"
|
10
|
+
|
11
|
+
TreeClusters.extend TreeClusters
|
12
|
+
|
13
|
+
# Writes one tab-separated record to +outf+: the clade ID, the attribute
# category, and then every member of +attr_set+.
#
# The set is kept as a nested array (not splatted) so that an empty set
# still produces a trailing tab, i.e. an empty third column.
def puts_info(outf, clade_id, attr_cat, attr_set)
  fields = [clade_id, attr_cat, attr_set.to_a]
  record = fields.join("\t")

  outf.puts(record)
end
|
16
|
+
|
17
|
+
# Command line interface: three input file options (no defaults), two numeric
# cutoffs, and the output directory / basename.
#
# NOTE(review): the banner text talks about an alignment file and about
# subtracting columns, while this script takes --attrs; it looks like it was
# carried over from the alignment-based tool -- confirm and reword.
opts = Trollop.options do
  version TreeClusters::VERSION

  banner <<-EOS


Checking IDs
------------

IDs for the sequences must match between the three input files.

The tree file is allowed to have quoted taxa names, but the mapping
file and alignment file are not.

If your alignment file has spaces in the name, the ID part of the
header (i.e., the part up until the space) must match with the
sequence IDs in the tree and the mapping file.

Example: This would be okay.

tree file:
('genome_A', 'genome_B');

aln file:
>genome_A apple pie
AAAAA
>genome_B brown sugar
AATTA

mapping file:
name coolness
genome_A cool
genome_B notcool


Subtracting parent nodes
------------------------

If a clade's parent would be the root of the tree, no columns will
be subtracted when removing the parent columns as it would be the
entire alignment.

Options:
  EOS

  # Input files (presence and existence are validated after parsing).
  opt :tree,    "Newick tree file", type: :string
  opt :mapping, "Mapping file",     type: :string
  opt :attrs,   "Attributes file",  type: :string

  # Numeric cutoffs.
  opt :entropy_cutoff,
      "Cutoff to consider a column low entropy",
      default: 0.0
  opt :clade_size_cutoff,
      "Consider only clades with at least this many leaves",
      default: 1

  # Output location.
  opt :outdir, "Output directory",    default: "."
  opt :base,   "Basename for output", default: "snazzy_clades"
end
|
86
|
+
|
87
|
+
# The three input paths are mandatory. Presence is checked for all of them
# first, then existence on disk, in option-declaration order.
input_args = [:tree, :mapping, :attrs]

input_args.each do |arg|
  abort_if opts[arg].nil?, "--#{arg} is a required arg"
end

input_args.each do |arg|
  abort_unless_file_exists opts[arg]
end

# TODO check IDs when attrs is not a fasta file
# TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:attrs]

abort_unless opts[:entropy_cutoff] >= 0,
             "--entropy-cutoff must be >= 0"
abort_unless opts[:clade_size_cutoff] >= 1,
             "--clade-size-cutoff must be >= 1"

FileUtils.mkdir_p opts[:outdir]

# Load the tree, the metadata mapping and the per-leaf attributes.
tree                   = NewickTree.fromFile opts[:tree]
metadata               = TreeClusters.read_mapping_file opts[:mapping]
snazzy_clades          = TreeClusters.snazzy_clades tree, metadata
attr_names, leaf2attrs = TreeClusters.read_attrs_file opts[:attrs]

# Every output lives at <outdir>/<base>.<suffix>.txt
out_path = lambda do |suffix|
  File.join opts[:outdir], "#{opts[:base]}.#{suffix}.txt"
end

clades_fname                    = out_path.call "snazzy_clades"
members_fname                   = out_path.call "snazzy_clades_clade_members"
attrs_fname                     = out_path.call "snazzy_clades_attrs_union"
attrs_intersection_fname        = out_path.call "snazzy_clades_attrs_intersection"
attrs_minus_parent_attrs_fname  = out_path.call "snazzy_clades_attrs_minus_parent_attrs"
attrs_minus_sibling_attrs_fname = out_path.call "snazzy_clades_attrs_minus_sibling_attrs"
attrs_minus_other_attrs_fname   = out_path.call "snazzy_clades_attrs_minus_other_attrs"

# Open every output for writing, in the same order as the names above.
info_f                      = File.open(clades_fname, "w")
clade_members_f             = File.open(members_fname, "w")
attrs_f                     = File.open(attrs_fname, "w")
attrs_intersection_f        = File.open(attrs_intersection_fname, "w")
attrs_minus_parent_attrs_f  = File.open(attrs_minus_parent_attrs_fname, "w")
attrs_minus_sibling_attrs_f = File.open(attrs_minus_sibling_attrs_fname, "w")
attrs_minus_other_attrs_f   = File.open(attrs_minus_other_attrs_fname, "w")
|
150
|
+
|
151
|
+
|
152
|
+
# Writes one record per snazzy clade to the info and members files, and one
# record per (clade, attribute category) to each of the five attribute files.
# All seven output handles are closed in the ensure clause, whether or not
# the loop raises.
begin
  # info is { metadata_category => metadata_tag , ... }
  snazzy_clades.each_with_index do |(clade, info), idx|
    assert clade.all_leaves.all? { |leaf| leaf2attrs.has_key? leaf },
           "Not all leaves are present in the leaf2attrs hash table"

    # Clade IDs are numbered from 1 in iteration order.
    clade_id = "clade_#{idx+1}___#{clade.name}"

    info_f.puts([clade_id,
                 info.count,
                 info.map { |pair| pair.join("|") }].join("\t"))

    clade_members_f.puts([clade_id,
                          clade.all_leaves.count,
                          clade.all_leaves].join("\t"))

    attr_names.each do |attr_category|
      # Attribute collections for this clade's leaves and for the three
      # comparison groups (sibling, parent, and other leaves).
      attrs_all_leaves =
        leaf2attrs.attrs clade.all_leaves, attr_category

      attrs_all_sibling_leaves =
        leaf2attrs.attrs clade.all_sibling_leaves,
                         attr_category
      attrs_parent_leaves =
        leaf2attrs.attrs clade.parent_leaves,
                         attr_category
      attrs_other_leaves =
        leaf2attrs.attrs clade.other_leaves,
                         attr_category

      # Union of the clade's attributes minus the union of each comparison group.
      attrs_all_minus_parent =
        attrs_all_leaves.union - attrs_parent_leaves.union
      attrs_all_minus_sibling =
        attrs_all_leaves.union - attrs_all_sibling_leaves.union
      attrs_all_minus_other =
        attrs_all_leaves.union - attrs_other_leaves.union

      puts_info attrs_f,
                clade_id,
                attr_category,
                attrs_all_leaves.union

      puts_info attrs_intersection_f,
                clade_id,
                attr_category,
                attrs_all_leaves.intersection

      puts_info attrs_minus_parent_attrs_f,
                clade_id,
                attr_category,
                attrs_all_minus_parent

      puts_info attrs_minus_sibling_attrs_f,
                clade_id,
                attr_category,
                attrs_all_minus_sibling

      puts_info attrs_minus_other_attrs_f,
                clade_id,
                attr_category,
                attrs_all_minus_other
    end
  end
ensure
  # Close every output handle, including the intersection file, so none is
  # left open until the interpreter exits.
  [info_f,
   clade_members_f,
   attrs_f,
   attrs_intersection_f,
   attrs_minus_parent_attrs_f,
   attrs_minus_sibling_attrs_f,
   attrs_minus_other_attrs_f].each(&:close)
end
|
data/lib/tree_clusters.rb
CHANGED
@@ -149,11 +149,8 @@ module TreeClusters
|
|
149
149
|
metadata.each do |md_cat, leaf2mdtag|
|
150
150
|
already_checked = Set.new
|
151
151
|
single_tag_clades = {}
|
152
|
-
p [md_cat, leaf2mdtag]
|
153
152
|
|
154
153
|
clades.each do |clade|
|
155
|
-
p [clade.name, clade.all_leaves]
|
156
|
-
|
157
154
|
assert clade.all_leaves.count > 1,
|
158
155
|
"A clade cannot also be a leaf"
|
159
156
|
|
@@ -220,6 +217,45 @@ module TreeClusters
|
|
220
217
|
metadata
|
221
218
|
end
|
222
219
|
|
220
|
+
# Reads a tab-separated attributes file.
#
# The first line is treated as a header and skipped. Every other non-blank
# line is split on tabs into leaf name, attribute name and attribute value;
# any further fields are ignored.
#
# @param fname [String] path to the attributes file
# @return [Array] a two element array: the sorted, de-duplicated attribute
#   names, and a TreeClusters::Attrs mapping each leaf name to a Hash of
#   attribute name => Set of values. Every leaf has an entry for every
#   attribute name, empty when the file lists no value for it.
def read_attrs_file fname
  # Read the file once, using the block form so the handle is closed even if
  # parsing raises. Blank lines (e.g. a trailing newline) are skipped; they
  # would otherwise put a nil attribute name into the list and break the sort.
  rows = File.open(fname, "rt") do |f|
    data_lines = f.each_line.drop(1).map(&:chomp)

    data_lines.
      reject { |line| line.strip.empty? }.
      map { |line| line.split "\t" }
  end

  attr_names = rows.map { |row| row[1] }.uniq.sort

  attrs = TreeClusters::Attrs.new

  rows.each do |leaf, attr_name, attr_val|
    # First time a leaf is seen, give it an empty Set for every attribute.
    attrs[leaf] ||= attr_names.each_with_object({}) do |name, leaf_attrs|
      leaf_attrs[name] = Set.new
    end

    attrs[leaf][attr_name] << attr_val
  end

  [attr_names, attrs]
end
|
258
|
+
|
223
259
|
# A Hash table for genome/leaf/taxa attributes
|
224
260
|
class Attrs < Hash
|
225
261
|
|
@@ -0,0 +1,17 @@
|
|
1
|
+
name attr name attr val
|
2
|
+
a-1 fruta manzana
|
3
|
+
a-1 fruta pera
|
4
|
+
a-1 color rojo
|
5
|
+
a-2 fruta manzana
|
6
|
+
a-2 color azul
|
7
|
+
a-2 fruta pera
|
8
|
+
a-2 color rojo
|
9
|
+
b-1 fruta pera
|
10
|
+
b-1 color blanco
|
11
|
+
b-2 color blanco
|
12
|
+
bb-1 color blanco
|
13
|
+
bbb-1 color blanco
|
14
|
+
bbb-2 color blanco
|
15
|
+
bbb-2 color gris
|
16
|
+
bbb-1 tamaño pequeña
|
17
|
+
bbb-2 tamaño pequeña
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tree_clusters
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.2
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-11-
|
11
|
+
date: 2017-11-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -173,6 +173,7 @@ email:
|
|
173
173
|
- moorer@udel.edu
|
174
174
|
executables:
|
175
175
|
- snazzy_clades
|
176
|
+
- snazzy_clades_attrs
|
176
177
|
- snazzy_clades_key_cols
|
177
178
|
extensions: []
|
178
179
|
extra_rdoc_files: []
|
@@ -188,6 +189,7 @@ files:
|
|
188
189
|
- bin/console
|
189
190
|
- bin/setup
|
190
191
|
- exe/snazzy_clades
|
192
|
+
- exe/snazzy_clades_attrs
|
191
193
|
- exe/snazzy_clades_key_cols
|
192
194
|
- lib/tree_clusters.rb
|
193
195
|
- lib/tree_clusters/version.rb
|
@@ -195,6 +197,7 @@ files:
|
|
195
197
|
- test_files/non_bifurcating.aln
|
196
198
|
- test_files/non_bifurcating.tre
|
197
199
|
- test_files/small.aln
|
200
|
+
- test_files/small.attrs
|
198
201
|
- test_files/small.mapping
|
199
202
|
- test_files/small.tre
|
200
203
|
- test_files/small_aln_bad_ids
|