tree_clusters 0.8.0 → 0.8.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +14 -2
- data/exe/key_cols +59 -39
- data/lib/tree_clusters/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5f37ec0e062c33d19f8e525279d587d5512ba106
|
4
|
+
data.tar.gz: a59464b000338b3fc1aa1fa2c0ee267b5231df73
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 53b9025669f26719174eb57376d7f1826095a074f896af1352bef944833f0ebfca95c94d444f6d990e7f97bd06002098b13e43c305e4416492114f4bd44408be
|
7
|
+
data.tar.gz: 1aadc375ce71361769e7f79768b3cbcd45606ac832100363fb68b79831d20019d2532b1d229d8b7e1d7b6becd49a5bb6ca72aea228d241a202686cfcf2d6e36c
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -20,13 +20,25 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
$ gem install tree_clusters
|
22
22
|
|
23
|
-
##
|
23
|
+
## Command line scripts
|
24
|
+
|
25
|
+
### key_cols
|
26
|
+
|
27
|
+
Use this to find key columns in an alignment. For more information, run
|
28
|
+
|
29
|
+
```bash
|
30
|
+
key_cols --help
|
31
|
+
```
|
32
|
+
|
33
|
+
## API
|
34
|
+
|
35
|
+
### Documentation
|
24
36
|
|
25
37
|
Check out
|
26
38
|
[TreeClusters docs](http://rubydoc.info/gems/tree_clusters)
|
27
39
|
for the full api documentation.
|
28
40
|
|
29
|
-
|
41
|
+
### Usage
|
30
42
|
|
31
43
|
Here is a small example.
|
32
44
|
|
data/exe/key_cols
CHANGED
@@ -9,7 +9,8 @@ require "fileutils"
|
|
9
9
|
|
10
10
|
TreeClusters.extend TreeClusters
|
11
11
|
|
12
|
-
|
12
|
+
PROGRAM = "key_cols"
|
13
|
+
GREETING = "The #{PROGRAM} program"
|
13
14
|
UNDERLINE = "=" * GREETING.length
|
14
15
|
|
15
16
|
opts = Trollop.options do
|
@@ -46,12 +47,12 @@ opts = Trollop.options do
|
|
46
47
|
|
47
48
|
banner <<-EOS
|
48
49
|
|
49
|
-
|
50
|
-
|
50
|
+
The key_cols program
|
51
|
+
====================
|
51
52
|
|
52
|
-
Hi. My name is
|
53
|
-
|
54
|
-
|
53
|
+
Hi. My name is key_cols. If you give me a Newick tree file and an
|
54
|
+
alignment file (fasta format), I will tell you key columns for all
|
55
|
+
clades/clusters that have them.
|
55
56
|
|
56
57
|
Overview
|
57
58
|
--------
|
@@ -62,9 +63,8 @@ Overview
|
|
62
63
|
|
63
64
|
Here's an example....
|
64
65
|
|
65
|
-
After you run me
|
66
|
-
|
67
|
-
something like this:
|
66
|
+
After you run me, you'll get an output file with the extension,
|
67
|
+
'*.tree_clusters.key_cols.txt'. It may look something like this:
|
68
68
|
|
69
69
|
cluster_A 4 1-A 2-A 3-A 5-G
|
70
70
|
cluster_B 4 1-C 2-C 3-C 5-A
|
@@ -93,6 +93,10 @@ Notes & Gotchas
|
|
93
93
|
- I ignore columns with gap chars (currently just '-') regardless of
|
94
94
|
column entropy.
|
95
95
|
|
96
|
+
- If your Newick tree has parentheses "(" or ")" in leaf names
|
97
|
+
you'll get a parsing error even if the name is single quoted.
|
98
|
+
We'll fix this at some point.
|
99
|
+
|
96
100
|
Option info
|
97
101
|
-----------
|
98
102
|
|
@@ -120,16 +124,16 @@ Option info
|
|
120
124
|
|
121
125
|
opt(:outdir,
|
122
126
|
"Output directory",
|
123
|
-
default: "
|
127
|
+
default: "key_cols_output")
|
124
128
|
opt(:base,
|
125
129
|
"Basename for output",
|
126
|
-
default: "
|
130
|
+
default: "key_cols_output")
|
127
131
|
end
|
128
132
|
|
129
133
|
abort_if opts[:tree].nil?,
|
130
|
-
"--tree is a required arg. Try running: #{
|
134
|
+
"--tree is a required arg. Try running: #{PROGRAM} --help"
|
131
135
|
abort_if opts[:aln].nil?,
|
132
|
-
"--aln is a required arg. Try running: #{
|
136
|
+
"--aln is a required arg. Try running: #{PROGRAM} --help"
|
133
137
|
|
134
138
|
abort_unless_file_exists opts[:tree]
|
135
139
|
abort_unless_file_exists opts[:aln]
|
@@ -143,40 +147,52 @@ abort_unless opts[:clade_size_cutoff] >= 1,
|
|
143
147
|
|
144
148
|
FileUtils.mkdir_p opts[:outdir]
|
145
149
|
|
150
|
+
AbortIf.logger.info { "Parsing input files" }
|
151
|
+
|
146
152
|
tree = NewickTree.fromFile opts[:tree]
|
147
153
|
leaf2attrs = TreeClusters.read_alignment opts[:aln]
|
148
154
|
|
149
|
-
members_fname
|
150
|
-
|
151
|
-
|
152
|
-
key_cols_fname
|
153
|
-
|
154
|
-
|
155
|
+
members_fname =
|
156
|
+
File.join opts[:outdir],
|
157
|
+
"#{opts[:base]}.tree_clusters.clade_members.txt"
|
158
|
+
key_cols_fname =
|
159
|
+
File.join opts[:outdir],
|
160
|
+
"#{opts[:base]}.tree_clusters.key_cols.txt"
|
155
161
|
annotated_tree_fname =
|
156
|
-
|
157
|
-
|
162
|
+
File.join opts[:outdir],
|
163
|
+
"#{opts[:base]}.tree_clusters.annotated_tree.txt"
|
158
164
|
|
159
|
-
clade_members_f
|
160
|
-
|
161
|
-
key_cols_f
|
162
|
-
|
163
|
-
annotated_tree_f
|
164
|
-
|
165
|
+
clade_members_f =
|
166
|
+
File.open(members_fname, "w")
|
167
|
+
key_cols_f =
|
168
|
+
File.open(key_cols_fname, "w")
|
169
|
+
annotated_tree_f =
|
170
|
+
File.open(annotated_tree_fname, "w")
|
165
171
|
|
166
172
|
key_col_sets = {}
|
173
|
+
clade_sizes = {}
|
174
|
+
clade_count = TreeClusters.all_clades(tree).count
|
167
175
|
|
176
|
+
AbortIf.logger.info { "Processing clades" }
|
168
177
|
begin
|
169
178
|
TreeClusters.all_clades(tree).sort_by {|cl| cl.all_leaves.count}.reverse.each_with_index do |clade, idx|
|
170
|
-
|
179
|
+
if ((idx + 1) % 100).zero?
|
180
|
+
perc = ((idx + 1) / clade_count.to_f * 100).round 2
|
181
|
+
|
182
|
+
STDERR.printf("Processing clades: #{perc}%\r")
|
183
|
+
end
|
184
|
+
|
185
|
+
clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
|
186
|
+
clade_sizes[clade_id] = clade.all_leaves.count
|
171
187
|
|
172
188
|
clade_members_f.puts [clade_id,
|
173
189
|
clade.all_leaves.count,
|
174
190
|
clade.all_leaves].join "\t"
|
175
191
|
|
176
|
-
key_cols_all_leaves
|
177
|
-
|
178
|
-
|
179
|
-
|
192
|
+
key_cols_all_leaves =
|
193
|
+
TreeClusters.low_ent_cols_with_bases clade.all_leaves,
|
194
|
+
leaf2attrs,
|
195
|
+
opts[:entropy_cutoff]
|
180
196
|
|
181
197
|
unless key_col_sets.has_key? key_cols_all_leaves
|
182
198
|
key_col_sets[key_cols_all_leaves] = Set.new [clade_id]
|
@@ -185,17 +201,21 @@ begin
|
|
185
201
|
|
186
202
|
# This will change the node in the original NewickTree
|
187
203
|
clade.node.name = "'#{clade_id}'"
|
188
|
-
|
189
204
|
end
|
190
205
|
|
206
|
+
AbortIf.logger.info { "Writing results" }
|
207
|
+
|
191
208
|
# We only want key column sets that are unique to a single clade.
|
192
|
-
key_col_sets.select {
|
209
|
+
key_col_sets.select {|_, clades| clades.count == 1}.each do |kc_set, clades|
|
193
210
|
clade_id = clades.first
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
211
|
+
|
212
|
+
# TODO should we just skip processing clades that are too small rather than just not printing them out?
|
213
|
+
if clade_sizes[clade_id] > opts[:clade_size_cutoff]
|
214
|
+
key_cols_f.puts [clade_id,
|
215
|
+
kc_set.count,
|
216
|
+
kc_set.map {|pos, bases| "#{pos}-#{bases.join}"}
|
217
|
+
].join "\t"
|
218
|
+
end
|
199
219
|
end
|
200
220
|
|
201
221
|
annotated_tree_f.puts tree.to_s.sub(/;+$/, ";")
|