tree_clusters 0.8.0 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +14 -2
- data/exe/key_cols +59 -39
- data/lib/tree_clusters/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5f37ec0e062c33d19f8e525279d587d5512ba106
|
4
|
+
data.tar.gz: a59464b000338b3fc1aa1fa2c0ee267b5231df73
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 53b9025669f26719174eb57376d7f1826095a074f896af1352bef944833f0ebfca95c94d444f6d990e7f97bd06002098b13e43c305e4416492114f4bd44408be
|
7
|
+
data.tar.gz: 1aadc375ce71361769e7f79768b3cbcd45606ac832100363fb68b79831d20019d2532b1d229d8b7e1d7b6becd49a5bb6ca72aea228d241a202686cfcf2d6e36c
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -20,13 +20,25 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
$ gem install tree_clusters
|
22
22
|
|
23
|
-
##
|
23
|
+
## Command line scripts
|
24
|
+
|
25
|
+
### key_cols
|
26
|
+
|
27
|
+
Use this to find key columns in an alignment. For more information, run
|
28
|
+
|
29
|
+
```bash
|
30
|
+
key_cols --help
|
31
|
+
```
|
32
|
+
|
33
|
+
## API
|
34
|
+
|
35
|
+
### Documentation
|
24
36
|
|
25
37
|
Check out
|
26
38
|
[TreeClusters docs](http://rubydoc.info/gems/tree_clusters)
|
27
39
|
for the full api documentation.
|
28
40
|
|
29
|
-
|
41
|
+
### Usage
|
30
42
|
|
31
43
|
Here is a small example.
|
32
44
|
|
data/exe/key_cols
CHANGED
@@ -9,7 +9,8 @@ require "fileutils"
|
|
9
9
|
|
10
10
|
TreeClusters.extend TreeClusters
|
11
11
|
|
12
|
-
|
12
|
+
PROGRAM = "key_cols"
|
13
|
+
GREETING = "The #{PROGRAM} program"
|
13
14
|
UNDERLINE = "=" * GREETING.length
|
14
15
|
|
15
16
|
opts = Trollop.options do
|
@@ -46,12 +47,12 @@ opts = Trollop.options do
|
|
46
47
|
|
47
48
|
banner <<-EOS
|
48
49
|
|
49
|
-
|
50
|
-
|
50
|
+
The key_cols program
|
51
|
+
====================
|
51
52
|
|
52
|
-
Hi. My name is
|
53
|
-
|
54
|
-
|
53
|
+
Hi. My name is key_cols. If you give me a Newick tree file and an
|
54
|
+
alignment file (fasta format), I will tell you key columns for all
|
55
|
+
clades/clusters that have them.
|
55
56
|
|
56
57
|
Overview
|
57
58
|
--------
|
@@ -62,9 +63,8 @@ Overview
|
|
62
63
|
|
63
64
|
Here's an example....
|
64
65
|
|
65
|
-
After you run me
|
66
|
-
|
67
|
-
something like this:
|
66
|
+
After you run me, you'll get an output file with the extension,
|
67
|
+
'*.tree_clusters.key_cols.txt'. It may look something like this:
|
68
68
|
|
69
69
|
cluster_A 4 1-A 2-A 3-A 5-G
|
70
70
|
cluster_B 4 1-C 2-C 3-C 5-A
|
@@ -93,6 +93,10 @@ Notes & Gotchas
|
|
93
93
|
- I ignore columns with gap chars (currently just '-') regardless of
|
94
94
|
column entropy.
|
95
95
|
|
96
|
+
- If your Newick tree has parentheses "(" or ")" in leaf names
|
97
|
+
you'll get a parsing error even if the name is single quoted.
|
98
|
+
We'll fix this at some point.
|
99
|
+
|
96
100
|
Option info
|
97
101
|
-----------
|
98
102
|
|
@@ -120,16 +124,16 @@ Option info
|
|
120
124
|
|
121
125
|
opt(:outdir,
|
122
126
|
"Output directory",
|
123
|
-
default: "
|
127
|
+
default: "key_cols_output")
|
124
128
|
opt(:base,
|
125
129
|
"Basename for output",
|
126
|
-
default: "
|
130
|
+
default: "key_cols_output")
|
127
131
|
end
|
128
132
|
|
129
133
|
abort_if opts[:tree].nil?,
|
130
|
-
"--tree is a required arg. Try running: #{
|
134
|
+
"--tree is a required arg. Try running: #{PROGRAM} --help"
|
131
135
|
abort_if opts[:aln].nil?,
|
132
|
-
"--aln is a required arg. Try running: #{
|
136
|
+
"--aln is a required arg. Try running: #{PROGRAM} --help"
|
133
137
|
|
134
138
|
abort_unless_file_exists opts[:tree]
|
135
139
|
abort_unless_file_exists opts[:aln]
|
@@ -143,40 +147,52 @@ abort_unless opts[:clade_size_cutoff] >= 1,
|
|
143
147
|
|
144
148
|
FileUtils.mkdir_p opts[:outdir]
|
145
149
|
|
150
|
+
AbortIf.logger.info { "Parsing input files" }
|
151
|
+
|
146
152
|
tree = NewickTree.fromFile opts[:tree]
|
147
153
|
leaf2attrs = TreeClusters.read_alignment opts[:aln]
|
148
154
|
|
149
|
-
members_fname
|
150
|
-
|
151
|
-
|
152
|
-
key_cols_fname
|
153
|
-
|
154
|
-
|
155
|
+
members_fname =
|
156
|
+
File.join opts[:outdir],
|
157
|
+
"#{opts[:base]}.tree_clusters.clade_members.txt"
|
158
|
+
key_cols_fname =
|
159
|
+
File.join opts[:outdir],
|
160
|
+
"#{opts[:base]}.tree_clusters.key_cols.txt"
|
155
161
|
annotated_tree_fname =
|
156
|
-
|
157
|
-
|
162
|
+
File.join opts[:outdir],
|
163
|
+
"#{opts[:base]}.tree_clusters.annotated_tree.txt"
|
158
164
|
|
159
|
-
clade_members_f
|
160
|
-
|
161
|
-
key_cols_f
|
162
|
-
|
163
|
-
annotated_tree_f
|
164
|
-
|
165
|
+
clade_members_f =
|
166
|
+
File.open(members_fname, "w")
|
167
|
+
key_cols_f =
|
168
|
+
File.open(key_cols_fname, "w")
|
169
|
+
annotated_tree_f =
|
170
|
+
File.open(annotated_tree_fname, "w")
|
165
171
|
|
166
172
|
key_col_sets = {}
|
173
|
+
clade_sizes = {}
|
174
|
+
clade_count = TreeClusters.all_clades(tree).count
|
167
175
|
|
176
|
+
AbortIf.logger.info { "Processing clades" }
|
168
177
|
begin
|
169
178
|
TreeClusters.all_clades(tree).sort_by {|cl| cl.all_leaves.count}.reverse.each_with_index do |clade, idx|
|
170
|
-
|
179
|
+
if ((idx + 1) % 100).zero?
|
180
|
+
perc = ((idx + 1) / clade_count.to_f * 100).round 2
|
181
|
+
|
182
|
+
STDERR.print "Processing clades: #{perc}%\r" # print, not printf: a literal "%" before "\r" is a malformed printf format and raises ArgumentError
|
183
|
+
end
|
184
|
+
|
185
|
+
clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
|
186
|
+
clade_sizes[clade_id] = clade.all_leaves.count
|
171
187
|
|
172
188
|
clade_members_f.puts [clade_id,
|
173
189
|
clade.all_leaves.count,
|
174
190
|
clade.all_leaves].join "\t"
|
175
191
|
|
176
|
-
key_cols_all_leaves
|
177
|
-
|
178
|
-
|
179
|
-
|
192
|
+
key_cols_all_leaves =
|
193
|
+
TreeClusters.low_ent_cols_with_bases clade.all_leaves,
|
194
|
+
leaf2attrs,
|
195
|
+
opts[:entropy_cutoff]
|
180
196
|
|
181
197
|
unless key_col_sets.has_key? key_cols_all_leaves
|
182
198
|
key_col_sets[key_cols_all_leaves] = Set.new [clade_id]
|
@@ -185,17 +201,21 @@ begin
|
|
185
201
|
|
186
202
|
# This will change the node in the original NewickTree
|
187
203
|
clade.node.name = "'#{clade_id}'"
|
188
|
-
|
189
204
|
end
|
190
205
|
|
206
|
+
AbortIf.logger.info { "Writing results" }
|
207
|
+
|
191
208
|
# We only want key column sets that are unique to a single clade.
|
192
|
-
key_col_sets.select {
|
209
|
+
key_col_sets.select {|_, clades| clades.count == 1}.each do |kc_set, clades|
|
193
210
|
clade_id = clades.first
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
211
|
+
|
212
|
+
# TODO should we just skip processing clades that are too small rather than just not printing them out?
|
213
|
+
if clade_sizes[clade_id] > opts[:clade_size_cutoff]
|
214
|
+
key_cols_f.puts [clade_id,
|
215
|
+
kc_set.count,
|
216
|
+
kc_set.map {|pos, bases| "#{pos}-#{bases.join}"}
|
217
|
+
].join "\t"
|
218
|
+
end
|
199
219
|
end
|
200
220
|
|
201
221
|
annotated_tree_f.puts tree.to_s.sub(/;+$/, ";")
|