tree_clusters 0.8.0 → 0.8.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 588e0cf7da9cd87056e339593b2de594aa7f7e34
4
- data.tar.gz: a786a20ee672a2372ca3f1a1d1e8d7259acf4b16
3
+ metadata.gz: 5f37ec0e062c33d19f8e525279d587d5512ba106
4
+ data.tar.gz: a59464b000338b3fc1aa1fa2c0ee267b5231df73
5
5
  SHA512:
6
- metadata.gz: dbf4875a27201c9cb1b406427e3c1fde6b195f71493b62116bcf676f97bcaf590fec7a8cfb4433a14a861093d7106a6cdacf2b190d03b7487c40bcac4726ef3a
7
- data.tar.gz: 2bc0115fbeda56da3e8455cde72d859895e0e7b7670ceeeeac57ecc8434c656eedda846a653fbcfe69fdf33867c6b940a0ddaad55d0aaf8788606dfe261a702e
6
+ metadata.gz: 53b9025669f26719174eb57376d7f1826095a074f896af1352bef944833f0ebfca95c94d444f6d990e7f97bd06002098b13e43c305e4416492114f4bd44408be
7
+ data.tar.gz: 1aadc375ce71361769e7f79768b3cbcd45606ac832100363fb68b79831d20019d2532b1d229d8b7e1d7b6becd49a5bb6ca72aea228d241a202686cfcf2d6e36c
data/.gitignore CHANGED
@@ -30,5 +30,6 @@ todo.txt
30
30
  *.lock
31
31
 
32
32
  .idea
33
+ ARST
33
34
 
34
35
  exe/key_cols2
data/README.md CHANGED
@@ -20,13 +20,25 @@ Or install it yourself as:
20
20
 
21
21
  $ gem install tree_clusters
22
22
 
23
- ## Documentation
23
+ ## Command line scripts
24
+
25
+ ### key_cols
26
+
27
+ Use this to find key columns in an alignment. For more information, run
28
+
29
+ ```bash
30
+ key_cols --help
31
+ ```
32
+
33
+ ## API
34
+
35
+ ### Documentation
24
36
 
25
37
  Check out
26
38
  [TreeClusters docs](http://rubydoc.info/gems/tree_clusters)
27
39
  for the full api documentation.
28
40
 
29
- ## Usage
41
+ ### Usage
30
42
 
31
43
  Here is a small example.
32
44
 
data/exe/key_cols CHANGED
@@ -9,7 +9,8 @@ require "fileutils"
9
9
 
10
10
  TreeClusters.extend TreeClusters
11
11
 
12
- GREETING = "The '#{__FILE__}' program"
12
+ PROGRAM = "key_cols"
13
+ GREETING = "The #{PROGRAM} program"
13
14
  UNDERLINE = "=" * GREETING.length
14
15
 
15
16
  opts = Trollop.options do
@@ -46,12 +47,12 @@ opts = Trollop.options do
46
47
 
47
48
  banner <<-EOS
48
49
 
49
- #{GREETING}
50
- #{UNDERLINE}
50
+ The key_cols program
51
+ ====================
51
52
 
52
- Hi. My name is #{__FILE__}. If you give me a Newick tree file and
53
- an alignment file (fasta format), I will tell you key columns for
54
- all clades/clusters that have them.
53
+ Hi. My name is key_cols. If you give me a Newick tree file and an
54
+ alignment file (fasta format), I will tell you key columns for all
55
+ clades/clusters that have them.
55
56
 
56
57
  Overview
57
58
  --------
@@ -62,9 +63,8 @@ Overview
62
63
 
63
64
  Here's an example....
64
65
 
65
- After you run me (#{__FILE__} is my name), you'll get an output file
66
- with the extension, '*.tree_clusters.key_cols.txt'. It may look
67
- something like this:
66
+ After you run me, you'll get an output file with the extension,
67
+ '*.tree_clusters.key_cols.txt'. It may look something like this:
68
68
 
69
69
  cluster_A 4 1-A 2-A 3-A 5-G
70
70
  cluster_B 4 1-C 2-C 3-C 5-A
@@ -93,6 +93,10 @@ Notes & Gotchas
93
93
  - I ignore columns with gap chars (currently just '-') regardless of
94
94
  column entropy.
95
95
 
96
+ - If your Newick tree has parentheses "(" or ")" in leaf names
97
+ you'll get a parsing error even if the name is single quoted.
98
+ We'll fix this at some point.
99
+
96
100
  Option info
97
101
  -----------
98
102
 
@@ -120,16 +124,16 @@ Option info
120
124
 
121
125
  opt(:outdir,
122
126
  "Output directory",
123
- default: ".")
127
+ default: "key_cols_output")
124
128
  opt(:base,
125
129
  "Basename for output",
126
- default: "snazzy_clades")
130
+ default: "key_cols_output")
127
131
  end
128
132
 
129
133
  abort_if opts[:tree].nil?,
130
- "--tree is a required arg. Try running: #{__FILE__} --help"
134
+ "--tree is a required arg. Try running: #{PROGRAM} --help"
131
135
  abort_if opts[:aln].nil?,
132
- "--aln is a required arg. Try running: #{__FILE__} --help"
136
+ "--aln is a required arg. Try running: #{PROGRAM} --help"
133
137
 
134
138
  abort_unless_file_exists opts[:tree]
135
139
  abort_unless_file_exists opts[:aln]
@@ -143,40 +147,52 @@ abort_unless opts[:clade_size_cutoff] >= 1,
143
147
 
144
148
  FileUtils.mkdir_p opts[:outdir]
145
149
 
150
+ AbortIf.logger.info { "Parsing input files" }
151
+
146
152
  tree = NewickTree.fromFile opts[:tree]
147
153
  leaf2attrs = TreeClusters.read_alignment opts[:aln]
148
154
 
149
- members_fname =
150
- File.join opts[:outdir],
151
- "#{opts[:base]}.tree_clusters.clade_members.txt"
152
- key_cols_fname =
153
- File.join opts[:outdir],
154
- "#{opts[:base]}.tree_clusters.key_cols.txt"
155
+ members_fname =
156
+ File.join opts[:outdir],
157
+ "#{opts[:base]}.tree_clusters.clade_members.txt"
158
+ key_cols_fname =
159
+ File.join opts[:outdir],
160
+ "#{opts[:base]}.tree_clusters.key_cols.txt"
155
161
  annotated_tree_fname =
156
- File.join opts[:outdir],
157
- "#{opts[:base]}.tree_clusters.annotated_tree.txt"
162
+ File.join opts[:outdir],
163
+ "#{opts[:base]}.tree_clusters.annotated_tree.txt"
158
164
 
159
- clade_members_f =
160
- File.open(members_fname, "w")
161
- key_cols_f =
162
- File.open(key_cols_fname, "w")
163
- annotated_tree_f =
164
- File.open(annotated_tree_fname, "w")
165
+ clade_members_f =
166
+ File.open(members_fname, "w")
167
+ key_cols_f =
168
+ File.open(key_cols_fname, "w")
169
+ annotated_tree_f =
170
+ File.open(annotated_tree_fname, "w")
165
171
 
166
172
  key_col_sets = {}
173
+ clade_sizes = {}
174
+ clade_count = TreeClusters.all_clades(tree).count
167
175
 
176
+ AbortIf.logger.info { "Processing clades" }
168
177
  begin
169
178
  TreeClusters.all_clades(tree).sort_by {|cl| cl.all_leaves.count}.reverse.each_with_index do |clade, idx|
170
- clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
179
+ if ((idx + 1) % 100).zero?
180
+ perc = ((idx + 1) / clade_count.to_f * 100).round 2
181
+
182
+ STDERR.printf("Processing clades: #{perc}%\r")
183
+ end
184
+
185
+ clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
186
+ clade_sizes[clade_id] = clade.all_leaves.count
171
187
 
172
188
  clade_members_f.puts [clade_id,
173
189
  clade.all_leaves.count,
174
190
  clade.all_leaves].join "\t"
175
191
 
176
- key_cols_all_leaves =
177
- TreeClusters.low_ent_cols_with_bases clade.all_leaves,
178
- leaf2attrs,
179
- opts[:entropy_cutoff]
192
+ key_cols_all_leaves =
193
+ TreeClusters.low_ent_cols_with_bases clade.all_leaves,
194
+ leaf2attrs,
195
+ opts[:entropy_cutoff]
180
196
 
181
197
  unless key_col_sets.has_key? key_cols_all_leaves
182
198
  key_col_sets[key_cols_all_leaves] = Set.new [clade_id]
@@ -185,17 +201,21 @@ begin
185
201
 
186
202
  # This will change the node in the original NewickTree
187
203
  clade.node.name = "'#{clade_id}'"
188
-
189
204
  end
190
205
 
206
+ AbortIf.logger.info { "Writing results" }
207
+
191
208
  # We only want key column sets that are unique to a single clade.
192
- key_col_sets.select { |_, clades| clades.count == 1 }.each do |kc_set, clades|
209
+ key_col_sets.select {|_, clades| clades.count == 1}.each do |kc_set, clades|
193
210
  clade_id = clades.first
194
- key_cols_f.puts [
195
- clade_id,
196
- kc_set.count,
197
- kc_set.map { |pos, bases| "#{pos}-#{bases.join}" }
198
- ].join "\t"
211
+
212
+ # TODO should we just skip processing clades that are too small rather than just not printing them out?
213
+ if clade_sizes[clade_id] > opts[:clade_size_cutoff]
214
+ key_cols_f.puts [clade_id,
215
+ kc_set.count,
216
+ kc_set.map {|pos, bases| "#{pos}-#{bases.join}"}
217
+ ].join "\t"
218
+ end
199
219
  end
200
220
 
201
221
  annotated_tree_f.puts tree.to_s.sub(/;+$/, ";")
@@ -1,3 +1,3 @@
1
1
  module TreeClusters
2
- VERSION = "0.8.0"
2
+ VERSION = "0.8.1"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tree_clusters
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.0
4
+ version: 0.8.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore