tree_clusters 0.8.0 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 588e0cf7da9cd87056e339593b2de594aa7f7e34
4
- data.tar.gz: a786a20ee672a2372ca3f1a1d1e8d7259acf4b16
3
+ metadata.gz: 5f37ec0e062c33d19f8e525279d587d5512ba106
4
+ data.tar.gz: a59464b000338b3fc1aa1fa2c0ee267b5231df73
5
5
  SHA512:
6
- metadata.gz: dbf4875a27201c9cb1b406427e3c1fde6b195f71493b62116bcf676f97bcaf590fec7a8cfb4433a14a861093d7106a6cdacf2b190d03b7487c40bcac4726ef3a
7
- data.tar.gz: 2bc0115fbeda56da3e8455cde72d859895e0e7b7670ceeeeac57ecc8434c656eedda846a653fbcfe69fdf33867c6b940a0ddaad55d0aaf8788606dfe261a702e
6
+ metadata.gz: 53b9025669f26719174eb57376d7f1826095a074f896af1352bef944833f0ebfca95c94d444f6d990e7f97bd06002098b13e43c305e4416492114f4bd44408be
7
+ data.tar.gz: 1aadc375ce71361769e7f79768b3cbcd45606ac832100363fb68b79831d20019d2532b1d229d8b7e1d7b6becd49a5bb6ca72aea228d241a202686cfcf2d6e36c
data/.gitignore CHANGED
@@ -30,5 +30,6 @@ todo.txt
30
30
  *.lock
31
31
 
32
32
  .idea
33
+ ARST
33
34
 
34
35
  exe/key_cols2
data/README.md CHANGED
@@ -20,13 +20,25 @@ Or install it yourself as:
20
20
 
21
21
  $ gem install tree_clusters
22
22
 
23
- ## Documentation
23
+ ## Command line scripts
24
+
25
+ ### key_cols
26
+
27
+ Use this to find key columns in an alignment. For more information, run
28
+
29
+ ```bash
30
+ key_cols --help
31
+ ```
32
+
33
+ ## API
34
+
35
+ ### Documentation
24
36
 
25
37
  Check out
26
38
  [TreeClusters docs](http://rubydoc.info/gems/tree_clusters)
27
39
  for the full api documentation.
28
40
 
29
- ## Usage
41
+ ### Usage
30
42
 
31
43
  Here is a small example.
32
44
 
data/exe/key_cols CHANGED
@@ -9,7 +9,8 @@ require "fileutils"
9
9
 
10
10
  TreeClusters.extend TreeClusters
11
11
 
12
- GREETING = "The '#{__FILE__}' program"
12
+ PROGRAM = "key_cols"
13
+ GREETING = "The #{PROGRAM} program"
13
14
  UNDERLINE = "=" * GREETING.length
14
15
 
15
16
  opts = Trollop.options do
@@ -46,12 +47,12 @@ opts = Trollop.options do
46
47
 
47
48
  banner <<-EOS
48
49
 
49
- #{GREETING}
50
- #{UNDERLINE}
50
+ The key_cols program
51
+ ====================
51
52
 
52
- Hi. My name is #{__FILE__}. If you give me a Newick tree file and
53
- an alignment file (fasta format), I will tell you key columns for
54
- all clades/clusters that have them.
53
+ Hi. My name is key_cols. If you give me a Newick tree file and an
54
+ alignment file (fasta format), I will tell you key columns for all
55
+ clades/clusters that have them.
55
56
 
56
57
  Overview
57
58
  --------
@@ -62,9 +63,8 @@ Overview
62
63
 
63
64
  Here's an example....
64
65
 
65
- After you run me (#{__FILE__} is my name), you'll get an output file
66
- with the extension, '*.tree_clusters.key_cols.txt'. It may look
67
- something like this:
66
+ After you run me, you'll get an output file with the extension,
67
+ '*.tree_clusters.key_cols.txt'. It may look something like this:
68
68
 
69
69
  cluster_A 4 1-A 2-A 3-A 5-G
70
70
  cluster_B 4 1-C 2-C 3-C 5-A
@@ -93,6 +93,10 @@ Notes & Gotchas
93
93
  - I ignore columns with gap chars (currently just '-') regardless of
94
94
  column entropy.
95
95
 
96
+ - If your Newick tree has parentheses "(" or ")" in leaf names
97
+ you'll get a parsing error even if the name is single quoted.
98
+ We'll fix this at some point.
99
+
96
100
  Option info
97
101
  -----------
98
102
 
@@ -120,16 +124,16 @@ Option info
120
124
 
121
125
  opt(:outdir,
122
126
  "Output directory",
123
- default: ".")
127
+ default: "key_cols_output")
124
128
  opt(:base,
125
129
  "Basename for output",
126
- default: "snazzy_clades")
130
+ default: "key_cols_output")
127
131
  end
128
132
 
129
133
  abort_if opts[:tree].nil?,
130
- "--tree is a required arg. Try running: #{__FILE__} --help"
134
+ "--tree is a required arg. Try running: #{PROGRAM} --help"
131
135
  abort_if opts[:aln].nil?,
132
- "--aln is a required arg. Try running: #{__FILE__} --help"
136
+ "--aln is a required arg. Try running: #{PROGRAM} --help"
133
137
 
134
138
  abort_unless_file_exists opts[:tree]
135
139
  abort_unless_file_exists opts[:aln]
@@ -143,40 +147,52 @@ abort_unless opts[:clade_size_cutoff] >= 1,
143
147
 
144
148
  FileUtils.mkdir_p opts[:outdir]
145
149
 
150
+ AbortIf.logger.info { "Parsing input files" }
151
+
146
152
  tree = NewickTree.fromFile opts[:tree]
147
153
  leaf2attrs = TreeClusters.read_alignment opts[:aln]
148
154
 
149
- members_fname =
150
- File.join opts[:outdir],
151
- "#{opts[:base]}.tree_clusters.clade_members.txt"
152
- key_cols_fname =
153
- File.join opts[:outdir],
154
- "#{opts[:base]}.tree_clusters.key_cols.txt"
155
+ members_fname =
156
+ File.join opts[:outdir],
157
+ "#{opts[:base]}.tree_clusters.clade_members.txt"
158
+ key_cols_fname =
159
+ File.join opts[:outdir],
160
+ "#{opts[:base]}.tree_clusters.key_cols.txt"
155
161
  annotated_tree_fname =
156
- File.join opts[:outdir],
157
- "#{opts[:base]}.tree_clusters.annotated_tree.txt"
162
+ File.join opts[:outdir],
163
+ "#{opts[:base]}.tree_clusters.annotated_tree.txt"
158
164
 
159
- clade_members_f =
160
- File.open(members_fname, "w")
161
- key_cols_f =
162
- File.open(key_cols_fname, "w")
163
- annotated_tree_f =
164
- File.open(annotated_tree_fname, "w")
165
+ clade_members_f =
166
+ File.open(members_fname, "w")
167
+ key_cols_f =
168
+ File.open(key_cols_fname, "w")
169
+ annotated_tree_f =
170
+ File.open(annotated_tree_fname, "w")
165
171
 
166
172
  key_col_sets = {}
173
+ clade_sizes = {}
174
+ clade_count = TreeClusters.all_clades(tree).count
167
175
 
176
+ AbortIf.logger.info { "Processing clades" }
168
177
  begin
169
178
  TreeClusters.all_clades(tree).sort_by {|cl| cl.all_leaves.count}.reverse.each_with_index do |clade, idx|
170
- clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
179
+ if ((idx + 1) % 100).zero?
180
+ perc = ((idx + 1) / clade_count.to_f * 100).round 2
181
+
182
+ STDERR.printf("Processing clades: #{perc}%\r")
183
+ end
184
+
185
+ clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
186
+ clade_sizes[clade_id] = clade.all_leaves.count
171
187
 
172
188
  clade_members_f.puts [clade_id,
173
189
  clade.all_leaves.count,
174
190
  clade.all_leaves].join "\t"
175
191
 
176
- key_cols_all_leaves =
177
- TreeClusters.low_ent_cols_with_bases clade.all_leaves,
178
- leaf2attrs,
179
- opts[:entropy_cutoff]
192
+ key_cols_all_leaves =
193
+ TreeClusters.low_ent_cols_with_bases clade.all_leaves,
194
+ leaf2attrs,
195
+ opts[:entropy_cutoff]
180
196
 
181
197
  unless key_col_sets.has_key? key_cols_all_leaves
182
198
  key_col_sets[key_cols_all_leaves] = Set.new [clade_id]
@@ -185,17 +201,21 @@ begin
185
201
 
186
202
  # This will change the node in the original NewickTree
187
203
  clade.node.name = "'#{clade_id}'"
188
-
189
204
  end
190
205
 
206
+ AbortIf.logger.info { "Writing results" }
207
+
191
208
  # We only want key column sets that are unique to a single clade.
192
- key_col_sets.select { |_, clades| clades.count == 1 }.each do |kc_set, clades|
209
+ key_col_sets.select {|_, clades| clades.count == 1}.each do |kc_set, clades|
193
210
  clade_id = clades.first
194
- key_cols_f.puts [
195
- clade_id,
196
- kc_set.count,
197
- kc_set.map { |pos, bases| "#{pos}-#{bases.join}" }
198
- ].join "\t"
211
+
212
+ # TODO should we just skip processing clades that are too small rather than just not printing them out?
213
+ if clade_sizes[clade_id] > opts[:clade_size_cutoff]
214
+ key_cols_f.puts [clade_id,
215
+ kc_set.count,
216
+ kc_set.map {|pos, bases| "#{pos}-#{bases.join}"}
217
+ ].join "\t"
218
+ end
199
219
  end
200
220
 
201
221
  annotated_tree_f.puts tree.to_s.sub(/;+$/, ";")
@@ -1,3 +1,3 @@
1
1
  module TreeClusters
2
- VERSION = "0.8.0"
2
+ VERSION = "0.8.1"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tree_clusters
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.0
4
+ version: 0.8.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore