tree_clusters 0.8.2 → 0.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7f7ba7426f7048a684ba53a52611efe18cda9603
4
- data.tar.gz: c511525c40d1b766c08c2a58e6076633d179439c
3
+ metadata.gz: be727e6384cf510290ddab3bf5aabd5075505e77
4
+ data.tar.gz: 69e94311ac35dcb26579edada3d6555b56048cd1
5
5
  SHA512:
6
- metadata.gz: 4f5ad29eb52cbcdf997a7d1f2431bf9f26adf76c534bd6bbc74fc79c05b331f224d7e8b056173e4b49454a324ee5eb14c2f8dc401379552a8b9fed36627e0eff
7
- data.tar.gz: 8d7885844452c979abdfbc71ee85c4ecedefa8818047062ede26eb894bc1913ae612093e07cf68c760aa12654e448b97ce323d6982f5589e2f13821debbf3bd7
6
+ metadata.gz: 5aacaa9b69741eff4f23b62b7be230befc444847cbf869ed0a5394e1fd768b34fc150483a71f079aef6fd683a9c58fe170e8ffdfc6f229b677862ce7b851c875
7
+ data.tar.gz: c3e639c1bced1b5c480e99f0d9e19ecb65a27dc3cf09e81d429714951d47b7b66905b82b9ed20d14ac05d36be0067b9b3de51796447acfd91a814ce73ed07fdc
data/.gitignore CHANGED
@@ -31,5 +31,6 @@ todo.txt
31
31
 
32
32
  .idea
33
33
  ARST
34
+ RNR
34
35
 
35
36
  exe/key_cols2
data/exe/key_cols CHANGED
@@ -1,5 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ # TODO: the block variables named "clade" are not actually instances of the TreeClusters::Clade class, which is confusing.
4
+
3
5
  Signal.trap("PIPE", "EXIT")
4
6
 
5
7
  require "tree_clusters"
@@ -54,6 +56,8 @@ The key_cols program
54
56
  alignment file (fasta format), I will tell you key columns for all
55
57
  clades/clusters that have them.
56
58
 
59
+ Version: v#{TreeClusters::VERSION}
60
+
57
61
  Overview
58
62
  --------
59
63
 
@@ -147,9 +151,10 @@ abort_unless opts[:clade_size_cutoff] >= 1,
147
151
 
148
152
  FileUtils.mkdir_p opts[:outdir]
149
153
 
150
- AbortIf.logger.info { "Parsing input files" }
154
+ AbortIf.logger.info { "Reading tree" }
155
+ tree = NewickTree.fromFile opts[:tree]
151
156
 
152
- tree = NewickTree.fromFile opts[:tree]
157
+ AbortIf.logger.info { "Reading alignment" }
153
158
  leaf2attrs = TreeClusters.read_alignment opts[:aln]
154
159
 
155
160
  members_fname =
@@ -169,22 +174,37 @@ key_cols_f =
169
174
  annotated_tree_f =
170
175
  File.open(annotated_tree_fname, "w")
171
176
 
172
- key_col_sets = {}
173
- clade_sizes = {}
174
- clade_count = TreeClusters.all_clades(tree).count
177
+ key_col_sets = {}
178
+ clade_sizes = {}
179
+ # AbortIf.logger.info { "Counting clades" }
180
+ # clade_count = TreeClusters.all_clades(tree).count
175
181
  change_these_names = Set.new
176
182
 
177
183
  def all_clades_helper tree
178
- TreeClusters.all_clades(tree).sort_by { |cl| cl.all_leaves.count }.reverse
184
+ # TreeClusters.all_clades(tree).sort_by { |cl| cl.all_leaves.count }.reverse
185
+ tree.clade_nodes.
186
+ reverse.
187
+ sort_by { |node| node.all_leaves.count }.
188
+ reverse
179
189
  end
180
190
 
181
- AbortIf.logger.info { "Processing clades" }
191
+ AbortIf.logger.info { "Getting all clades" }
192
+ clades = all_clades_helper(tree)
193
+ clade_count = clades.count
194
+
195
+ AbortIf.logger.info { "Processing clades (The first few clades go reaaally slowly, but then I speed up!)" }
182
196
  begin
183
- all_clades_helper(tree).each_with_index do |clade, idx|
184
- if ((idx + 1) % 100).zero?
197
+ clades.each_with_index do |clade, idx|
198
+ # It starts off really slowly, then speeds up a lot.
199
+ if (idx+1) < 100 ||
200
+ ((idx+1) < 1000 && ((idx + 1) % 10).zero?) ||
201
+ ((idx+1) < 10000 && ((idx + 1) % 100).zero?) ||
202
+ ((idx+1) < 100000 && ((idx + 1) % 1000).zero?) ||
203
+ ((idx + 1) % 10000).zero?
204
+
185
205
  perc = ((idx + 1) / clade_count.to_f * 100).round 2
186
206
 
187
- STDERR.printf("Processing clades: #{perc}%\r")
207
+ STDERR.printf("Processing clades: #{idx + 1} of #{clade_count} (#{perc}%%)\r")
188
208
  end
189
209
 
190
210
  clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
@@ -224,12 +244,12 @@ begin
224
244
 
225
245
  AbortIf.logger.info { "Annotating tree" }
226
246
 
227
- all_clades_helper(tree).each_with_index do |clade, idx|
247
+ clades.each_with_index do |clade, idx|
228
248
  clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
229
249
 
230
250
  if change_these_names.include? clade_id
231
251
  # This will change the node in the original NewickTree
232
- clade.node.name = "'#{clade_id}'"
252
+ clade.name = "'#{clade_id}'"
233
253
  end
234
254
  end
235
255
 
data/lib/tree_clusters.rb CHANGED
@@ -27,12 +27,20 @@ class NewickTree
27
27
  end
28
28
 
29
29
  def unquoted_taxa
30
+ # @note self.taxa calls the taxa method on the root of the tree.
30
31
  self.taxa.map { |str| str.tr %q{"'}, "" }
31
32
  end
32
33
  end
33
34
 
35
+ class NewickNode
36
+ def all_leaves
37
+ self.leaves.map { |n| n.name.tr %q{"'}, "" }
38
+ end
39
+ end
40
+
34
41
  # Top level namespace of the Gem.
35
42
  module TreeClusters
43
+ PROJ_ROOT = File.join __dir__, ".."
36
44
 
37
45
  # Given an ary of strings, find the most common string in the ary.
38
46
  #
@@ -48,18 +56,24 @@ module TreeClusters
48
56
  # @note Each string is upcase'd before frequencies are calculated.
49
57
  def consensus bases
50
58
  bases.
51
- map(&:upcase).
52
- group_by(&:itself).
53
- sort_by { |_, bases| bases.count }.
54
- reverse.
55
- first.
56
- first
59
+ map(&:upcase).
60
+ group_by(&:itself).
61
+ sort_by { |_, bases| bases.count }.
62
+ reverse.
63
+ first.
64
+ first
57
65
  end
58
66
 
59
67
  def read_alignment aln_fname
60
68
  leaf2attrs = TreeClusters::Attrs.new
61
- aln_len = nil
69
+ aln_len = nil
70
+ seq_num = 0
62
71
  ParseFasta::SeqFile.open(aln_fname).each_record do |rec|
72
+ seq_num += 1
73
+ if ((seq_num + 1) % 1000).zero?
74
+ STDERR.printf("Reading alignment: #{seq_num + 1}\r")
75
+ end
76
+
63
77
  leaf2attrs[rec.id] = { aln: rec.seq.chars }
64
78
 
65
79
  aln_len ||= rec.seq.length
@@ -73,13 +87,13 @@ module TreeClusters
73
87
 
74
88
  def low_ent_cols leaves, leaf2attrs, entropy_cutoff
75
89
  low_ent_cols = []
76
- alns = leaf2attrs.attrs leaves, :aln
77
- aln_cols = alns.transpose
90
+ alns = leaf2attrs.attrs leaves, :aln
91
+ aln_cols = alns.transpose
78
92
 
79
93
  aln_cols.each_with_index do |aln_col, aln_col_idx|
80
- has_gaps = aln_col.any? { |aa| aa == "-" }
94
+ has_gaps = aln_col.any? { |aa| aa == "-" }
81
95
  low_entropy =
82
- Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
96
+ Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
83
97
 
84
98
  if !has_gaps && low_entropy
85
99
  low_ent_cols << (aln_col_idx + 1)
@@ -92,11 +106,11 @@ module TreeClusters
92
106
  # Like low_ent_cols method but also returns the bases at the positions.
93
107
  def low_ent_cols_with_bases leaves, leaf2attrs, entropy_cutoff
94
108
  low_ent_cols = []
95
- alns = leaf2attrs.attrs leaves, :aln
96
- aln_cols = alns.transpose
109
+ alns = leaf2attrs.attrs leaves, :aln
110
+ aln_cols = alns.transpose
97
111
 
98
112
  aln_cols.each_with_index do |aln_col, aln_col_idx|
99
- has_gaps = aln_col.any? { |aa| aa == "-" }
113
+ has_gaps = aln_col.any? { |aa| aa == "-" }
100
114
  low_entropy =
101
115
  Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
102
116
 
@@ -130,9 +144,9 @@ module TreeClusters
130
144
  if !(tree_ids == mapping_ids && mapping_ids == aln_ids)
131
145
  AbortIf::logger.error { "Seq IDs did not match in all input files" }
132
146
 
133
- tree_ids = tree_ids.to_a.sort
147
+ tree_ids = tree_ids.to_a.sort
134
148
  mapping_ids = mapping_ids.to_a.sort
135
- aln_ids = aln_ids.to_a.sort
149
+ aln_ids = aln_ids.to_a.sort
136
150
 
137
151
  AbortIf::logger.debug { ["tree_ids", tree_ids].join "\t" }
138
152
  AbortIf::logger.debug { ["mapping_ids", mapping_ids].join "\t" }
@@ -152,7 +166,7 @@ module TreeClusters
152
166
  # @yieldparam clade [Clade] a clade of the tree
153
167
  #
154
168
  # @return [Enumerator<Clade>] enumerator of Clade objects
155
- def all_clades tree, metadata=nil
169
+ def all_clades tree, metadata = nil
156
170
  return enum_for(:all_clades, tree, metadata) unless block_given?
157
171
 
158
172
  tree.clade_nodes.reverse.each do |node|
@@ -164,12 +178,12 @@ module TreeClusters
164
178
  snazzy_clades = {}
165
179
 
166
180
  clades = self.
167
- all_clades(tree, metadata).
168
- sort_by { |clade| clade.all_leaves.count }.
169
- reverse
181
+ all_clades(tree, metadata).
182
+ sort_by { |clade| clade.all_leaves.count }.
183
+ reverse
170
184
 
171
185
  metadata.each do |md_cat, leaf2mdtag|
172
- already_checked = Set.new
186
+ already_checked = Set.new
173
187
  single_tag_clades = {}
174
188
 
175
189
  clades.each do |clade|
@@ -177,8 +191,8 @@ module TreeClusters
177
191
  "A clade cannot also be a leaf"
178
192
 
179
193
  unless clade.all_leaves.all? do |leaf|
180
- already_checked.include? leaf
181
- end
194
+ already_checked.include? leaf
195
+ end
182
196
  md_tags = clade.all_leaves.map do |leaf|
183
197
  assert leaf2mdtag.has_key?(leaf),
184
198
  "leaf #{leaf} is missing from leaf2mdtag ht"
@@ -224,9 +238,9 @@ module TreeClusters
224
238
  snazzy_info = {}
225
239
 
226
240
  clades = self.
227
- all_clades(tree, metadata).
228
- sort_by { |clade| clade.all_leaves.count }.
229
- reverse
241
+ all_clades(tree, metadata).
242
+ sort_by { |clade| clade.all_leaves.count }.
243
+ reverse
230
244
 
231
245
  # Non snazzy clades have a value of nil, so set all to nil and the
232
246
  # snazzy ones will be overwritten.
@@ -235,7 +249,7 @@ module TreeClusters
235
249
  end
236
250
 
237
251
  metadata.each do |md_cat, leaf2mdtag|
238
- already_checked = Set.new
252
+ already_checked = Set.new
239
253
  single_tag_clades = {}
240
254
 
241
255
  clades.each do |clade|
@@ -243,8 +257,8 @@ module TreeClusters
243
257
  "A clade cannot also be a leaf"
244
258
 
245
259
  unless clade.all_leaves.all? do |leaf|
246
- already_checked.include? leaf
247
- end
260
+ already_checked.include? leaf
261
+ end
248
262
  md_tags = clade.all_leaves.map do |leaf|
249
263
  assert leaf2mdtag.has_key?(leaf),
250
264
  "leaf #{leaf} is missing from leaf2mdtag ht"
@@ -289,7 +303,7 @@ module TreeClusters
289
303
 
290
304
  def read_mapping_file fname
291
305
  md_cat_names = nil
292
- metadata = TreeClusters::Attrs.new
306
+ metadata = TreeClusters::Attrs.new
293
307
 
294
308
  File.open(fname, "rt").each_line.with_index do |line, idx|
295
309
  leaf_name, *metadata_vals = line.chomp.split "\t"
@@ -342,6 +356,6 @@ module TreeClusters
342
356
  end
343
357
  end
344
358
 
345
- [attr_names, attrs]
359
+ [attr_names, attrs]
346
360
  end
347
361
  end
data/lib/tree_clusters/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module TreeClusters
2
- VERSION = "0.8.2"
2
+ VERSION = "0.8.3"
3
3
  end
data/test_files/key_cols/expected_output/small2/ARST.tree_clusters.annotated_tree.txt ADDED
@@ -0,0 +1 @@
1
+ (('a-1','a-2')'clade_3___cluster_A',(("b-1",b-2)'clade_4___cluster_B1',(bb-1,(bbb-1,bbb-2)'clade_5___cluster_B3')'clade_2___cluster_B2')'clade_1___cluster_B')cluster_C;
data/test_files/key_cols/expected_output/small2/ARST.tree_clusters.clade_members.txt ADDED
@@ -0,0 +1,5 @@
1
+ clade_1___cluster_B 5 b-1 b-2 bb-1 bbb-1 bbb-2
2
+ clade_2___cluster_B2 3 bb-1 bbb-1 bbb-2
3
+ clade_3___cluster_A 2 a-1 a-2
4
+ clade_4___cluster_B1 2 b-1 b-2
5
+ clade_5___cluster_B3 2 bbb-1 bbb-2
data/test_files/key_cols/expected_output/small2/ARST.tree_clusters.key_cols.txt ADDED
@@ -0,0 +1,5 @@
1
+ clade_1___cluster_B 1 1-C
2
+ clade_2___cluster_B2 2 1-C 4-G
3
+ clade_3___cluster_A 4 1-A 2-A 3-A 5-G
4
+ clade_4___cluster_B1 4 1-C 2-C 3-C 5-A
5
+ clade_5___cluster_B3 4 1-C 3-G 4-G 5-G
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tree_clusters
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.2
4
+ version: 0.8.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
@@ -201,6 +201,9 @@ files:
201
201
  - lib/tree_clusters/clade.rb
202
202
  - lib/tree_clusters/version.rb
203
203
  - test_files/bad.aln
204
+ - test_files/key_cols/expected_output/small2/ARST.tree_clusters.annotated_tree.txt
205
+ - test_files/key_cols/expected_output/small2/ARST.tree_clusters.clade_members.txt
206
+ - test_files/key_cols/expected_output/small2/ARST.tree_clusters.key_cols.txt
204
207
  - test_files/non_bifurcating.aln
205
208
  - test_files/non_bifurcating.tre
206
209
  - test_files/small.aln