tree_clusters 0.8.2 → 0.8.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7f7ba7426f7048a684ba53a52611efe18cda9603
4
- data.tar.gz: c511525c40d1b766c08c2a58e6076633d179439c
3
+ metadata.gz: be727e6384cf510290ddab3bf5aabd5075505e77
4
+ data.tar.gz: 69e94311ac35dcb26579edada3d6555b56048cd1
5
5
  SHA512:
6
- metadata.gz: 4f5ad29eb52cbcdf997a7d1f2431bf9f26adf76c534bd6bbc74fc79c05b331f224d7e8b056173e4b49454a324ee5eb14c2f8dc401379552a8b9fed36627e0eff
7
- data.tar.gz: 8d7885844452c979abdfbc71ee85c4ecedefa8818047062ede26eb894bc1913ae612093e07cf68c760aa12654e448b97ce323d6982f5589e2f13821debbf3bd7
6
+ metadata.gz: 5aacaa9b69741eff4f23b62b7be230befc444847cbf869ed0a5394e1fd768b34fc150483a71f079aef6fd683a9c58fe170e8ffdfc6f229b677862ce7b851c875
7
+ data.tar.gz: c3e639c1bced1b5c480e99f0d9e19ecb65a27dc3cf09e81d429714951d47b7b66905b82b9ed20d14ac05d36be0067b9b3de51796447acfd91a814ce73ed07fdc
data/.gitignore CHANGED
@@ -31,5 +31,6 @@ todo.txt
31
31
 
32
32
  .idea
33
33
  ARST
34
+ RNR
34
35
 
35
36
  exe/key_cols2
@@ -1,5 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ # TODO the block variables called "clade" aren't actually instances of the TreeClusters::Clade class. It's kind of confusing.
4
+
3
5
  Signal.trap("PIPE", "EXIT")
4
6
 
5
7
  require "tree_clusters"
@@ -54,6 +56,8 @@ The key_cols program
54
56
  alignment file (fasta format), I will tell you key columns for all
55
57
  clades/clusters that have them.
56
58
 
59
+ Version: v#{TreeClusters::VERSION}
60
+
57
61
  Overview
58
62
  --------
59
63
 
@@ -147,9 +151,10 @@ abort_unless opts[:clade_size_cutoff] >= 1,
147
151
 
148
152
  FileUtils.mkdir_p opts[:outdir]
149
153
 
150
- AbortIf.logger.info { "Parsing input files" }
154
+ AbortIf.logger.info { "Reading tree" }
155
+ tree = NewickTree.fromFile opts[:tree]
151
156
 
152
- tree = NewickTree.fromFile opts[:tree]
157
+ AbortIf.logger.info { "Reading alignment" }
153
158
  leaf2attrs = TreeClusters.read_alignment opts[:aln]
154
159
 
155
160
  members_fname =
@@ -169,22 +174,37 @@ key_cols_f =
169
174
  annotated_tree_f =
170
175
  File.open(annotated_tree_fname, "w")
171
176
 
172
- key_col_sets = {}
173
- clade_sizes = {}
174
- clade_count = TreeClusters.all_clades(tree).count
177
+ key_col_sets = {}
178
+ clade_sizes = {}
179
+ # AbortIf.logger.info { "Counting clades" }
180
+ # clade_count = TreeClusters.all_clades(tree).count
175
181
  change_these_names = Set.new
176
182
 
177
183
  def all_clades_helper tree
178
- TreeClusters.all_clades(tree).sort_by { |cl| cl.all_leaves.count }.reverse
184
+ # TreeClusters.all_clades(tree).sort_by { |cl| cl.all_leaves.count }.reverse
185
+ tree.clade_nodes.
186
+ reverse.
187
+ sort_by { |node| node.all_leaves.count }.
188
+ reverse
179
189
  end
180
190
 
181
- AbortIf.logger.info { "Processing clades" }
191
+ AbortIf.logger.info { "Getting all clades" }
192
+ clades = all_clades_helper(tree)
193
+ clade_count = clades.count
194
+
195
+ AbortIf.logger.info { "Processing clades (The first few clades go reaaally slowly, but then I speed up!)" }
182
196
  begin
183
- all_clades_helper(tree).each_with_index do |clade, idx|
184
- if ((idx + 1) % 100).zero?
197
+ clades.each_with_index do |clade, idx|
198
+ # It starts off really slowly, then speeds up a lot.
199
+ if (idx+1) < 100 ||
200
+ ((idx+1) < 1000 && ((idx + 1) % 10).zero?) ||
201
+ ((idx+1) < 10000 && ((idx + 1) % 100).zero?) ||
202
+ ((idx+1) < 100000 && ((idx + 1) % 1000).zero?) ||
203
+ ((idx + 1) % 10000).zero?
204
+
185
205
  perc = ((idx + 1) / clade_count.to_f * 100).round 2
186
206
 
187
- STDERR.printf("Processing clades: #{perc}%\r")
207
+ STDERR.printf("Processing clades: #{idx + 1} of #{clade_count} (#{perc}%%)\r")
188
208
  end
189
209
 
190
210
  clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
@@ -224,12 +244,12 @@ begin
224
244
 
225
245
  AbortIf.logger.info { "Annotating tree" }
226
246
 
227
- all_clades_helper(tree).each_with_index do |clade, idx|
247
+ clades.each_with_index do |clade, idx|
228
248
  clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
229
249
 
230
250
  if change_these_names.include? clade_id
231
251
  # This will change the node in the original NewickTree
232
- clade.node.name = "'#{clade_id}'"
252
+ clade.name = "'#{clade_id}'"
233
253
  end
234
254
  end
235
255
 
@@ -27,12 +27,20 @@ class NewickTree
27
27
  end
28
28
 
29
29
  def unquoted_taxa
30
+ # @note self.taxa calls taxa method on the root of the tree.
30
31
  self.taxa.map { |str| str.tr %q{"'}, "" }
31
32
  end
32
33
  end
33
34
 
35
+ class NewickNode
36
+ def all_leaves
37
+ self.leaves.map { |n| n.name.tr %q{"'}, "" }
38
+ end
39
+ end
40
+
34
41
  # Top level namespace of the Gem.
35
42
  module TreeClusters
43
+ PROJ_ROOT = File.join __dir__, ".."
36
44
 
37
45
  # Given an ary of strings, find the most common string in the ary.
38
46
  #
@@ -48,18 +56,24 @@ module TreeClusters
48
56
  # @note Each string is upcase'd before frequencies are calculated.
49
57
  def consensus bases
50
58
  bases.
51
- map(&:upcase).
52
- group_by(&:itself).
53
- sort_by { |_, bases| bases.count }.
54
- reverse.
55
- first.
56
- first
59
+ map(&:upcase).
60
+ group_by(&:itself).
61
+ sort_by { |_, bases| bases.count }.
62
+ reverse.
63
+ first.
64
+ first
57
65
  end
58
66
 
59
67
  def read_alignment aln_fname
60
68
  leaf2attrs = TreeClusters::Attrs.new
61
- aln_len = nil
69
+ aln_len = nil
70
+ seq_num = 0
62
71
  ParseFasta::SeqFile.open(aln_fname).each_record do |rec|
72
+ seq_num += 1
73
+ if ((seq_num + 1) % 1000).zero?
74
+ STDERR.printf("Reading alignment: #{seq_num + 1}\r")
75
+ end
76
+
63
77
  leaf2attrs[rec.id] = { aln: rec.seq.chars }
64
78
 
65
79
  aln_len ||= rec.seq.length
@@ -73,13 +87,13 @@ module TreeClusters
73
87
 
74
88
  def low_ent_cols leaves, leaf2attrs, entropy_cutoff
75
89
  low_ent_cols = []
76
- alns = leaf2attrs.attrs leaves, :aln
77
- aln_cols = alns.transpose
90
+ alns = leaf2attrs.attrs leaves, :aln
91
+ aln_cols = alns.transpose
78
92
 
79
93
  aln_cols.each_with_index do |aln_col, aln_col_idx|
80
- has_gaps = aln_col.any? { |aa| aa == "-" }
94
+ has_gaps = aln_col.any? { |aa| aa == "-" }
81
95
  low_entropy =
82
- Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
96
+ Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
83
97
 
84
98
  if !has_gaps && low_entropy
85
99
  low_ent_cols << (aln_col_idx + 1)
@@ -92,11 +106,11 @@ module TreeClusters
92
106
  # Like low_ent_cols method but also returns the bases at the positions.
93
107
  def low_ent_cols_with_bases leaves, leaf2attrs, entropy_cutoff
94
108
  low_ent_cols = []
95
- alns = leaf2attrs.attrs leaves, :aln
96
- aln_cols = alns.transpose
109
+ alns = leaf2attrs.attrs leaves, :aln
110
+ aln_cols = alns.transpose
97
111
 
98
112
  aln_cols.each_with_index do |aln_col, aln_col_idx|
99
- has_gaps = aln_col.any? { |aa| aa == "-" }
113
+ has_gaps = aln_col.any? { |aa| aa == "-" }
100
114
  low_entropy =
101
115
  Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
102
116
 
@@ -130,9 +144,9 @@ module TreeClusters
130
144
  if !(tree_ids == mapping_ids && mapping_ids == aln_ids)
131
145
  AbortIf::logger.error { "Seq IDs did not match in all input files" }
132
146
 
133
- tree_ids = tree_ids.to_a.sort
147
+ tree_ids = tree_ids.to_a.sort
134
148
  mapping_ids = mapping_ids.to_a.sort
135
- aln_ids = aln_ids.to_a.sort
149
+ aln_ids = aln_ids.to_a.sort
136
150
 
137
151
  AbortIf::logger.debug { ["tree_ids", tree_ids].join "\t" }
138
152
  AbortIf::logger.debug { ["mapping_ids", mapping_ids].join "\t" }
@@ -152,7 +166,7 @@ module TreeClusters
152
166
  # @yieldparam clade [Clade] a clade of the tree
153
167
  #
154
168
  # @return [Enumerator<Clade>] enumerator of Clade objects
155
- def all_clades tree, metadata=nil
169
+ def all_clades tree, metadata = nil
156
170
  return enum_for(:all_clades, tree, metadata) unless block_given?
157
171
 
158
172
  tree.clade_nodes.reverse.each do |node|
@@ -164,12 +178,12 @@ module TreeClusters
164
178
  snazzy_clades = {}
165
179
 
166
180
  clades = self.
167
- all_clades(tree, metadata).
168
- sort_by { |clade| clade.all_leaves.count }.
169
- reverse
181
+ all_clades(tree, metadata).
182
+ sort_by { |clade| clade.all_leaves.count }.
183
+ reverse
170
184
 
171
185
  metadata.each do |md_cat, leaf2mdtag|
172
- already_checked = Set.new
186
+ already_checked = Set.new
173
187
  single_tag_clades = {}
174
188
 
175
189
  clades.each do |clade|
@@ -177,8 +191,8 @@ module TreeClusters
177
191
  "A clade cannot also be a leaf"
178
192
 
179
193
  unless clade.all_leaves.all? do |leaf|
180
- already_checked.include? leaf
181
- end
194
+ already_checked.include? leaf
195
+ end
182
196
  md_tags = clade.all_leaves.map do |leaf|
183
197
  assert leaf2mdtag.has_key?(leaf),
184
198
  "leaf #{leaf} is missing from leaf2mdtag ht"
@@ -224,9 +238,9 @@ module TreeClusters
224
238
  snazzy_info = {}
225
239
 
226
240
  clades = self.
227
- all_clades(tree, metadata).
228
- sort_by { |clade| clade.all_leaves.count }.
229
- reverse
241
+ all_clades(tree, metadata).
242
+ sort_by { |clade| clade.all_leaves.count }.
243
+ reverse
230
244
 
231
245
  # Non snazzy clades have a value of nil, so set all to nil and the
232
246
  # snazzy ones will be overwritten.
@@ -235,7 +249,7 @@ module TreeClusters
235
249
  end
236
250
 
237
251
  metadata.each do |md_cat, leaf2mdtag|
238
- already_checked = Set.new
252
+ already_checked = Set.new
239
253
  single_tag_clades = {}
240
254
 
241
255
  clades.each do |clade|
@@ -243,8 +257,8 @@ module TreeClusters
243
257
  "A clade cannot also be a leaf"
244
258
 
245
259
  unless clade.all_leaves.all? do |leaf|
246
- already_checked.include? leaf
247
- end
260
+ already_checked.include? leaf
261
+ end
248
262
  md_tags = clade.all_leaves.map do |leaf|
249
263
  assert leaf2mdtag.has_key?(leaf),
250
264
  "leaf #{leaf} is missing from leaf2mdtag ht"
@@ -289,7 +303,7 @@ module TreeClusters
289
303
 
290
304
  def read_mapping_file fname
291
305
  md_cat_names = nil
292
- metadata = TreeClusters::Attrs.new
306
+ metadata = TreeClusters::Attrs.new
293
307
 
294
308
  File.open(fname, "rt").each_line.with_index do |line, idx|
295
309
  leaf_name, *metadata_vals = line.chomp.split "\t"
@@ -342,6 +356,6 @@ module TreeClusters
342
356
  end
343
357
  end
344
358
 
345
- [attr_names, attrs]
359
+ [attr_names, attrs]
346
360
  end
347
361
  end
@@ -1,3 +1,3 @@
1
1
  module TreeClusters
2
- VERSION = "0.8.2"
2
+ VERSION = "0.8.3"
3
3
  end
@@ -0,0 +1 @@
1
+ (('a-1','a-2')'clade_3___cluster_A',(("b-1",b-2)'clade_4___cluster_B1',(bb-1,(bbb-1,bbb-2)'clade_5___cluster_B3')'clade_2___cluster_B2')'clade_1___cluster_B')cluster_C;
@@ -0,0 +1,5 @@
1
+ clade_1___cluster_B 5 b-1 b-2 bb-1 bbb-1 bbb-2
2
+ clade_2___cluster_B2 3 bb-1 bbb-1 bbb-2
3
+ clade_3___cluster_A 2 a-1 a-2
4
+ clade_4___cluster_B1 2 b-1 b-2
5
+ clade_5___cluster_B3 2 bbb-1 bbb-2
@@ -0,0 +1,5 @@
1
+ clade_1___cluster_B 1 1-C
2
+ clade_2___cluster_B2 2 1-C 4-G
3
+ clade_3___cluster_A 4 1-A 2-A 3-A 5-G
4
+ clade_4___cluster_B1 4 1-C 2-C 3-C 5-A
5
+ clade_5___cluster_B3 4 1-C 3-G 4-G 5-G
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tree_clusters
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.2
4
+ version: 0.8.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
@@ -201,6 +201,9 @@ files:
201
201
  - lib/tree_clusters/clade.rb
202
202
  - lib/tree_clusters/version.rb
203
203
  - test_files/bad.aln
204
+ - test_files/key_cols/expected_output/small2/ARST.tree_clusters.annotated_tree.txt
205
+ - test_files/key_cols/expected_output/small2/ARST.tree_clusters.clade_members.txt
206
+ - test_files/key_cols/expected_output/small2/ARST.tree_clusters.key_cols.txt
204
207
  - test_files/non_bifurcating.aln
205
208
  - test_files/non_bifurcating.tre
206
209
  - test_files/small.aln