tree_clusters 0.7.0 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e6f0523128e1d9efede01f9af494d99e7e51a48c
4
- data.tar.gz: 58bac03495f585ad2ea755abaac5ceefdb90e2d3
3
+ metadata.gz: 588e0cf7da9cd87056e339593b2de594aa7f7e34
4
+ data.tar.gz: a786a20ee672a2372ca3f1a1d1e8d7259acf4b16
5
5
  SHA512:
6
- metadata.gz: 73a82096be4f5be8199be28bf3e12b0ac63c08b9da1cfea1fded6cd4da7c448ffc7cdc8a285c681b88493b39fa32fcace02be0fff23190a314050d107db4b2f5
7
- data.tar.gz: a5b00bec9a4f567efc098e664e6479e691f96e761a5c57ac5e4f198159c9d1cfd320dbb2033a1a458597460f621815e03ee6ecef351f31db4597f4a0a7aac838
6
+ metadata.gz: dbf4875a27201c9cb1b406427e3c1fde6b195f71493b62116bcf676f97bcaf590fec7a8cfb4433a14a861093d7106a6cdacf2b190d03b7487c40bcac4726ef3a
7
+ data.tar.gz: 2bc0115fbeda56da3e8455cde72d859895e0e7b7670ceeeeac57ecc8434c656eedda846a653fbcfe69fdf33867c6b940a0ddaad55d0aaf8788606dfe261a702e
data/.gitignore CHANGED
@@ -28,3 +28,7 @@ todo.txt
28
28
  # rspec failure tracking
29
29
  .rspec_status
30
30
  *.lock
31
+
32
+ .idea
33
+
34
+ exe/key_cols2
data/exe/key_cols ADDED
@@ -0,0 +1,206 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ Signal.trap("PIPE", "EXIT")
4
+
5
+ require "tree_clusters"
6
+ require "trollop"
7
+ require "parse_fasta"
8
+ require "fileutils"
9
+
10
+ TreeClusters.extend TreeClusters
11
+
12
+ GREETING = "The '#{__FILE__}' program"
13
+ UNDERLINE = "=" * GREETING.length
14
+
15
+ opts = Trollop.options do
16
+ version TreeClusters::VERSION
17
+
18
+ # banner <<-EOS
19
+
20
+
21
+ # Checking IDs
22
+ # ------------
23
+
24
+ # IDs for the sequences must match between the three input files.
25
+
26
+ # The tree file is allowed to have quoted taxa names, but the
27
+ # mapping file and alignment file are not.
28
+
29
+ # If your alignment file has spaces in the name, the ID part of the
30
+ # header (i.e., the part up until the space) must match with the
31
+ # sequence IDs in the tree and the mapping file.
32
+
33
+ # Example: This would be okay.
34
+
35
+ # tree file:
36
+ # ('genome_A', 'genome_B');
37
+
38
+ # aln file:
39
+ # >genome_A apple pie
40
+ # AAAAA
41
+ # >genome_B brown sugar
42
+ # AATTA
43
+
44
+ # Options:
45
+ # EOS
46
+
47
+ banner <<-EOS
48
+
49
+ #{GREETING}
50
+ #{UNDERLINE}
51
+
52
+ Hi. My name is #{__FILE__}. If you give me a Newick tree file and
53
+ an alignment file (fasta format), I will tell you key columns for
54
+ all clades/clusters that have them.
55
+
56
+ Overview
57
+ --------
58
+
59
+ A clade has key columns if you can use the residue/nucleotide at
60
+ those columns to tell sequences in the clade from sequences outside
61
+ of the clade.
62
+
63
+ Here's an example....
64
+
65
+ After you run me (#{__FILE__} is my name), you'll get an output file
66
+ whose name ends in '.tree_clusters.key_cols.txt'. It may look
67
+ something like this:
68
+
69
+ cluster_A 4 1-A 2-A 3-A 5-G
70
+ cluster_B 4 1-C 2-C 3-C 5-A
71
+
72
+ This file has the clade name, the number of key columns for that
73
+ clade, and then the rest of the columns tell you the position
74
+ (1-based) and the nucleotide or residue in that column in all
75
+ sequences of that clade.
76
+
77
+ In this case we have only two clades. The key columns for both are
78
+ 1, 2, 3, and 5. So you can use columns 1, 2, 3, and 5 to classify a
79
+ sequence as belonging to one of these clades. If it has A, A, A,
80
+ and G in those positions, it'll be in cluster_A, and if it has C, C,
81
+ C, and A in those positions, it'll be in cluster_B. If it has any
82
+ other combination in those 4 columns of the alignment, it won't be
83
+ in either clade.
84
+
85
+ This is just a silly example and most of the time you'll get
86
+ different key columns for different clades. Note that every clade
87
+ may not have key columns listed depending on your data and the
88
+ options you select.
89
+
90
+ Notes & Gotchas
91
+ ---------------
92
+
93
+ - I ignore columns with gap chars (currently just '-') regardless of
94
+ column entropy.
95
+
96
+ Option info
97
+ -----------
98
+
99
+ --entropy-cutoff: A cutoff of 0 means that you allow no variation at
100
+ any column.
101
+
102
+ --clade-size-cutoff: Use this option to ignore tiny clades.
103
+
104
+ Options:
105
+ EOS
106
+
107
+ opt(:tree,
108
+ "Newick tree file",
109
+ type: :string)
110
+ opt(:aln,
111
+ "Alignment file",
112
+ type: :string)
113
+
114
+ opt(:entropy_cutoff,
115
+ "Cutoff to consider a column low entropy",
116
+ default: 0.0)
117
+ opt(:clade_size_cutoff,
118
+ "Consider only clades with at least this many leaves",
119
+ default: 1)
120
+
121
+ opt(:outdir,
122
+ "Output directory",
123
+ default: ".")
124
+ opt(:base,
125
+ "Basename for output",
126
+ default: "snazzy_clades")
127
+ end
128
+
129
+ abort_if opts[:tree].nil?,
130
+ "--tree is a required arg. Try running: #{__FILE__} --help"
131
+ abort_if opts[:aln].nil?,
132
+ "--aln is a required arg. Try running: #{__FILE__} --help"
133
+
134
+ abort_unless_file_exists opts[:tree]
135
+ abort_unless_file_exists opts[:aln]
136
+
137
+ # TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:aln]
138
+
139
+ abort_unless opts[:entropy_cutoff] >= 0,
140
+ "--entropy-cutoff must be >= 0"
141
+ abort_unless opts[:clade_size_cutoff] >= 1,
142
+ "--clade-size-cutoff must be >= 1"
143
+
144
+ FileUtils.mkdir_p opts[:outdir]
145
+
146
+ tree = NewickTree.fromFile opts[:tree]
147
+ leaf2attrs = TreeClusters.read_alignment opts[:aln]
148
+
149
+ members_fname =
150
+ File.join opts[:outdir],
151
+ "#{opts[:base]}.tree_clusters.clade_members.txt"
152
+ key_cols_fname =
153
+ File.join opts[:outdir],
154
+ "#{opts[:base]}.tree_clusters.key_cols.txt"
155
+ annotated_tree_fname =
156
+ File.join opts[:outdir],
157
+ "#{opts[:base]}.tree_clusters.annotated_tree.txt"
158
+
159
+ clade_members_f =
160
+ File.open(members_fname, "w")
161
+ key_cols_f =
162
+ File.open(key_cols_fname, "w")
163
+ annotated_tree_f =
164
+ File.open(annotated_tree_fname, "w")
165
+
166
+ key_col_sets = {}
167
+
168
+ begin
169
+ TreeClusters.all_clades(tree).sort_by {|cl| cl.all_leaves.count}.reverse.each_with_index do |clade, idx|
170
+ clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
171
+
172
+ clade_members_f.puts [clade_id,
173
+ clade.all_leaves.count,
174
+ clade.all_leaves].join "\t"
175
+
176
+ key_cols_all_leaves =
177
+ TreeClusters.low_ent_cols_with_bases clade.all_leaves,
178
+ leaf2attrs,
179
+ opts[:entropy_cutoff]
180
+
181
+ unless key_col_sets.has_key? key_cols_all_leaves
182
+ key_col_sets[key_cols_all_leaves] = Set.new [clade_id]
183
+ end
184
+ key_col_sets[key_cols_all_leaves] << clade_id
185
+
186
+ # This will change the node in the original NewickTree
187
+ clade.node.name = "'#{clade_id}'"
188
+
189
+ end
190
+
191
+ # We only want key column sets that are unique to a single clade.
192
+ key_col_sets.select { |_, clades| clades.count == 1 }.each do |kc_set, clades|
193
+ clade_id = clades.first
194
+ key_cols_f.puts [
195
+ clade_id,
196
+ kc_set.count,
197
+ kc_set.map { |pos, bases| "#{pos}-#{bases.join}" }
198
+ ].join "\t"
199
+ end
200
+
201
+ annotated_tree_f.puts tree.to_s.sub(/;+$/, ";")
202
+ ensure
203
+ clade_members_f.close
204
+ key_cols_f.close
205
+ annotated_tree_f.close
206
+ end
data/lib/tree_clusters.rb CHANGED
@@ -89,6 +89,25 @@ module TreeClusters
89
89
  Set.new low_ent_cols
90
90
  end
91
91
 
92
+ # Like the low_ent_cols method, but pairs each 1-based column position with the sorted, unique, upcased bases found there.
93
+ def low_ent_cols_with_bases leaves, leaf2attrs, entropy_cutoff
94
+ low_ent_cols = []
95
+ alns = leaf2attrs.attrs leaves, :aln
96
+ aln_cols = alns.transpose
97
+
98
+ aln_cols.each_with_index do |aln_col, aln_col_idx|
99
+ has_gaps = aln_col.any? { |aa| aa == "-" }
100
+ low_entropy =
101
+ Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
102
+
103
+ if !has_gaps && low_entropy
104
+ low_ent_cols << [(aln_col_idx + 1), aln_col.map(&:upcase).uniq.sort]
105
+ end
106
+ end
107
+
108
+ Set.new low_ent_cols
109
+ end
110
+
92
111
  # @note If there are quoted names in the tree file, they are
93
112
  # unquoted first.
94
113
  def check_ids tree, mapping, aln
@@ -1,7 +1,8 @@
1
1
  module TreeClusters
2
2
  # Represents a clade in a NewickTree
3
3
  class Clade
4
- attr_accessor :name,
4
+ attr_accessor :node,
5
+ :name,
5
6
  :all_leaves,
6
7
  :left_leaves,
7
8
  :right_leaves,
@@ -18,10 +19,11 @@ module TreeClusters
18
19
  #
19
20
  # @param node [NewickNode] a NewickNode from a NewickTree
20
21
  # @param tree [NewickTree] a NewickTree
21
- def initialize node, tree, metadata=nil
22
+ def initialize node, tree, metadata = nil
22
23
  tree_taxa = tree.unquoted_taxa
23
24
 
24
- @name = unquote node.name
25
+ @node = node
26
+ @name = unquote node.name
25
27
  @all_leaves = descendant_leaves node
26
28
 
27
29
  if (children = node.children).count == 2
@@ -37,7 +39,7 @@ module TreeClusters
37
39
  # "Node #{node.name} has more than one sibling."
38
40
 
39
41
  @each_sibling_leaf_set = siblings.
40
- map { |node| descendant_leaves node }
42
+ map {|node| descendant_leaves node}
41
43
 
42
44
  @all_sibling_leaves = @each_sibling_leaf_set.flatten.uniq
43
45
 
@@ -47,14 +49,14 @@ module TreeClusters
47
49
  @parent_leaves = descendant_leaves parent
48
50
 
49
51
  @other_leaves =
50
- Object::Set.new(tree_taxa) - Object::Set.new(all_leaves)
52
+ Object::Set.new(tree_taxa) - Object::Set.new(all_leaves)
51
53
 
52
54
  @non_parent_leaves =
53
- Object::Set.new(tree_taxa) - Object::Set.new(parent_leaves)
55
+ Object::Set.new(tree_taxa) - Object::Set.new(parent_leaves)
54
56
 
55
57
  if metadata
56
- @metadata = metadata
57
- @all_tags ||= get_all_tags
58
+ @metadata = metadata
59
+ @all_tags ||= get_all_tags
58
60
  @single_tag_info ||= get_single_tag_info
59
61
  else
60
62
  @single_tag_info = nil
@@ -67,16 +69,16 @@ module TreeClusters
67
69
  # well.
68
70
  def == clade
69
71
  (
70
- self.name == clade.name &&
71
- self.all_leaves == clade.all_leaves &&
72
- self.left_leaves == clade.left_leaves &&
73
- self.right_leaves == clade.right_leaves &&
74
- self.all_sibling_leaves == clade.all_sibling_leaves &&
75
- self.each_sibling_leaf_set == clade.each_sibling_leaf_set &&
76
- self.parent_leaves == clade.parent_leaves &&
77
- self.other_leaves == clade.other_leaves &&
78
- self.single_tag_info == clade.single_tag_info &&
79
- self.all_tags == clade.all_tags
72
+ self.name == clade.name &&
73
+ self.all_leaves == clade.all_leaves &&
74
+ self.left_leaves == clade.left_leaves &&
75
+ self.right_leaves == clade.right_leaves &&
76
+ self.all_sibling_leaves == clade.all_sibling_leaves &&
77
+ self.each_sibling_leaf_set == clade.each_sibling_leaf_set &&
78
+ self.parent_leaves == clade.parent_leaves &&
79
+ self.other_leaves == clade.other_leaves &&
80
+ self.single_tag_info == clade.single_tag_info &&
81
+ self.all_tags == clade.all_tags
80
82
  )
81
83
  end
82
84
 
@@ -99,7 +101,7 @@ module TreeClusters
99
101
  tag_info = self.all_leaves.map do |leaf|
100
102
  assert name2tag.has_key?(leaf),
101
103
  "leaf #{leaf} is not present in name2tag ht for " +
102
- "md_cat #{md_cat}"
104
+ "md_cat #{md_cat}"
103
105
 
104
106
  name2tag[leaf]
105
107
  end
@@ -113,11 +115,11 @@ module TreeClusters
113
115
  [unquote(node.name)]
114
116
  else
115
117
  node.
116
- descendants.
117
- flatten.
118
- uniq.
119
- select { |node| node.leaf? }.
120
- map { |node| unquote(node.name) }
118
+ descendants.
119
+ flatten.
120
+ uniq.
121
+ select {|node| node.leaf?}.
122
+ map {|node| unquote(node.name)}
121
123
  end
122
124
  end
123
125
 
@@ -1,3 +1,3 @@
1
1
  module TreeClusters
2
- VERSION = "0.7.0"
2
+ VERSION = "0.8.0"
3
3
  end
@@ -0,0 +1,14 @@
1
+ >a-1 apple
2
+ AAAAg
3
+ >a-2 pie
4
+ AAATg
5
+ >b-1 is
6
+ CCCCa
7
+ >b-2 really
8
+ cccTa
9
+ >bb-1 good
10
+ CCTGa
11
+ >bbb-1 and
12
+ CCGgg
13
+ >bbb-2 tasty
14
+ CGGGg
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tree_clusters
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-11-14 00:00:00.000000000 Z
11
+ date: 2018-07-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -173,6 +173,7 @@ email:
173
173
  - moorer@udel.edu
174
174
  executables:
175
175
  - clade_attrs
176
+ - key_cols
176
177
  - snazzy_clades
177
178
  - snazzy_clades_attrs
178
179
  - snazzy_clades_key_cols
@@ -190,6 +191,7 @@ files:
190
191
  - bin/console
191
192
  - bin/setup
192
193
  - exe/clade_attrs
194
+ - exe/key_cols
193
195
  - exe/snazzy_clades
194
196
  - exe/snazzy_clades_attrs
195
197
  - exe/snazzy_clades_key_cols
@@ -205,6 +207,7 @@ files:
205
207
  - test_files/small.attrs
206
208
  - test_files/small.mapping
207
209
  - test_files/small.tre
210
+ - test_files/small2.aln
208
211
  - test_files/small_aln_bad_ids
209
212
  - test_files/small_mapping_bad_ids
210
213
  - test_files/small_tree_bad_ids