tree_clusters 0.5.2 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7d7f7982ea210cef7357e004ceb85d8b17389a32
4
- data.tar.gz: a924d2bfd587ce03678c391f2a5f761848169726
3
+ metadata.gz: 612ebc10d0f23a15ae9d11d35be3f1de197d29fd
4
+ data.tar.gz: '084ff094f024a062e1c0f1476b9105950f796edf'
5
5
  SHA512:
6
- metadata.gz: 7b4540a485a4635194a170c74c1bae7cd4eeb5b58df3d3a934933a161ef918903461ba75924f3baf65724050b2d3d122ac953a0822f5218c7ac0f8b37db72405
7
- data.tar.gz: fced0e673bcacf6dc113d6112bd5d02b9d5fcf5756ae50d1b77cb41b1c8f97b76a3fe2ef86f8f6c7ea8ddeee96a5b5aa06600cdef310842276a486b829763eae
6
+ metadata.gz: 247778fdebedd3213d96ccafc52ce1d5667bdb9e8eefee60e30b3dbf1fa618ea73000d6d2a5802e47e84d9ba6fa4c119fd3acb1d21907c4c268f63e38949ffcd
7
+ data.tar.gz: e8d5cc7d6aa7ce86f277f42c5a5b8bd7b7d2bebe5ac3fbf5c4201315bc7aa8e411940056c30dce22226864d982083605180bfe7304c824566e107a43a308c910
data/.gitignore CHANGED
@@ -23,6 +23,8 @@ TEST
23
23
 
24
24
  time.html
25
25
 
26
+ todo.txt
27
+
26
28
  # rspec failure tracking
27
29
  .rspec_status
28
30
  *.lock
@@ -0,0 +1,223 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ Signal.trap("PIPE", "EXIT")
4
+
5
+ require "tree_clusters"
6
+ require "trollop"
7
+ require "parse_fasta"
8
+ require "shannon"
9
+ require "fileutils"
10
+
11
+ TreeClusters.extend TreeClusters
12
+
13
+ def puts_info outf, clade_id, attr_cat, attr_set
14
+ outf.puts [clade_id, attr_cat, attr_set.to_a].join "\t"
15
+ end
16
+
17
# Parse the command line.
#
# The three input files are the Newick tree (--tree), the mapping file
# (--mapping) and the attributes file (--attrs).  The banner describes
# the attributes file rather than a FASTA alignment, since this script
# reads its third input with TreeClusters.read_attrs_file.
#
# NOTE(review): the "Subtracting parent nodes" section still talks
# about alignment columns.  It is kept verbatim because the behavior of
# the parent subtraction at the root is not visible from this script --
# confirm and reword for attributes.
#
# NOTE(review): --entropy-cutoff is accepted and range checked, but
# nothing in this script appears to read it afterwards -- confirm
# whether it should be dropped.
opts = Trollop.options do
  version TreeClusters::VERSION

  banner <<-EOS


  Checking IDs
  ------------

  IDs for the leaves must match between the three input files.

  The tree file is allowed to have quoted taxa names, but the mapping
  file and attributes file are not.

  The attributes file is tab separated and starts with a header line.
  Every other line gives a leaf ID, an attribute name, and a value
  for that attribute.  A leaf may appear on many lines.

  Example: This would be okay.

  tree file:
    ('genome_A', 'genome_B');

  attributes file (tab separated):
    name      attr name  attr val
    genome_A  dessert    apple pie
    genome_B  dessert    brown sugar

  mapping file:
    name      coolness
    genome_A  cool
    genome_B  notcool


  Subtracting parent nodes
  ------------------------

  If a clade's parent would be the root of the tree, no columns will
  be subtracted when removing the parent columns as it would be the
  entire alignment.

  Options:
  EOS

  # Required input files (presence is checked after parsing).
  opt(:tree,
      "Newick tree file",
      type: :string)
  opt(:mapping,
      "Mapping file",
      type: :string)
  opt(:attrs,
      "Attributes file",
      type: :string)

  opt(:entropy_cutoff,
      "Cutoff to consider a column low entropy",
      default: 0.0)
  opt(:clade_size_cutoff,
      "Consider only clades with at least this many leaves",
      default: 1)

  # Every output file is written to <outdir>/<base>.<suffix>.
  opt(:outdir,
      "Output directory",
      default: ".")
  opt(:base,
      "Basename for output",
      default: "snazzy_clades")
end
86
+
87
# Validate the parsed options, find the snazzy clades, and write one
# report per output file.
#
# Output files, all named <outdir>/<base>.<suffix>:
#   snazzy_clades.txt                          clade ID, number of metadata
#                                              pairs, then each pair joined
#                                              with "|"
#   snazzy_clades_clade_members.txt            clade ID, leaf count, leaves
#   snazzy_clades_attrs_union.txt              union of the clade's attrs
#   snazzy_clades_attrs_intersection.txt       intersection of the clade's
#                                              attrs
#   snazzy_clades_attrs_minus_parent_attrs.txt union minus parent leaves'
#                                              union
#   snazzy_clades_attrs_minus_sibling_attrs.txt union minus sibling leaves'
#                                              union
#   snazzy_clades_attrs_minus_other_attrs.txt  union minus other leaves'
#                                              union
abort_if opts[:tree].nil?,
         "--tree is a required arg"
abort_if opts[:mapping].nil?,
         "--mapping is a required arg"
abort_if opts[:attrs].nil?,
         "--attrs is a required arg"

abort_unless_file_exists opts[:tree]
abort_unless_file_exists opts[:mapping]
abort_unless_file_exists opts[:attrs]

# TODO check IDs when attrs is not a fasta file
# TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:attrs]

abort_unless opts[:entropy_cutoff] >= 0,
             "--entropy-cutoff must be >= 0"
abort_unless opts[:clade_size_cutoff] >= 1,
             "--clade-size-cutoff must be >= 1"

FileUtils.mkdir_p opts[:outdir]

tree = NewickTree.fromFile opts[:tree]
metadata = TreeClusters.read_mapping_file opts[:mapping]
snazzy_clades = TreeClusters.snazzy_clades tree, metadata
attr_names, leaf2attrs = TreeClusters.read_attrs_file opts[:attrs]

clades_fname =
  File.join opts[:outdir],
            "#{opts[:base]}.snazzy_clades.txt"
members_fname =
  File.join opts[:outdir],
            "#{opts[:base]}.snazzy_clades_clade_members.txt"
attrs_fname =
  File.join opts[:outdir],
            "#{opts[:base]}.snazzy_clades_attrs_union.txt"
attrs_intersection_fname =
  File.join opts[:outdir],
            "#{opts[:base]}.snazzy_clades_attrs_intersection.txt"
attrs_minus_parent_attrs_fname =
  File.join opts[:outdir],
            "#{opts[:base]}.snazzy_clades_attrs_minus_parent_attrs.txt"
attrs_minus_sibling_attrs_fname =
  File.join opts[:outdir],
            "#{opts[:base]}.snazzy_clades_attrs_minus_sibling_attrs.txt"
attrs_minus_other_attrs_fname =
  File.join opts[:outdir],
            "#{opts[:base]}.snazzy_clades_attrs_minus_other_attrs.txt"

# Every handle that gets opened is remembered here so that the ensure
# clause closes all of them, including the intersection file, and also
# the ones already open if a later File.open raises.
open_outfiles = []
open_outfile = lambda do |fname|
  outf = File.open(fname, "w")
  open_outfiles << outf

  outf
end

begin
  info_f                      = open_outfile.call clades_fname
  clade_members_f             = open_outfile.call members_fname
  attrs_f                     = open_outfile.call attrs_fname
  attrs_intersection_f        = open_outfile.call attrs_intersection_fname
  attrs_minus_parent_attrs_f  = open_outfile.call attrs_minus_parent_attrs_fname
  attrs_minus_sibling_attrs_f = open_outfile.call attrs_minus_sibling_attrs_fname
  attrs_minus_other_attrs_f   = open_outfile.call attrs_minus_other_attrs_fname

  # info is { metadata_category => metadata_tag , ... }
  snazzy_clades.each_with_index do |(clade, info), idx|
    all_leaves_known =
      clade.all_leaves.all? { |leaf| leaf2attrs.has_key? leaf }

    assert all_leaves_known,
           "Not all leaves are present in the leaf2attrs hash table"

    # Clade IDs are numbered from 1 in iteration order.
    clade_id = "clade_#{idx+1}___#{clade.name}"

    info_f.puts [clade_id,
                 info.count,
                 info.map { |pair| pair.join("|") }].join "\t"

    clade_members_f.puts [clade_id,
                          clade.all_leaves.count,
                          clade.all_leaves].join "\t"

    attr_names.each do |attr_category|
      attrs_all_leaves =
        leaf2attrs.attrs clade.all_leaves, attr_category

      attrs_all_sibling_leaves =
        leaf2attrs.attrs clade.all_sibling_leaves,
                         attr_category
      attrs_parent_leaves =
        leaf2attrs.attrs clade.parent_leaves,
                         attr_category
      attrs_other_leaves =
        leaf2attrs.attrs clade.other_leaves,
                         attr_category

      # What this clade has that each comparison group does not.
      attrs_all_minus_parent =
        attrs_all_leaves.union - attrs_parent_leaves.union
      attrs_all_minus_sibling =
        attrs_all_leaves.union - attrs_all_sibling_leaves.union
      attrs_all_minus_other =
        attrs_all_leaves.union - attrs_other_leaves.union

      puts_info attrs_f,
                clade_id,
                attr_category,
                attrs_all_leaves.union

      puts_info attrs_intersection_f,
                clade_id,
                attr_category,
                attrs_all_leaves.intersection

      puts_info attrs_minus_parent_attrs_f,
                clade_id,
                attr_category,
                attrs_all_minus_parent

      puts_info attrs_minus_sibling_attrs_f,
                clade_id,
                attr_category,
                attrs_all_minus_sibling

      puts_info attrs_minus_other_attrs_f,
                clade_id,
                attr_category,
                attrs_all_minus_other
    end
  end
ensure
  open_outfiles.each(&:close)
end
data/lib/tree_clusters.rb CHANGED
@@ -149,11 +149,8 @@ module TreeClusters
149
149
  metadata.each do |md_cat, leaf2mdtag|
150
150
  already_checked = Set.new
151
151
  single_tag_clades = {}
152
- p [md_cat, leaf2mdtag]
153
152
 
154
153
  clades.each do |clade|
155
- p [clade.name, clade.all_leaves]
156
-
157
154
  assert clade.all_leaves.count > 1,
158
155
  "A clade cannot also be a leaf"
159
156
 
@@ -220,6 +217,45 @@ module TreeClusters
220
217
  metadata
221
218
  end
222
219
 
220
# Read a tab-separated attributes file.
#
# The first line of the file is treated as a header and skipped.  Every
# other line is split on tabs into a leaf name, an attribute name, and
# an attribute value.  Blank lines are ignored.  A line that has an
# attribute name but no value records nil as the value.
#
# @param fname [String] path to the attributes file
#
# @return [Array] a two element array.  The first element is the sorted
#   Array of attribute names.  The second is a TreeClusters::Attrs
#   mapping each leaf name to a Hash of attribute name => Set of
#   values.  Every leaf has an entry for every attribute name, which
#   is an empty Set if the leaf had no value for that attribute.
#
# @raise [ArgumentError] if a non-blank line has no attribute name
def read_attrs_file(fname)
  rows = []

  # The block form of File.open closes the handle even if a line fails
  # to parse.  The file is read once; the rows are kept so the
  # attribute names are all known before the table is built.
  File.open(fname, "rt") do |f|
    f.each_line.with_index do |line, idx|
      next if idx.zero? # header line
      next if line.strip.empty?

      leaf, attr_name, attr_val = line.chomp.split "\t"

      if attr_name.nil?
        raise ArgumentError,
              "Line #{idx + 1} of #{fname} has no attribute name"
      end

      rows << [leaf, attr_name, attr_val]
    end
  end

  attr_names = rows.map { |_, attr_name, _| attr_name }.uniq.sort

  attrs = TreeClusters::Attrs.new

  rows.each do |leaf, attr_name, attr_val|
    unless attrs.has_key? leaf
      attrs[leaf] = {}

      # Give every leaf a (possibly empty) set for each attribute name.
      attr_names.each do |name|
        attrs[leaf][name] = Set.new
      end
    end

    attrs[leaf][attr_name] << attr_val
  end

  [attr_names, attrs]
end
258
+
223
259
  # A Hash table for genome/leaf/taxa attributes
224
260
  class Attrs < Hash
225
261
 
@@ -1,3 +1,3 @@
1
1
  module TreeClusters
2
- VERSION = "0.5.2"
2
+ VERSION = "0.6.0"
3
3
  end
@@ -0,0 +1,17 @@
1
+ name attr name attr val
2
+ a-1 fruta manzana
3
+ a-1 fruta pera
4
+ a-1 color rojo
5
+ a-2 fruta manzana
6
+ a-2 color azul
7
+ a-2 fruta pera
8
+ a-2 color rojo
9
+ b-1 fruta pera
10
+ b-1 color blanco
11
+ b-2 color blanco
12
+ bb-1 color blanco
13
+ bbb-1 color blanco
14
+ bbb-2 color blanco
15
+ bbb-2 color gris
16
+ bbb-1 tamaño pequeña
17
+ bbb-2 tamaño pequeña
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tree_clusters
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-11-02 00:00:00.000000000 Z
11
+ date: 2017-11-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -173,6 +173,7 @@ email:
173
173
  - moorer@udel.edu
174
174
  executables:
175
175
  - snazzy_clades
176
+ - snazzy_clades_attrs
176
177
  - snazzy_clades_key_cols
177
178
  extensions: []
178
179
  extra_rdoc_files: []
@@ -188,6 +189,7 @@ files:
188
189
  - bin/console
189
190
  - bin/setup
190
191
  - exe/snazzy_clades
192
+ - exe/snazzy_clades_attrs
191
193
  - exe/snazzy_clades_key_cols
192
194
  - lib/tree_clusters.rb
193
195
  - lib/tree_clusters/version.rb
@@ -195,6 +197,7 @@ files:
195
197
  - test_files/non_bifurcating.aln
196
198
  - test_files/non_bifurcating.tre
197
199
  - test_files/small.aln
200
+ - test_files/small.attrs
198
201
  - test_files/small.mapping
199
202
  - test_files/small.tre
200
203
  - test_files/small_aln_bad_ids