tree_clusters 0.5.2 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7d7f7982ea210cef7357e004ceb85d8b17389a32
4
- data.tar.gz: a924d2bfd587ce03678c391f2a5f761848169726
3
+ metadata.gz: 612ebc10d0f23a15ae9d11d35be3f1de197d29fd
4
+ data.tar.gz: '084ff094f024a062e1c0f1476b9105950f796edf'
5
5
  SHA512:
6
- metadata.gz: 7b4540a485a4635194a170c74c1bae7cd4eeb5b58df3d3a934933a161ef918903461ba75924f3baf65724050b2d3d122ac953a0822f5218c7ac0f8b37db72405
7
- data.tar.gz: fced0e673bcacf6dc113d6112bd5d02b9d5fcf5756ae50d1b77cb41b1c8f97b76a3fe2ef86f8f6c7ea8ddeee96a5b5aa06600cdef310842276a486b829763eae
6
+ metadata.gz: 247778fdebedd3213d96ccafc52ce1d5667bdb9e8eefee60e30b3dbf1fa618ea73000d6d2a5802e47e84d9ba6fa4c119fd3acb1d21907c4c268f63e38949ffcd
7
+ data.tar.gz: e8d5cc7d6aa7ce86f277f42c5a5b8bd7b7d2bebe5ac3fbf5c4201315bc7aa8e411940056c30dce22226864d982083605180bfe7304c824566e107a43a308c910
data/.gitignore CHANGED
@@ -23,6 +23,8 @@ TEST
23
23
 
24
24
  time.html
25
25
 
26
+ todo.txt
27
+
26
28
  # rspec failure tracking
27
29
  .rspec_status
28
30
  *.lock
@@ -0,0 +1,223 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ Signal.trap("PIPE", "EXIT")
4
+
5
+ require "tree_clusters"
6
+ require "trollop"
7
+ require "parse_fasta"
8
+ require "shannon"
9
+ require "fileutils"
10
+
11
+ TreeClusters.extend TreeClusters
12
+
13
# Writes one tab-separated record to +outf+.
#
# The record holds the clade ID, the attribute category, and then every
# member of +attr_set+ in its own column.
#
# outf     - any object responding to #puts (e.g. an open File)
# clade_id - identifier of the clade the record describes
# attr_cat - name of the attribute category
# attr_set - collection of attribute values (anything responding to #to_a)
def puts_info outf, clade_id, attr_cat, attr_set
  # The values stay nested on purpose: Array#join flattens them with the
  # same separator, and an empty collection still yields a trailing tab.
  columns = [clade_id, attr_cat, attr_set.to_a]
  record = columns.join("\t")

  outf.puts record
end
16
+
17
+ opts = Trollop.options do
18
+ version TreeClusters::VERSION
19
+
20
+ banner <<-EOS
21
+
22
+
23
+ Checking IDs
24
+ ------------
25
+
26
+ IDs for the sequences must match between the three input files.
27
+
28
+ The tree file is allowed to have quoted taxa names, but the mapping
29
+ file and alignment file are not.
30
+
31
+ If your alignment file has spaces in the name, the ID part of the
32
+ header (i.e., the part up until the space) must match with the
33
+ sequence IDs in the tree and the mapping file.
34
+
35
+ Example: This would be okay.
36
+
37
+ tree file:
38
+ ('genome_A', 'genome_B');
39
+
40
+ aln file:
41
+ >genome_A apple pie
42
+ AAAAA
43
+ >genome_B brown sugar
44
+ AATTA
45
+
46
+ mapping file:
47
+ name coolness
48
+ genome_A cool
49
+ genome_B notcool
50
+
51
+
52
+ Subtracting parent nodes
53
+ ------------------------
54
+
55
+ If a clade's parent would be the root of the tree, no columns will
56
+ be subtracted when removing the parent columns as it would be the
57
+ entire alignment.
58
+
59
+ Options:
60
+ EOS
61
+
62
+ opt(:tree,
63
+ "Newick tree file",
64
+ type: :string)
65
+ opt(:mapping,
66
+ "Mapping file",
67
+ type: :string)
68
+ opt(:attrs,
69
+ "Attributes file",
70
+ type: :string)
71
+
72
+ opt(:entropy_cutoff,
73
+ "Cutoff to consider a column low entropy",
74
+ default: 0.0)
75
+ opt(:clade_size_cutoff,
76
+ "Consider only clades with at least this many leaves",
77
+ default: 1)
78
+
79
+ opt(:outdir,
80
+ "Output directory",
81
+ default: ".")
82
+ opt(:base,
83
+ "Basename for output",
84
+ default: "snazzy_clades")
85
+ end
86
+
87
# The three input files that must be given on the command line.
required_inputs = %i[tree mapping attrs]

# First make sure every required option was actually passed...
required_inputs.each do |key|
  abort_if opts[key].nil?,
           "--#{key} is a required arg"
end

# ...and only then that each of them points at an existing file.
required_inputs.each do |key|
  abort_unless_file_exists opts[key]
end

# TODO check IDs when attrs is not a fasta file
# TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:attrs]

# Numeric options and the lower bound each of them must respect.
abort_unless opts[:entropy_cutoff] >= 0,
             "--entropy-cutoff must be >= 0"
abort_unless opts[:clade_size_cutoff] >= 1,
             "--clade-size-cutoff must be >= 1"
105
+
106
FileUtils.mkdir_p opts[:outdir]

# Load the three inputs: the tree, the metadata mapping and the attributes.
tree = NewickTree.fromFile opts[:tree]
metadata = TreeClusters.read_mapping_file opts[:mapping]
snazzy_clades = TreeClusters.snazzy_clades tree, metadata
attr_names, leaf2attrs = TreeClusters.read_attrs_file opts[:attrs]

# All output files live in --outdir and share the --base prefix.
clades_fname =
  File.join opts[:outdir],
            "#{opts[:base]}.snazzy_clades.txt"
members_fname =
  File.join opts[:outdir],
            "#{opts[:base]}.snazzy_clades_clade_members.txt"
attrs_fname =
  File.join opts[:outdir],
            "#{opts[:base]}.snazzy_clades_attrs_union.txt"
attrs_intersection_fname =
  File.join opts[:outdir],
            "#{opts[:base]}.snazzy_clades_attrs_intersection.txt"
attrs_minus_parent_attrs_fname =
  File.join opts[:outdir],
            "#{opts[:base]}.snazzy_clades_attrs_minus_parent_attrs.txt"
attrs_minus_sibling_attrs_fname =
  File.join opts[:outdir],
            "#{opts[:base]}.snazzy_clades_attrs_minus_sibling_attrs.txt"
attrs_minus_other_attrs_fname =
  File.join opts[:outdir],
            "#{opts[:base]}.snazzy_clades_attrs_minus_other_attrs.txt"

# Every handle opened through open_out is remembered here so the ensure
# clause can close all of them. Closing by hand previously missed the
# intersection file, and opening outside the begin block leaked earlier
# handles whenever a later open failed.
open_files = []
open_out = lambda do |fname|
  File.open(fname, "w").tap { |f| open_files << f }
end

begin
  info_f                      = open_out.call clades_fname
  clade_members_f             = open_out.call members_fname
  attrs_f                     = open_out.call attrs_fname
  attrs_intersection_f        = open_out.call attrs_intersection_fname
  attrs_minus_parent_attrs_f  = open_out.call attrs_minus_parent_attrs_fname
  attrs_minus_sibling_attrs_f = open_out.call attrs_minus_sibling_attrs_fname
  attrs_minus_other_attrs_f   = open_out.call attrs_minus_other_attrs_fname

  # info is { metadata_category => metadata_tag , ... }
  snazzy_clades.each_with_index do |(clade, info), idx|
    assert clade.all_leaves.all? { |leaf| leaf2attrs.has_key? leaf },
           "Not all leaves are present in the leaf2attrs hash table"

    clade_id = "clade_#{idx+1}___#{clade.name}"

    # One line per clade: ID, number of metadata pairs, then "cat|tag" pairs.
    info_f.puts [clade_id,
                 info.count,
                 info.map { |pair| pair.join("|")}].join "\t"

    # One line per clade: ID, number of leaves, then the leaves themselves.
    clade_members_f.puts [clade_id,
                          clade.all_leaves.count,
                          clade.all_leaves].join "\t"

    attr_names.each do |attr_category|
      attrs_all_leaves =
        leaf2attrs.attrs clade.all_leaves, attr_category

      attrs_all_sibling_leaves =
        leaf2attrs.attrs clade.all_sibling_leaves,
                         attr_category
      attrs_parent_leaves =
        leaf2attrs.attrs clade.parent_leaves,
                         attr_category
      attrs_other_leaves =
        leaf2attrs.attrs clade.other_leaves,
                         attr_category

      # Attributes seen in this clade but not in the comparison group.
      attrs_all_minus_parent =
        attrs_all_leaves.union - attrs_parent_leaves.union
      attrs_all_minus_sibling =
        attrs_all_leaves.union - attrs_all_sibling_leaves.union
      attrs_all_minus_other =
        attrs_all_leaves.union - attrs_other_leaves.union

      puts_info attrs_f,
                clade_id,
                attr_category,
                attrs_all_leaves.union

      puts_info attrs_intersection_f,
                clade_id,
                attr_category,
                attrs_all_leaves.intersection

      puts_info attrs_minus_parent_attrs_f,
                clade_id,
                attr_category,
                attrs_all_minus_parent

      puts_info attrs_minus_sibling_attrs_f,
                clade_id,
                attr_category,
                attrs_all_minus_sibling

      puts_info attrs_minus_other_attrs_f,
                clade_id,
                attr_category,
                attrs_all_minus_other
    end
  end
ensure
  # Closes every file that was successfully opened, including the
  # intersection file.
  open_files.each(&:close)
end
data/lib/tree_clusters.rb CHANGED
@@ -149,11 +149,8 @@ module TreeClusters
149
149
  metadata.each do |md_cat, leaf2mdtag|
150
150
  already_checked = Set.new
151
151
  single_tag_clades = {}
152
- p [md_cat, leaf2mdtag]
153
152
 
154
153
  clades.each do |clade|
155
- p [clade.name, clade.all_leaves]
156
-
157
154
  assert clade.all_leaves.count > 1,
158
155
  "A clade cannot also be a leaf"
159
156
 
@@ -220,6 +217,45 @@ module TreeClusters
220
217
  metadata
221
218
  end
222
219
 
220
# Reads a tab-separated attributes file.
#
# The first line is treated as a header and skipped. Every other line is
# split on tabs into a leaf name, an attribute name and an attribute value.
# Blank lines are ignored.
#
# fname - path of the attributes file
#
# Returns a two element Array:
#   - the sorted Array of all attribute names found in the file
#   - a TreeClusters::Attrs mapping leaf => { attr_name => Set of values };
#     every leaf has an entry (possibly an empty Set) for every attribute
#     name.
def read_attrs_file fname
  rows = []

  # Block form guarantees the handle is closed, and the file is read once
  # rather than once per pass.
  File.open(fname, "rt") do |f|
    f.each_line.with_index do |line, idx|
      next if idx.zero? # header row

      line = line.chomp
      # A blank line would otherwise produce a nil leaf and attribute name.
      next if line.strip.empty?

      leaf, attr_name, attr_val = line.split "\t"
      rows << [leaf, attr_name, attr_val]
    end
  end

  attr_names = rows.map { |_leaf, attr_name, _val| attr_name }.uniq.sort

  attrs = TreeClusters::Attrs.new

  rows.each do |leaf, attr_name, attr_val|
    unless attrs.has_key? leaf
      # Start each leaf with an empty Set for every known attribute name.
      attrs[leaf] = {}
      attr_names.each { |name| attrs[leaf][name] = Set.new }
    end

    attrs[leaf][attr_name] << attr_val
  end

  [attr_names, attrs]
end
258
+
223
259
  # A Hash table for genome/leaf/taxa attributes
224
260
  class Attrs < Hash
225
261
 
@@ -1,3 +1,3 @@
1
1
  module TreeClusters
2
- VERSION = "0.5.2"
2
+ VERSION = "0.6.0"
3
3
  end
@@ -0,0 +1,17 @@
1
+ name attr name attr val
2
+ a-1 fruta manzana
3
+ a-1 fruta pera
4
+ a-1 color rojo
5
+ a-2 fruta manzana
6
+ a-2 color azul
7
+ a-2 fruta pera
8
+ a-2 color rojo
9
+ b-1 fruta pera
10
+ b-1 color blanco
11
+ b-2 color blanco
12
+ bb-1 color blanco
13
+ bbb-1 color blanco
14
+ bbb-2 color blanco
15
+ bbb-2 color gris
16
+ bbb-1 tamaño pequeña
17
+ bbb-2 tamaño pequeña
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tree_clusters
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-11-02 00:00:00.000000000 Z
11
+ date: 2017-11-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -173,6 +173,7 @@ email:
173
173
  - moorer@udel.edu
174
174
  executables:
175
175
  - snazzy_clades
176
+ - snazzy_clades_attrs
176
177
  - snazzy_clades_key_cols
177
178
  extensions: []
178
179
  extra_rdoc_files: []
@@ -188,6 +189,7 @@ files:
188
189
  - bin/console
189
190
  - bin/setup
190
191
  - exe/snazzy_clades
192
+ - exe/snazzy_clades_attrs
191
193
  - exe/snazzy_clades_key_cols
192
194
  - lib/tree_clusters.rb
193
195
  - lib/tree_clusters/version.rb
@@ -195,6 +197,7 @@ files:
195
197
  - test_files/non_bifurcating.aln
196
198
  - test_files/non_bifurcating.tre
197
199
  - test_files/small.aln
200
+ - test_files/small.attrs
198
201
  - test_files/small.mapping
199
202
  - test_files/small.tre
200
203
  - test_files/small_aln_bad_ids