tree_clusters 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/exe/snazzy_clades_key_cols +71 -58
- data/lib/tree_clusters.rb +92 -0
- data/lib/tree_clusters/version.rb +1 -1
- data/test_files/bad.aln +6 -0
- data/test_files/small.aln +9 -9
- data/test_files/small_aln_bad_ids +4 -0
- data/test_files/small_mapping_bad_ids +2 -0
- data/test_files/small_tree_bad_ids +1 -0
- metadata +5 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6fbd972a3068d716241676c936e3867083ad419a
|
4
|
+
data.tar.gz: 973608883e2ca684541f905cbf6644443e217945
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5e55333d9d293b13d2c9c260962adfd0e9ff8b12b6989a4b4bc6f6a8cc67bced83f843d19e3b751b9e917117c724d6307107403a83aba06f9dfe7c8b30ca6b9b
|
7
|
+
data.tar.gz: 7740ace88a99fd9c424a6d49fb83457b4d79e0295cf08fd0664123d08314bcdaa97391bf4f529613ea3151c44567939301343868e4674f9460445a657d445cec
|
data/exe/snazzy_clades_key_cols
CHANGED
@@ -8,21 +8,10 @@ require "parse_fasta"
|
|
8
8
|
require "shannon"
|
9
9
|
require "fileutils"
|
10
10
|
|
11
|
-
|
12
|
-
low_ent_cols = []
|
13
|
-
alns = leaf2attrs.attrs leaves, :aln
|
14
|
-
aln_cols = alns.transpose
|
15
|
-
|
16
|
-
aln_cols.each_with_index do |aln_col, aln_col_idx|
|
17
|
-
has_gaps = aln_col.any? { |aa| aa == "-" }
|
18
|
-
low_entropy = Shannon::entropy(aln_col.join) <= entropy_cutoff
|
19
|
-
|
20
|
-
if !has_gaps && low_entropy
|
21
|
-
low_ent_cols << (aln_col_idx + 1)
|
22
|
-
end
|
23
|
-
end
|
11
|
+
TreeClusters.extend TreeClusters
|
24
12
|
|
25
|
-
|
13
|
+
def puts_info outf, clade_id, key_cols
|
14
|
+
outf.puts [clade_id, key_cols.count, key_cols.to_a].join "\t"
|
26
15
|
end
|
27
16
|
|
28
17
|
opts = Trollop.options do
|
@@ -62,44 +51,61 @@ opts = Trollop.options do
|
|
62
51
|
default: "snazzy_clades")
|
63
52
|
end
|
64
53
|
|
65
|
-
|
54
|
+
abort_if opts[:tree].nil?,
|
55
|
+
"--tree is a required arg"
|
56
|
+
abort_if opts[:mapping].nil?,
|
57
|
+
"--mapping is a required arg"
|
58
|
+
abort_if opts[:aln].nil?,
|
59
|
+
"--aln is a required arg"
|
66
60
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
metadata = TreeClusters.read_mapping_file opts[:mapping]
|
71
|
-
snazzy_clades = TreeClusters.snazzy_clades tree, metadata
|
61
|
+
abort_unless_file_exists opts[:tree]
|
62
|
+
abort_unless_file_exists opts[:mapping]
|
63
|
+
abort_unless_file_exists opts[:aln]
|
72
64
|
|
73
|
-
|
74
|
-
leaf2attrs = TreeClusters::Attrs.new
|
75
|
-
ParseFasta::SeqFile.open(opts[:aln]).each_record do |rec|
|
76
|
-
leaf2attrs[rec.id] = { aln: rec.seq.chars }
|
65
|
+
TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:aln]
|
77
66
|
|
78
|
-
|
67
|
+
abort_unless opts[:entropy_cutoff] >= 0,
|
68
|
+
"--entropy-cutoff must be >= 0"
|
69
|
+
abort_unless opts[:clade_size_cutoff] >= 1,
|
70
|
+
"--clade-size-cutoff must be >= 1"
|
79
71
|
|
80
|
-
|
81
|
-
"Aln len mismatch for #{rec.id}"
|
82
|
-
end
|
72
|
+
FileUtils.mkdir_p opts[:outdir]
|
83
73
|
|
74
|
+
tree = NewickTree.fromFile opts[:tree]
|
75
|
+
metadata = TreeClusters.read_mapping_file opts[:mapping]
|
76
|
+
snazzy_clades = TreeClusters.snazzy_clades tree, metadata
|
77
|
+
leaf2attrs = TreeClusters.read_alignment opts[:aln]
|
78
|
+
|
79
|
+
clades_fname =
|
80
|
+
File.join opts[:outdir],
|
81
|
+
"#{opts[:base]}.snazzy_clades.txt"
|
82
|
+
members_fname =
|
83
|
+
File.join opts[:outdir],
|
84
|
+
"#{opts[:base]}.snazzy_clades_clade_members.txt"
|
85
|
+
all_key_cols_fname =
|
86
|
+
File.join opts[:outdir],
|
87
|
+
"#{opts[:base]}.snazzy_clades_key_cols.txt"
|
88
|
+
key_cols_fname =
|
89
|
+
File.join opts[:outdir],
|
90
|
+
"#{opts[:base]}.snazzy_clades_key_cols.txt"
|
91
|
+
key_cols_minus_parent_cols_fname =
|
92
|
+
File.join opts[:outdir],
|
93
|
+
"#{opts[:base]}.snazzy_clades_key_cols_minus_parent_cols.txt"
|
94
|
+
key_cols_minus_sibling_cols_fname =
|
95
|
+
File.join opts[:outdir],
|
96
|
+
"#{opts[:base]}.snazzy_clades_key_cols_minus_sibling_cols.txt"
|
97
|
+
|
98
|
+
info_f =
|
99
|
+
File.open(clades_fname, "w")
|
100
|
+
clade_members_f =
|
101
|
+
File.open(members_fname, "w")
|
102
|
+
key_cols_f =
|
103
|
+
File.open(key_cols_fname, "w")
|
104
|
+
key_cols_minus_parent_cols_f =
|
105
|
+
File.open(key_cols_minus_parent_cols_fname, "w")
|
106
|
+
key_cols_minus_sibling_cols_f =
|
107
|
+
File.open(key_cols_minus_sibling_cols_fname, "w")
|
84
108
|
|
85
|
-
clades_fname = File.join opts[:outdir],
|
86
|
-
"#{opts[:base]}.snazzy_clades.txt"
|
87
|
-
members_fname = File.join opts[:outdir],
|
88
|
-
"#{opts[:base]}.snazzy_clades_clade_members.txt"
|
89
|
-
all_key_cols_fname = File.join opts[:outdir],
|
90
|
-
"#{opts[:base]}.snazzy_clades_key_cols.txt"
|
91
|
-
key_cols_fname = File.join opts[:outdir],
|
92
|
-
"#{opts[:base]}.snazzy_clades_key_cols.txt"
|
93
|
-
key_cols_minus_parent_cols_fname = File.join opts[:outdir],
|
94
|
-
"#{opts[:base]}.snazzy_clades_key_cols_minus_parent_cols.txt"
|
95
|
-
key_cols_minus_sibling_cols_fname = File.join opts[:outdir],
|
96
|
-
"#{opts[:base]}.snazzy_clades_key_cols_minus_sibling_cols.txt"
|
97
|
-
|
98
|
-
info_f = File.open(clades_fname, "w")
|
99
|
-
clade_members_f = File.open(members_fname, "w")
|
100
|
-
key_cols_f = File.open(key_cols_fname, "w")
|
101
|
-
key_cols_minus_parent_cols_f = File.open(key_cols_minus_parent_cols_fname, "w")
|
102
|
-
key_cols_minus_sibling_cols_f = File.open(key_cols_minus_sibling_cols_fname, "w")
|
103
109
|
|
104
110
|
begin
|
105
111
|
# info is { metadata_category => metadata_tag , ... }
|
@@ -115,27 +121,34 @@ begin
|
|
115
121
|
clade.all_leaves].join "\t"
|
116
122
|
|
117
123
|
key_cols_all_leaves =
|
118
|
-
|
124
|
+
TreeClusters.low_ent_cols clade.all_leaves,
|
125
|
+
leaf2attrs,
|
126
|
+
opts[:entropy_cutoff]
|
119
127
|
key_cols_all_sibling_leaves =
|
120
|
-
|
128
|
+
TreeClusters.low_ent_cols clade.all_sibling_leaves,
|
129
|
+
leaf2attrs,
|
130
|
+
opts[:entropy_cutoff]
|
121
131
|
key_cols_parent_leaves =
|
122
|
-
|
132
|
+
TreeClusters.low_ent_cols clade.parent_leaves,
|
133
|
+
leaf2attrs,
|
134
|
+
opts[:entropy_cutoff]
|
123
135
|
|
124
136
|
key_cols_all_minus_sibling =
|
125
137
|
key_cols_all_leaves - key_cols_all_sibling_leaves
|
126
138
|
key_cols_all_minus_parent =
|
127
139
|
key_cols_all_leaves - key_cols_parent_leaves
|
128
140
|
|
129
|
-
key_cols_f
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
key_cols_all_minus_sibling.count,
|
137
|
-
key_cols_all_minus_sibling.to_a].join "\t"
|
141
|
+
puts_info key_cols_f,
|
142
|
+
clade_id,
|
143
|
+
key_cols_all_leaves
|
144
|
+
|
145
|
+
puts_info key_cols_minus_parent_cols_f,
|
146
|
+
clade_id,
|
147
|
+
key_cols_all_minus_parent
|
138
148
|
|
149
|
+
puts_info key_cols_minus_sibling_cols_f,
|
150
|
+
clade_id,
|
151
|
+
key_cols_all_minus_sibling
|
139
152
|
end
|
140
153
|
ensure
|
141
154
|
info_f.close
|
data/lib/tree_clusters.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require "abort_if"
|
2
2
|
require "Newick"
|
3
3
|
require "set"
|
4
|
+
require "parse_fasta"
|
5
|
+
require "shannon"
|
4
6
|
require "tree_clusters/version"
|
5
7
|
|
6
8
|
include AbortIf
|
@@ -25,6 +27,96 @@ end
|
|
25
27
|
# Top level namespace of the Gem.
|
26
28
|
module TreeClusters
|
27
29
|
|
30
|
+
# Given an ary of strings, find the most common string in the ary.
|
31
|
+
#
|
32
|
+
# @param bases [Array<String>] an array of strings
|
33
|
+
#
|
34
|
+
# @return most_common_str [String] the most common string in the ary.
|
35
|
+
#
|
36
|
+
# @example Upper case and lower case count as the same.
|
37
|
+
# TreeClusters::consensus %w[a A C T] #=> "A"
|
38
|
+
# @example Ties take the one closest to the end
|
39
|
+
# TreeClusters::consensus %w[a c T t C t g] #=> "T"
|
40
|
+
#
|
41
|
+
# @note Each string is upcase'd before frequencies are calculated.
|
42
|
+
def consensus bases
|
43
|
+
bases.
|
44
|
+
map(&:upcase).
|
45
|
+
group_by(&:itself).
|
46
|
+
sort_by { |_, bases| bases.count }.
|
47
|
+
reverse.
|
48
|
+
first.
|
49
|
+
first
|
50
|
+
end
|
51
|
+
|
52
|
+
def read_alignment aln_fname
|
53
|
+
leaf2attrs = TreeClusters::Attrs.new
|
54
|
+
aln_len = nil
|
55
|
+
ParseFasta::SeqFile.open(aln_fname).each_record do |rec|
|
56
|
+
leaf2attrs[rec.id] = { aln: rec.seq.chars }
|
57
|
+
|
58
|
+
aln_len ||= rec.seq.length
|
59
|
+
|
60
|
+
abort_unless aln_len == rec.seq.length,
|
61
|
+
"Aln len mismatch for #{rec.id}"
|
62
|
+
end
|
63
|
+
|
64
|
+
leaf2attrs
|
65
|
+
end
|
66
|
+
|
67
|
+
def low_ent_cols leaves, leaf2attrs, entropy_cutoff
|
68
|
+
low_ent_cols = []
|
69
|
+
alns = leaf2attrs.attrs leaves, :aln
|
70
|
+
aln_cols = alns.transpose
|
71
|
+
|
72
|
+
aln_cols.each_with_index do |aln_col, aln_col_idx|
|
73
|
+
has_gaps = aln_col.any? { |aa| aa == "-" }
|
74
|
+
low_entropy =
|
75
|
+
Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
|
76
|
+
|
77
|
+
if !has_gaps && low_entropy
|
78
|
+
low_ent_cols << (aln_col_idx + 1)
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
Set.new low_ent_cols
|
83
|
+
end
|
84
|
+
|
85
|
+
def check_ids tree, mapping, aln
|
86
|
+
tree_ids = Set.new(NewickTree.fromFile(tree).taxa)
|
87
|
+
|
88
|
+
mapping_ids = Set.new
|
89
|
+
File.open(mapping, "rt").each_line.with_index do |line, idx|
|
90
|
+
unless idx.zero?
|
91
|
+
id, *rest = line.chomp.split
|
92
|
+
|
93
|
+
mapping_ids << id
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
aln_ids = Set.new
|
98
|
+
ParseFasta::SeqFile.open(aln).each_record do |rec|
|
99
|
+
aln_ids << rec.id
|
100
|
+
end
|
101
|
+
|
102
|
+
if !(tree_ids == mapping_ids && mapping_ids == aln_ids)
|
103
|
+
AbortIf::logger.error { "Seq IDs did not match in all input files" }
|
104
|
+
|
105
|
+
tree_ids = tree_ids.to_a.sort
|
106
|
+
mapping_ids = mapping_ids.to_a.sort
|
107
|
+
aln_ids = aln_ids.to_a.sort
|
108
|
+
|
109
|
+
AbortIf::logger.debug { ["tree_ids", tree_ids].join "\t" }
|
110
|
+
AbortIf::logger.debug { ["mapping_ids", mapping_ids].join "\t" }
|
111
|
+
AbortIf::logger.debug { ["aln_ids", aln_ids].join "\t" }
|
112
|
+
|
113
|
+
raise AbortIf::Exit
|
114
|
+
else
|
115
|
+
true
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
|
28
120
|
# Given a NewickTree, return an array of all Clades in that tree.
|
29
121
|
#
|
30
122
|
# @param tree [NewickTree] a NewickTree object
|
data/test_files/small.aln
CHANGED
@@ -0,0 +1 @@
|
|
1
|
+
(a, b);
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tree_clusters
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
@@ -191,11 +191,15 @@ files:
|
|
191
191
|
- exe/snazzy_clades_key_cols
|
192
192
|
- lib/tree_clusters.rb
|
193
193
|
- lib/tree_clusters/version.rb
|
194
|
+
- test_files/bad.aln
|
194
195
|
- test_files/non_bifurcating.aln
|
195
196
|
- test_files/non_bifurcating.tre
|
196
197
|
- test_files/small.aln
|
197
198
|
- test_files/small.mapping
|
198
199
|
- test_files/small.tre
|
200
|
+
- test_files/small_aln_bad_ids
|
201
|
+
- test_files/small_mapping_bad_ids
|
202
|
+
- test_files/small_tree_bad_ids
|
199
203
|
- test_files/test.tre
|
200
204
|
- tree_clusters.gemspec
|
201
205
|
homepage: https://github.com/mooreryan/tree_clusters
|