tree_clusters 0.4.0 → 0.5.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/exe/snazzy_clades_key_cols +71 -58
- data/lib/tree_clusters.rb +92 -0
- data/lib/tree_clusters/version.rb +1 -1
- data/test_files/bad.aln +6 -0
- data/test_files/small.aln +9 -9
- data/test_files/small_aln_bad_ids +4 -0
- data/test_files/small_mapping_bad_ids +2 -0
- data/test_files/small_tree_bad_ids +1 -0
- metadata +5 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6fbd972a3068d716241676c936e3867083ad419a
|
4
|
+
data.tar.gz: 973608883e2ca684541f905cbf6644443e217945
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5e55333d9d293b13d2c9c260962adfd0e9ff8b12b6989a4b4bc6f6a8cc67bced83f843d19e3b751b9e917117c724d6307107403a83aba06f9dfe7c8b30ca6b9b
|
7
|
+
data.tar.gz: 7740ace88a99fd9c424a6d49fb83457b4d79e0295cf08fd0664123d08314bcdaa97391bf4f529613ea3151c44567939301343868e4674f9460445a657d445cec
|
data/exe/snazzy_clades_key_cols
CHANGED
@@ -8,21 +8,10 @@ require "parse_fasta"
|
|
8
8
|
require "shannon"
|
9
9
|
require "fileutils"
|
10
10
|
|
11
|
-
|
12
|
-
low_ent_cols = []
|
13
|
-
alns = leaf2attrs.attrs leaves, :aln
|
14
|
-
aln_cols = alns.transpose
|
15
|
-
|
16
|
-
aln_cols.each_with_index do |aln_col, aln_col_idx|
|
17
|
-
has_gaps = aln_col.any? { |aa| aa == "-" }
|
18
|
-
low_entropy = Shannon::entropy(aln_col.join) <= entropy_cutoff
|
19
|
-
|
20
|
-
if !has_gaps && low_entropy
|
21
|
-
low_ent_cols << (aln_col_idx + 1)
|
22
|
-
end
|
23
|
-
end
|
11
|
+
TreeClusters.extend TreeClusters
|
24
12
|
|
25
|
-
|
13
|
+
def puts_info outf, clade_id, key_cols
|
14
|
+
outf.puts [clade_id, key_cols.count, key_cols.to_a].join "\t"
|
26
15
|
end
|
27
16
|
|
28
17
|
opts = Trollop.options do
|
@@ -62,44 +51,61 @@ opts = Trollop.options do
|
|
62
51
|
default: "snazzy_clades")
|
63
52
|
end
|
64
53
|
|
65
|
-
|
54
|
+
abort_if opts[:tree].nil?,
|
55
|
+
"--tree is a required arg"
|
56
|
+
abort_if opts[:mapping].nil?,
|
57
|
+
"--mapping is a required arg"
|
58
|
+
abort_if opts[:aln].nil?,
|
59
|
+
"--aln is a required arg"
|
66
60
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
metadata = TreeClusters.read_mapping_file opts[:mapping]
|
71
|
-
snazzy_clades = TreeClusters.snazzy_clades tree, metadata
|
61
|
+
abort_unless_file_exists opts[:tree]
|
62
|
+
abort_unless_file_exists opts[:mapping]
|
63
|
+
abort_unless_file_exists opts[:aln]
|
72
64
|
|
73
|
-
|
74
|
-
leaf2attrs = TreeClusters::Attrs.new
|
75
|
-
ParseFasta::SeqFile.open(opts[:aln]).each_record do |rec|
|
76
|
-
leaf2attrs[rec.id] = { aln: rec.seq.chars }
|
65
|
+
TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:aln]
|
77
66
|
|
78
|
-
|
67
|
+
abort_unless opts[:entropy_cutoff] >= 0,
|
68
|
+
"--entropy-cutoff must be >= 0"
|
69
|
+
abort_unless opts[:clade_size_cutoff] >= 1,
|
70
|
+
"--clade-size-cutoff must be >= 1"
|
79
71
|
|
80
|
-
|
81
|
-
"Aln len mismatch for #{rec.id}"
|
82
|
-
end
|
72
|
+
FileUtils.mkdir_p opts[:outdir]
|
83
73
|
|
74
|
+
tree = NewickTree.fromFile opts[:tree]
|
75
|
+
metadata = TreeClusters.read_mapping_file opts[:mapping]
|
76
|
+
snazzy_clades = TreeClusters.snazzy_clades tree, metadata
|
77
|
+
leaf2attrs = TreeClusters.read_alignment opts[:aln]
|
78
|
+
|
79
|
+
clades_fname =
|
80
|
+
File.join opts[:outdir],
|
81
|
+
"#{opts[:base]}.snazzy_clades.txt"
|
82
|
+
members_fname =
|
83
|
+
File.join opts[:outdir],
|
84
|
+
"#{opts[:base]}.snazzy_clades_clade_members.txt"
|
85
|
+
all_key_cols_fname =
|
86
|
+
File.join opts[:outdir],
|
87
|
+
"#{opts[:base]}.snazzy_clades_key_cols.txt"
|
88
|
+
key_cols_fname =
|
89
|
+
File.join opts[:outdir],
|
90
|
+
"#{opts[:base]}.snazzy_clades_key_cols.txt"
|
91
|
+
key_cols_minus_parent_cols_fname =
|
92
|
+
File.join opts[:outdir],
|
93
|
+
"#{opts[:base]}.snazzy_clades_key_cols_minus_parent_cols.txt"
|
94
|
+
key_cols_minus_sibling_cols_fname =
|
95
|
+
File.join opts[:outdir],
|
96
|
+
"#{opts[:base]}.snazzy_clades_key_cols_minus_sibling_cols.txt"
|
97
|
+
|
98
|
+
info_f =
|
99
|
+
File.open(clades_fname, "w")
|
100
|
+
clade_members_f =
|
101
|
+
File.open(members_fname, "w")
|
102
|
+
key_cols_f =
|
103
|
+
File.open(key_cols_fname, "w")
|
104
|
+
key_cols_minus_parent_cols_f =
|
105
|
+
File.open(key_cols_minus_parent_cols_fname, "w")
|
106
|
+
key_cols_minus_sibling_cols_f =
|
107
|
+
File.open(key_cols_minus_sibling_cols_fname, "w")
|
84
108
|
|
85
|
-
clades_fname = File.join opts[:outdir],
|
86
|
-
"#{opts[:base]}.snazzy_clades.txt"
|
87
|
-
members_fname = File.join opts[:outdir],
|
88
|
-
"#{opts[:base]}.snazzy_clades_clade_members.txt"
|
89
|
-
all_key_cols_fname = File.join opts[:outdir],
|
90
|
-
"#{opts[:base]}.snazzy_clades_key_cols.txt"
|
91
|
-
key_cols_fname = File.join opts[:outdir],
|
92
|
-
"#{opts[:base]}.snazzy_clades_key_cols.txt"
|
93
|
-
key_cols_minus_parent_cols_fname = File.join opts[:outdir],
|
94
|
-
"#{opts[:base]}.snazzy_clades_key_cols_minus_parent_cols.txt"
|
95
|
-
key_cols_minus_sibling_cols_fname = File.join opts[:outdir],
|
96
|
-
"#{opts[:base]}.snazzy_clades_key_cols_minus_sibling_cols.txt"
|
97
|
-
|
98
|
-
info_f = File.open(clades_fname, "w")
|
99
|
-
clade_members_f = File.open(members_fname, "w")
|
100
|
-
key_cols_f = File.open(key_cols_fname, "w")
|
101
|
-
key_cols_minus_parent_cols_f = File.open(key_cols_minus_parent_cols_fname, "w")
|
102
|
-
key_cols_minus_sibling_cols_f = File.open(key_cols_minus_sibling_cols_fname, "w")
|
103
109
|
|
104
110
|
begin
|
105
111
|
# info is { metadata_category => metadata_tag , ... }
|
@@ -115,27 +121,34 @@ begin
|
|
115
121
|
clade.all_leaves].join "\t"
|
116
122
|
|
117
123
|
key_cols_all_leaves =
|
118
|
-
|
124
|
+
TreeClusters.low_ent_cols clade.all_leaves,
|
125
|
+
leaf2attrs,
|
126
|
+
opts[:entropy_cutoff]
|
119
127
|
key_cols_all_sibling_leaves =
|
120
|
-
|
128
|
+
TreeClusters.low_ent_cols clade.all_sibling_leaves,
|
129
|
+
leaf2attrs,
|
130
|
+
opts[:entropy_cutoff]
|
121
131
|
key_cols_parent_leaves =
|
122
|
-
|
132
|
+
TreeClusters.low_ent_cols clade.parent_leaves,
|
133
|
+
leaf2attrs,
|
134
|
+
opts[:entropy_cutoff]
|
123
135
|
|
124
136
|
key_cols_all_minus_sibling =
|
125
137
|
key_cols_all_leaves - key_cols_all_sibling_leaves
|
126
138
|
key_cols_all_minus_parent =
|
127
139
|
key_cols_all_leaves - key_cols_parent_leaves
|
128
140
|
|
129
|
-
key_cols_f
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
key_cols_all_minus_sibling.count,
|
137
|
-
key_cols_all_minus_sibling.to_a].join "\t"
|
141
|
+
puts_info key_cols_f,
|
142
|
+
clade_id,
|
143
|
+
key_cols_all_leaves
|
144
|
+
|
145
|
+
puts_info key_cols_minus_parent_cols_f,
|
146
|
+
clade_id,
|
147
|
+
key_cols_all_minus_parent
|
138
148
|
|
149
|
+
puts_info key_cols_minus_sibling_cols_f,
|
150
|
+
clade_id,
|
151
|
+
key_cols_all_minus_sibling
|
139
152
|
end
|
140
153
|
ensure
|
141
154
|
info_f.close
|
data/lib/tree_clusters.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
require "abort_if"
|
2
2
|
require "Newick"
|
3
3
|
require "set"
|
4
|
+
require "parse_fasta"
|
5
|
+
require "shannon"
|
4
6
|
require "tree_clusters/version"
|
5
7
|
|
6
8
|
include AbortIf
|
@@ -25,6 +27,96 @@ end
|
|
25
27
|
# Top level namespace of the Gem.
|
26
28
|
module TreeClusters
|
27
29
|
|
30
|
+
# Given an ary of strings, find the most common string in the ary.
|
31
|
+
#
|
32
|
+
# @param bases [Array<String>] an array of strings
|
33
|
+
#
|
34
|
+
# @return [String] the most common string in the ary.
|
35
|
+
#
|
36
|
+
# @example Upper case and lower case count as the same.
|
37
|
+
# TreeClusters::consensus %w[a A C T] #=> "A"
|
38
|
+
# @example Ties take the one closest to the end
|
39
|
+
# TreeClusters::consensus %w[a c T t C t g] #=> "T"
|
40
|
+
#
|
41
|
+
# @note Each string is upcase'd before frequencies are calculated.
|
42
|
+
def consensus bases
|
43
|
+
bases.
|
44
|
+
map(&:upcase).
|
45
|
+
group_by(&:itself).
|
46
|
+
sort_by { |_, bases| bases.count }.
|
47
|
+
reverse.
|
48
|
+
first.
|
49
|
+
first
|
50
|
+
end
|
51
|
+
|
52
|
+
def read_alignment aln_fname
|
53
|
+
leaf2attrs = TreeClusters::Attrs.new
|
54
|
+
aln_len = nil
|
55
|
+
ParseFasta::SeqFile.open(aln_fname).each_record do |rec|
|
56
|
+
leaf2attrs[rec.id] = { aln: rec.seq.chars }
|
57
|
+
|
58
|
+
aln_len ||= rec.seq.length
|
59
|
+
|
60
|
+
abort_unless aln_len == rec.seq.length,
|
61
|
+
"Aln len mismatch for #{rec.id}"
|
62
|
+
end
|
63
|
+
|
64
|
+
leaf2attrs
|
65
|
+
end
|
66
|
+
|
67
|
+
def low_ent_cols leaves, leaf2attrs, entropy_cutoff
|
68
|
+
low_ent_cols = []
|
69
|
+
alns = leaf2attrs.attrs leaves, :aln
|
70
|
+
aln_cols = alns.transpose
|
71
|
+
|
72
|
+
aln_cols.each_with_index do |aln_col, aln_col_idx|
|
73
|
+
has_gaps = aln_col.any? { |aa| aa == "-" }
|
74
|
+
low_entropy =
|
75
|
+
Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
|
76
|
+
|
77
|
+
if !has_gaps && low_entropy
|
78
|
+
low_ent_cols << (aln_col_idx + 1)
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
Set.new low_ent_cols
|
83
|
+
end
|
84
|
+
|
85
|
+
def check_ids tree, mapping, aln
|
86
|
+
tree_ids = Set.new(NewickTree.fromFile(tree).taxa)
|
87
|
+
|
88
|
+
mapping_ids = Set.new
|
89
|
+
File.open(mapping, "rt").each_line.with_index do |line, idx|
|
90
|
+
unless idx.zero?
|
91
|
+
id, *rest = line.chomp.split
|
92
|
+
|
93
|
+
mapping_ids << id
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
aln_ids = Set.new
|
98
|
+
ParseFasta::SeqFile.open(aln).each_record do |rec|
|
99
|
+
aln_ids << rec.id
|
100
|
+
end
|
101
|
+
|
102
|
+
if !(tree_ids == mapping_ids && mapping_ids == aln_ids)
|
103
|
+
AbortIf::logger.error { "Seq IDs did not match in all input files" }
|
104
|
+
|
105
|
+
tree_ids = tree_ids.to_a.sort
|
106
|
+
mapping_ids = mapping_ids.to_a.sort
|
107
|
+
aln_ids = aln_ids.to_a.sort
|
108
|
+
|
109
|
+
AbortIf::logger.debug { ["tree_ids", tree_ids].join "\t" }
|
110
|
+
AbortIf::logger.debug { ["mapping_ids", mapping_ids].join "\t" }
|
111
|
+
AbortIf::logger.debug { ["aln_ids", aln_ids].join "\t" }
|
112
|
+
|
113
|
+
raise AbortIf::Exit
|
114
|
+
else
|
115
|
+
true
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
|
28
120
|
# Given a NewickTree, return an array of all Clades in that tree.
|
29
121
|
#
|
30
122
|
# @param tree [NewickTree] a NewickTree object
|
data/test_files/small.aln
CHANGED
@@ -0,0 +1 @@
|
|
1
|
+
(a, b);
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tree_clusters
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Moore
|
@@ -191,11 +191,15 @@ files:
|
|
191
191
|
- exe/snazzy_clades_key_cols
|
192
192
|
- lib/tree_clusters.rb
|
193
193
|
- lib/tree_clusters/version.rb
|
194
|
+
- test_files/bad.aln
|
194
195
|
- test_files/non_bifurcating.aln
|
195
196
|
- test_files/non_bifurcating.tre
|
196
197
|
- test_files/small.aln
|
197
198
|
- test_files/small.mapping
|
198
199
|
- test_files/small.tre
|
200
|
+
- test_files/small_aln_bad_ids
|
201
|
+
- test_files/small_mapping_bad_ids
|
202
|
+
- test_files/small_tree_bad_ids
|
199
203
|
- test_files/test.tre
|
200
204
|
- tree_clusters.gemspec
|
201
205
|
homepage: https://github.com/mooreryan/tree_clusters
|