tree_clusters 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ea8e2199383e573da42d0a6d265b446de07bea4c
4
- data.tar.gz: 775d0ace4a7398438ed4c4a34504ae9d7128d7c1
3
+ metadata.gz: 6fbd972a3068d716241676c936e3867083ad419a
4
+ data.tar.gz: 973608883e2ca684541f905cbf6644443e217945
5
5
  SHA512:
6
- metadata.gz: db7302cad104d6ae05e10a4f02b1880016ba9a960cb44209d283cf9c5d730dc39ebd26a6cf64d40bd0e516d6253692112615bde41dc0e1e935ee0bc582cb36c4
7
- data.tar.gz: 8dbb10b889de0f3c7db92d2138c89bdcf5357279daadaf12c97ac2e7d9b74980e3e9219308e4bb3252c93bb92b9230d77e5dc166ebd5633f36e949a54c72ca94
6
+ metadata.gz: 5e55333d9d293b13d2c9c260962adfd0e9ff8b12b6989a4b4bc6f6a8cc67bced83f843d19e3b751b9e917117c724d6307107403a83aba06f9dfe7c8b30ca6b9b
7
+ data.tar.gz: 7740ace88a99fd9c424a6d49fb83457b4d79e0295cf08fd0664123d08314bcdaa97391bf4f529613ea3151c44567939301343868e4674f9460445a657d445cec
@@ -8,21 +8,10 @@ require "parse_fasta"
8
8
  require "shannon"
9
9
  require "fileutils"
10
10
 
11
- def get_low_ent_cols leaves, leaf2attrs, entropy_cutoff
12
- low_ent_cols = []
13
- alns = leaf2attrs.attrs leaves, :aln
14
- aln_cols = alns.transpose
15
-
16
- aln_cols.each_with_index do |aln_col, aln_col_idx|
17
- has_gaps = aln_col.any? { |aa| aa == "-" }
18
- low_entropy = Shannon::entropy(aln_col.join) <= entropy_cutoff
19
-
20
- if !has_gaps && low_entropy
21
- low_ent_cols << (aln_col_idx + 1)
22
- end
23
- end
11
+ TreeClusters.extend TreeClusters
24
12
 
25
- Set.new low_ent_cols
13
+ def puts_info outf, clade_id, key_cols
14
+ outf.puts [clade_id, key_cols.count, key_cols.to_a].join "\t"
26
15
  end
27
16
 
28
17
  opts = Trollop.options do
@@ -62,44 +51,61 @@ opts = Trollop.options do
62
51
  default: "snazzy_clades")
63
52
  end
64
53
 
65
- FileUtils.mkdir_p opts[:outdir]
54
+ abort_if opts[:tree].nil?,
55
+ "--tree is a required arg"
56
+ abort_if opts[:mapping].nil?,
57
+ "--mapping is a required arg"
58
+ abort_if opts[:aln].nil?,
59
+ "--aln is a required arg"
66
60
 
67
- TreeClusters.extend TreeClusters
68
-
69
- tree = NewickTree.fromFile opts[:tree]
70
- metadata = TreeClusters.read_mapping_file opts[:mapping]
71
- snazzy_clades = TreeClusters.snazzy_clades tree, metadata
61
+ abort_unless_file_exists opts[:tree]
62
+ abort_unless_file_exists opts[:mapping]
63
+ abort_unless_file_exists opts[:aln]
72
64
 
73
- aln_len = nil
74
- leaf2attrs = TreeClusters::Attrs.new
75
- ParseFasta::SeqFile.open(opts[:aln]).each_record do |rec|
76
- leaf2attrs[rec.id] = { aln: rec.seq.chars }
65
+ TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:aln]
77
66
 
78
- aln_len ||= rec.seq.length
67
+ abort_unless opts[:entropy_cutoff] >= 0,
68
+ "--entropy-cutoff must be >= 0"
69
+ abort_unless opts[:clade_size_cutoff] >= 1,
70
+ "--clade-size-cutoff must be >= 1"
79
71
 
80
- abort_unless aln_len == rec.seq.length,
81
- "Aln len mismatch for #{rec.id}"
82
- end
72
+ FileUtils.mkdir_p opts[:outdir]
83
73
 
74
+ tree = NewickTree.fromFile opts[:tree]
75
+ metadata = TreeClusters.read_mapping_file opts[:mapping]
76
+ snazzy_clades = TreeClusters.snazzy_clades tree, metadata
77
+ leaf2attrs = TreeClusters.read_alignment opts[:aln]
78
+
79
+ clades_fname =
80
+ File.join opts[:outdir],
81
+ "#{opts[:base]}.snazzy_clades.txt"
82
+ members_fname =
83
+ File.join opts[:outdir],
84
+ "#{opts[:base]}.snazzy_clades_clade_members.txt"
85
+ all_key_cols_fname =
86
+ File.join opts[:outdir],
87
+ "#{opts[:base]}.snazzy_clades_key_cols.txt"
88
+ key_cols_fname =
89
+ File.join opts[:outdir],
90
+ "#{opts[:base]}.snazzy_clades_key_cols.txt"
91
+ key_cols_minus_parent_cols_fname =
92
+ File.join opts[:outdir],
93
+ "#{opts[:base]}.snazzy_clades_key_cols_minus_parent_cols.txt"
94
+ key_cols_minus_sibling_cols_fname =
95
+ File.join opts[:outdir],
96
+ "#{opts[:base]}.snazzy_clades_key_cols_minus_sibling_cols.txt"
97
+
98
+ info_f =
99
+ File.open(clades_fname, "w")
100
+ clade_members_f =
101
+ File.open(members_fname, "w")
102
+ key_cols_f =
103
+ File.open(key_cols_fname, "w")
104
+ key_cols_minus_parent_cols_f =
105
+ File.open(key_cols_minus_parent_cols_fname, "w")
106
+ key_cols_minus_sibling_cols_f =
107
+ File.open(key_cols_minus_sibling_cols_fname, "w")
84
108
 
85
- clades_fname = File.join opts[:outdir],
86
- "#{opts[:base]}.snazzy_clades.txt"
87
- members_fname = File.join opts[:outdir],
88
- "#{opts[:base]}.snazzy_clades_clade_members.txt"
89
- all_key_cols_fname = File.join opts[:outdir],
90
- "#{opts[:base]}.snazzy_clades_key_cols.txt"
91
- key_cols_fname = File.join opts[:outdir],
92
- "#{opts[:base]}.snazzy_clades_key_cols.txt"
93
- key_cols_minus_parent_cols_fname = File.join opts[:outdir],
94
- "#{opts[:base]}.snazzy_clades_key_cols_minus_parent_cols.txt"
95
- key_cols_minus_sibling_cols_fname = File.join opts[:outdir],
96
- "#{opts[:base]}.snazzy_clades_key_cols_minus_sibling_cols.txt"
97
-
98
- info_f = File.open(clades_fname, "w")
99
- clade_members_f = File.open(members_fname, "w")
100
- key_cols_f = File.open(key_cols_fname, "w")
101
- key_cols_minus_parent_cols_f = File.open(key_cols_minus_parent_cols_fname, "w")
102
- key_cols_minus_sibling_cols_f = File.open(key_cols_minus_sibling_cols_fname, "w")
103
109
 
104
110
  begin
105
111
  # info is { metadata_category => metadata_tag , ... }
@@ -115,27 +121,34 @@ begin
115
121
  clade.all_leaves].join "\t"
116
122
 
117
123
  key_cols_all_leaves =
118
- get_low_ent_cols clade.all_leaves, leaf2attrs, opts[:entropy_cutoff]
124
+ TreeClusters.low_ent_cols clade.all_leaves,
125
+ leaf2attrs,
126
+ opts[:entropy_cutoff]
119
127
  key_cols_all_sibling_leaves =
120
- get_low_ent_cols clade.all_sibling_leaves, leaf2attrs, opts[:entropy_cutoff]
128
+ TreeClusters.low_ent_cols clade.all_sibling_leaves,
129
+ leaf2attrs,
130
+ opts[:entropy_cutoff]
121
131
  key_cols_parent_leaves =
122
- get_low_ent_cols clade.parent_leaves, leaf2attrs, opts[:entropy_cutoff]
132
+ TreeClusters.low_ent_cols clade.parent_leaves,
133
+ leaf2attrs,
134
+ opts[:entropy_cutoff]
123
135
 
124
136
  key_cols_all_minus_sibling =
125
137
  key_cols_all_leaves - key_cols_all_sibling_leaves
126
138
  key_cols_all_minus_parent =
127
139
  key_cols_all_leaves - key_cols_parent_leaves
128
140
 
129
- key_cols_f.puts [clade_id,
130
- key_cols_all_leaves.count,
131
- key_cols_all_leaves.to_a].join "\t"
132
- key_cols_minus_parent_cols_f.puts [clade_id,
133
- key_cols_all_minus_parent.count,
134
- key_cols_all_minus_parent.to_a].join "\t"
135
- key_cols_minus_sibling_cols_f.puts [clade_id,
136
- key_cols_all_minus_sibling.count,
137
- key_cols_all_minus_sibling.to_a].join "\t"
141
+ puts_info key_cols_f,
142
+ clade_id,
143
+ key_cols_all_leaves
144
+
145
+ puts_info key_cols_minus_parent_cols_f,
146
+ clade_id,
147
+ key_cols_all_minus_parent
138
148
 
149
+ puts_info key_cols_minus_sibling_cols_f,
150
+ clade_id,
151
+ key_cols_all_minus_sibling
139
152
  end
140
153
  ensure
141
154
  info_f.close
data/lib/tree_clusters.rb CHANGED
@@ -1,6 +1,8 @@
1
1
  require "abort_if"
2
2
  require "Newick"
3
3
  require "set"
4
+ require "parse_fasta"
5
+ require "shannon"
4
6
  require "tree_clusters/version"
5
7
 
6
8
  include AbortIf
@@ -25,6 +27,96 @@ end
25
27
  # Top level namespace of the Gem.
26
28
  module TreeClusters
27
29
 
30
+ # Given an ary of strings, find the most common string in the ary.
31
+ #
32
+ # @param bases [Array<String>] an array of strings
33
+ #
34
+ # @return most_common_str [String] the most common string in the ary.
35
+ #
36
+ # @example Upper case and lower case count as the same.
37
+ # TreeClusters::consensus %w[a A C T] #=> "A"
38
+ # @example Ties take the one closest to the end
39
+ # TreeClusters::consensus %w[a c T t C t g] #=> "T"
40
+ #
41
+ # @note Each string is upcase'd before frequencies are calculated.
42
+ def consensus bases
43
+ bases.
44
+ map(&:upcase).
45
+ group_by(&:itself).
46
+ sort_by { |_, bases| bases.count }.
47
+ reverse.
48
+ first.
49
+ first
50
+ end
51
+
52
+ def read_alignment aln_fname
53
+ leaf2attrs = TreeClusters::Attrs.new
54
+ aln_len = nil
55
+ ParseFasta::SeqFile.open(aln_fname).each_record do |rec|
56
+ leaf2attrs[rec.id] = { aln: rec.seq.chars }
57
+
58
+ aln_len ||= rec.seq.length
59
+
60
+ abort_unless aln_len == rec.seq.length,
61
+ "Aln len mismatch for #{rec.id}"
62
+ end
63
+
64
+ leaf2attrs
65
+ end
66
+
67
+ def low_ent_cols leaves, leaf2attrs, entropy_cutoff
68
+ low_ent_cols = []
69
+ alns = leaf2attrs.attrs leaves, :aln
70
+ aln_cols = alns.transpose
71
+
72
+ aln_cols.each_with_index do |aln_col, aln_col_idx|
73
+ has_gaps = aln_col.any? { |aa| aa == "-" }
74
+ low_entropy =
75
+ Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
76
+
77
+ if !has_gaps && low_entropy
78
+ low_ent_cols << (aln_col_idx + 1)
79
+ end
80
+ end
81
+
82
+ Set.new low_ent_cols
83
+ end
84
+
85
+ def check_ids tree, mapping, aln
86
+ tree_ids = Set.new(NewickTree.fromFile(tree).taxa)
87
+
88
+ mapping_ids = Set.new
89
+ File.open(mapping, "rt").each_line.with_index do |line, idx|
90
+ unless idx.zero?
91
+ id, *rest = line.chomp.split
92
+
93
+ mapping_ids << id
94
+ end
95
+ end
96
+
97
+ aln_ids = Set.new
98
+ ParseFasta::SeqFile.open(aln).each_record do |rec|
99
+ aln_ids << rec.id
100
+ end
101
+
102
+ if !(tree_ids == mapping_ids && mapping_ids == aln_ids)
103
+ AbortIf::logger.error { "Seq IDs did not match in all input files" }
104
+
105
+ tree_ids = tree_ids.to_a.sort
106
+ mapping_ids = mapping_ids.to_a.sort
107
+ aln_ids = aln_ids.to_a.sort
108
+
109
+ AbortIf::logger.debug { ["tree_ids", tree_ids].join "\t" }
110
+ AbortIf::logger.debug { ["mapping_ids", mapping_ids].join "\t" }
111
+ AbortIf::logger.debug { ["aln_ids", aln_ids].join "\t" }
112
+
113
+ raise AbortIf::Exit
114
+ else
115
+ true
116
+ end
117
+ end
118
+
119
+
28
120
  # Given a NewickTree, return an array of all Clades in that tree.
29
121
  #
30
122
  # @param tree [NewickTree] a NewickTree object
@@ -1,3 +1,3 @@
1
1
  module TreeClusters
2
- VERSION = "0.4.0"
2
+ VERSION = "0.5.0"
3
3
  end
@@ -0,0 +1,6 @@
1
+ >s1
2
+ AAA
3
+ >s2
4
+ AAAA
5
+ >s3
6
+ AA
data/test_files/small.aln CHANGED
@@ -1,14 +1,14 @@
1
- >a-1
1
+ >a-1 apple
2
2
  AAAA
3
- >a-2
3
+ >a-2 pie
4
4
  AAAT
5
- >b-1
5
+ >b-1 is
6
6
  CCCC
7
- >b-2
8
- CCCT
9
- >bb-1
7
+ >b-2 really
8
+ cccT
9
+ >bb-1 good
10
10
  CCTG
11
- >bbb-1
12
- CCGG
13
- >bbb-2
11
+ >bbb-1 and
12
+ CCGg
13
+ >bbb-2 tasty
14
14
  CGGG
@@ -0,0 +1,4 @@
1
+ >apple
2
+ aRSOIN
3
+ >pie
4
+ ARSOIT
@@ -0,0 +1,2 @@
1
+ snazzy pie
2
+ apple thing
@@ -0,0 +1 @@
1
+ (a, b);
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tree_clusters
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
@@ -191,11 +191,15 @@ files:
191
191
  - exe/snazzy_clades_key_cols
192
192
  - lib/tree_clusters.rb
193
193
  - lib/tree_clusters/version.rb
194
+ - test_files/bad.aln
194
195
  - test_files/non_bifurcating.aln
195
196
  - test_files/non_bifurcating.tre
196
197
  - test_files/small.aln
197
198
  - test_files/small.mapping
198
199
  - test_files/small.tre
200
+ - test_files/small_aln_bad_ids
201
+ - test_files/small_mapping_bad_ids
202
+ - test_files/small_tree_bad_ids
199
203
  - test_files/test.tre
200
204
  - tree_clusters.gemspec
201
205
  homepage: https://github.com/mooreryan/tree_clusters