tree_clusters 0.4.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ea8e2199383e573da42d0a6d265b446de07bea4c
4
- data.tar.gz: 775d0ace4a7398438ed4c4a34504ae9d7128d7c1
3
+ metadata.gz: 6fbd972a3068d716241676c936e3867083ad419a
4
+ data.tar.gz: 973608883e2ca684541f905cbf6644443e217945
5
5
  SHA512:
6
- metadata.gz: db7302cad104d6ae05e10a4f02b1880016ba9a960cb44209d283cf9c5d730dc39ebd26a6cf64d40bd0e516d6253692112615bde41dc0e1e935ee0bc582cb36c4
7
- data.tar.gz: 8dbb10b889de0f3c7db92d2138c89bdcf5357279daadaf12c97ac2e7d9b74980e3e9219308e4bb3252c93bb92b9230d77e5dc166ebd5633f36e949a54c72ca94
6
+ metadata.gz: 5e55333d9d293b13d2c9c260962adfd0e9ff8b12b6989a4b4bc6f6a8cc67bced83f843d19e3b751b9e917117c724d6307107403a83aba06f9dfe7c8b30ca6b9b
7
+ data.tar.gz: 7740ace88a99fd9c424a6d49fb83457b4d79e0295cf08fd0664123d08314bcdaa97391bf4f529613ea3151c44567939301343868e4674f9460445a657d445cec
@@ -8,21 +8,10 @@ require "parse_fasta"
8
8
  require "shannon"
9
9
  require "fileutils"
10
10
 
11
- def get_low_ent_cols leaves, leaf2attrs, entropy_cutoff
12
- low_ent_cols = []
13
- alns = leaf2attrs.attrs leaves, :aln
14
- aln_cols = alns.transpose
15
-
16
- aln_cols.each_with_index do |aln_col, aln_col_idx|
17
- has_gaps = aln_col.any? { |aa| aa == "-" }
18
- low_entropy = Shannon::entropy(aln_col.join) <= entropy_cutoff
19
-
20
- if !has_gaps && low_entropy
21
- low_ent_cols << (aln_col_idx + 1)
22
- end
23
- end
11
+ TreeClusters.extend TreeClusters
24
12
 
25
- Set.new low_ent_cols
13
+ def puts_info outf, clade_id, key_cols
14
+ outf.puts [clade_id, key_cols.count, key_cols.to_a].join "\t"
26
15
  end
27
16
 
28
17
  opts = Trollop.options do
@@ -62,44 +51,61 @@ opts = Trollop.options do
62
51
  default: "snazzy_clades")
63
52
  end
64
53
 
65
- FileUtils.mkdir_p opts[:outdir]
54
+ abort_if opts[:tree].nil?,
55
+ "--tree is a required arg"
56
+ abort_if opts[:mapping].nil?,
57
+ "--mapping is a required arg"
58
+ abort_if opts[:aln].nil?,
59
+ "--aln is a required arg"
66
60
 
67
- TreeClusters.extend TreeClusters
68
-
69
- tree = NewickTree.fromFile opts[:tree]
70
- metadata = TreeClusters.read_mapping_file opts[:mapping]
71
- snazzy_clades = TreeClusters.snazzy_clades tree, metadata
61
+ abort_unless_file_exists opts[:tree]
62
+ abort_unless_file_exists opts[:mapping]
63
+ abort_unless_file_exists opts[:aln]
72
64
 
73
- aln_len = nil
74
- leaf2attrs = TreeClusters::Attrs.new
75
- ParseFasta::SeqFile.open(opts[:aln]).each_record do |rec|
76
- leaf2attrs[rec.id] = { aln: rec.seq.chars }
65
+ TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:aln]
77
66
 
78
- aln_len ||= rec.seq.length
67
+ abort_unless opts[:entropy_cutoff] >= 0,
68
+ "--entropy-cutoff must be >= 0"
69
+ abort_unless opts[:clade_size_cutoff] >= 1,
70
+ "--clade-size-cutoff must be >= 1"
79
71
 
80
- abort_unless aln_len == rec.seq.length,
81
- "Aln len mismatch for #{rec.id}"
82
- end
72
+ FileUtils.mkdir_p opts[:outdir]
83
73
 
74
+ tree = NewickTree.fromFile opts[:tree]
75
+ metadata = TreeClusters.read_mapping_file opts[:mapping]
76
+ snazzy_clades = TreeClusters.snazzy_clades tree, metadata
77
+ leaf2attrs = TreeClusters.read_alignment opts[:aln]
78
+
79
+ clades_fname =
80
+ File.join opts[:outdir],
81
+ "#{opts[:base]}.snazzy_clades.txt"
82
+ members_fname =
83
+ File.join opts[:outdir],
84
+ "#{opts[:base]}.snazzy_clades_clade_members.txt"
85
+ all_key_cols_fname =
86
+ File.join opts[:outdir],
87
+ "#{opts[:base]}.snazzy_clades_key_cols.txt"
88
+ key_cols_fname =
89
+ File.join opts[:outdir],
90
+ "#{opts[:base]}.snazzy_clades_key_cols.txt"
91
+ key_cols_minus_parent_cols_fname =
92
+ File.join opts[:outdir],
93
+ "#{opts[:base]}.snazzy_clades_key_cols_minus_parent_cols.txt"
94
+ key_cols_minus_sibling_cols_fname =
95
+ File.join opts[:outdir],
96
+ "#{opts[:base]}.snazzy_clades_key_cols_minus_sibling_cols.txt"
97
+
98
+ info_f =
99
+ File.open(clades_fname, "w")
100
+ clade_members_f =
101
+ File.open(members_fname, "w")
102
+ key_cols_f =
103
+ File.open(key_cols_fname, "w")
104
+ key_cols_minus_parent_cols_f =
105
+ File.open(key_cols_minus_parent_cols_fname, "w")
106
+ key_cols_minus_sibling_cols_f =
107
+ File.open(key_cols_minus_sibling_cols_fname, "w")
84
108
 
85
- clades_fname = File.join opts[:outdir],
86
- "#{opts[:base]}.snazzy_clades.txt"
87
- members_fname = File.join opts[:outdir],
88
- "#{opts[:base]}.snazzy_clades_clade_members.txt"
89
- all_key_cols_fname = File.join opts[:outdir],
90
- "#{opts[:base]}.snazzy_clades_key_cols.txt"
91
- key_cols_fname = File.join opts[:outdir],
92
- "#{opts[:base]}.snazzy_clades_key_cols.txt"
93
- key_cols_minus_parent_cols_fname = File.join opts[:outdir],
94
- "#{opts[:base]}.snazzy_clades_key_cols_minus_parent_cols.txt"
95
- key_cols_minus_sibling_cols_fname = File.join opts[:outdir],
96
- "#{opts[:base]}.snazzy_clades_key_cols_minus_sibling_cols.txt"
97
-
98
- info_f = File.open(clades_fname, "w")
99
- clade_members_f = File.open(members_fname, "w")
100
- key_cols_f = File.open(key_cols_fname, "w")
101
- key_cols_minus_parent_cols_f = File.open(key_cols_minus_parent_cols_fname, "w")
102
- key_cols_minus_sibling_cols_f = File.open(key_cols_minus_sibling_cols_fname, "w")
103
109
 
104
110
  begin
105
111
  # info is { metadata_category => metadata_tag , ... }
@@ -115,27 +121,34 @@ begin
115
121
  clade.all_leaves].join "\t"
116
122
 
117
123
  key_cols_all_leaves =
118
- get_low_ent_cols clade.all_leaves, leaf2attrs, opts[:entropy_cutoff]
124
+ TreeClusters.low_ent_cols clade.all_leaves,
125
+ leaf2attrs,
126
+ opts[:entropy_cutoff]
119
127
  key_cols_all_sibling_leaves =
120
- get_low_ent_cols clade.all_sibling_leaves, leaf2attrs, opts[:entropy_cutoff]
128
+ TreeClusters.low_ent_cols clade.all_sibling_leaves,
129
+ leaf2attrs,
130
+ opts[:entropy_cutoff]
121
131
  key_cols_parent_leaves =
122
- get_low_ent_cols clade.parent_leaves, leaf2attrs, opts[:entropy_cutoff]
132
+ TreeClusters.low_ent_cols clade.parent_leaves,
133
+ leaf2attrs,
134
+ opts[:entropy_cutoff]
123
135
 
124
136
  key_cols_all_minus_sibling =
125
137
  key_cols_all_leaves - key_cols_all_sibling_leaves
126
138
  key_cols_all_minus_parent =
127
139
  key_cols_all_leaves - key_cols_parent_leaves
128
140
 
129
- key_cols_f.puts [clade_id,
130
- key_cols_all_leaves.count,
131
- key_cols_all_leaves.to_a].join "\t"
132
- key_cols_minus_parent_cols_f.puts [clade_id,
133
- key_cols_all_minus_parent.count,
134
- key_cols_all_minus_parent.to_a].join "\t"
135
- key_cols_minus_sibling_cols_f.puts [clade_id,
136
- key_cols_all_minus_sibling.count,
137
- key_cols_all_minus_sibling.to_a].join "\t"
141
+ puts_info key_cols_f,
142
+ clade_id,
143
+ key_cols_all_leaves
144
+
145
+ puts_info key_cols_minus_parent_cols_f,
146
+ clade_id,
147
+ key_cols_all_minus_parent
138
148
 
149
+ puts_info key_cols_minus_sibling_cols_f,
150
+ clade_id,
151
+ key_cols_all_minus_sibling
139
152
  end
140
153
  ensure
141
154
  info_f.close
data/lib/tree_clusters.rb CHANGED
@@ -1,6 +1,8 @@
1
1
  require "abort_if"
2
2
  require "Newick"
3
3
  require "set"
4
+ require "parse_fasta"
5
+ require "shannon"
4
6
  require "tree_clusters/version"
5
7
 
6
8
  include AbortIf
@@ -25,6 +27,96 @@ end
25
27
  # Top level namespace of the Gem.
26
28
  module TreeClusters
27
29
 
30
+ # Given an ary of strings, find the most common string in the ary.
31
+ #
32
+ # @param bases [Array<String>] an array of strings
33
+ #
34
+ # @return most_common_str [String] the most common string in the ary.
35
+ #
36
+ # @example Upper case and lower case count as the same.
37
+ # TreeClusters::consensus %w[a A C T] #=> "A"
38
+ # @example Ties take the one closest to the end
39
+ # TreeClusters::consensus %w[a c T t C t g] #=> "T"
40
+ #
41
+ # @note Each string is upcase'd before frequencies are calculated.
42
+ def consensus bases
43
+ bases.
44
+ map(&:upcase).
45
+ group_by(&:itself).
46
+ sort_by { |_, bases| bases.count }.
47
+ reverse.
48
+ first.
49
+ first
50
+ end
51
+
52
+ def read_alignment aln_fname
53
+ leaf2attrs = TreeClusters::Attrs.new
54
+ aln_len = nil
55
+ ParseFasta::SeqFile.open(aln_fname).each_record do |rec|
56
+ leaf2attrs[rec.id] = { aln: rec.seq.chars }
57
+
58
+ aln_len ||= rec.seq.length
59
+
60
+ abort_unless aln_len == rec.seq.length,
61
+ "Aln len mismatch for #{rec.id}"
62
+ end
63
+
64
+ leaf2attrs
65
+ end
66
+
67
+ def low_ent_cols leaves, leaf2attrs, entropy_cutoff
68
+ low_ent_cols = []
69
+ alns = leaf2attrs.attrs leaves, :aln
70
+ aln_cols = alns.transpose
71
+
72
+ aln_cols.each_with_index do |aln_col, aln_col_idx|
73
+ has_gaps = aln_col.any? { |aa| aa == "-" }
74
+ low_entropy =
75
+ Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
76
+
77
+ if !has_gaps && low_entropy
78
+ low_ent_cols << (aln_col_idx + 1)
79
+ end
80
+ end
81
+
82
+ Set.new low_ent_cols
83
+ end
84
+
85
+ def check_ids tree, mapping, aln
86
+ tree_ids = Set.new(NewickTree.fromFile(tree).taxa)
87
+
88
+ mapping_ids = Set.new
89
+ File.open(mapping, "rt").each_line.with_index do |line, idx|
90
+ unless idx.zero?
91
+ id, *rest = line.chomp.split
92
+
93
+ mapping_ids << id
94
+ end
95
+ end
96
+
97
+ aln_ids = Set.new
98
+ ParseFasta::SeqFile.open(aln).each_record do |rec|
99
+ aln_ids << rec.id
100
+ end
101
+
102
+ if !(tree_ids == mapping_ids && mapping_ids == aln_ids)
103
+ AbortIf::logger.error { "Seq IDs did not match in all input files" }
104
+
105
+ tree_ids = tree_ids.to_a.sort
106
+ mapping_ids = mapping_ids.to_a.sort
107
+ aln_ids = aln_ids.to_a.sort
108
+
109
+ AbortIf::logger.debug { ["tree_ids", tree_ids].join "\t" }
110
+ AbortIf::logger.debug { ["mapping_ids", mapping_ids].join "\t" }
111
+ AbortIf::logger.debug { ["aln_ids", aln_ids].join "\t" }
112
+
113
+ raise AbortIf::Exit
114
+ else
115
+ true
116
+ end
117
+ end
118
+
119
+
28
120
  # Given a NewickTree, return an array of all Clades in that tree.
29
121
  #
30
122
  # @param tree [NewickTree] a NewickTree object
@@ -1,3 +1,3 @@
1
1
  module TreeClusters
2
- VERSION = "0.4.0"
2
+ VERSION = "0.5.0"
3
3
  end
@@ -0,0 +1,6 @@
1
+ >s1
2
+ AAA
3
+ >s2
4
+ AAAA
5
+ >s3
6
+ AA
data/test_files/small.aln CHANGED
@@ -1,14 +1,14 @@
1
- >a-1
1
+ >a-1 apple
2
2
  AAAA
3
- >a-2
3
+ >a-2 pie
4
4
  AAAT
5
- >b-1
5
+ >b-1 is
6
6
  CCCC
7
- >b-2
8
- CCCT
9
- >bb-1
7
+ >b-2 really
8
+ cccT
9
+ >bb-1 good
10
10
  CCTG
11
- >bbb-1
12
- CCGG
13
- >bbb-2
11
+ >bbb-1 and
12
+ CCGg
13
+ >bbb-2 tasty
14
14
  CGGG
@@ -0,0 +1,4 @@
1
+ >apple
2
+ aRSOIN
3
+ >pie
4
+ ARSOIT
@@ -0,0 +1,2 @@
1
+ snazzy pie
2
+ apple thing
@@ -0,0 +1 @@
1
+ (a, b);
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tree_clusters
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan Moore
@@ -191,11 +191,15 @@ files:
191
191
  - exe/snazzy_clades_key_cols
192
192
  - lib/tree_clusters.rb
193
193
  - lib/tree_clusters/version.rb
194
+ - test_files/bad.aln
194
195
  - test_files/non_bifurcating.aln
195
196
  - test_files/non_bifurcating.tre
196
197
  - test_files/small.aln
197
198
  - test_files/small.mapping
198
199
  - test_files/small.tre
200
+ - test_files/small_aln_bad_ids
201
+ - test_files/small_mapping_bad_ids
202
+ - test_files/small_tree_bad_ids
199
203
  - test_files/test.tre
200
204
  - tree_clusters.gemspec
201
205
  homepage: https://github.com/mooreryan/tree_clusters