Dendrograms 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile ADDED
@@ -0,0 +1,13 @@
1
# Name the RubyGems index explicitly (and over HTTPS): the symbol shortcuts
# such as :gemcutter are deprecated by Bundler.
source "https://rubygems.org"

# Add dependencies required to use your gem here.
# Example:
#   gem "activesupport", ">= 2.3.5"
gem "progressbar"

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "bundler", ">= 1.0.0"
  gem "jeweler"
end
data/Gemfile.lock ADDED
@@ -0,0 +1,22 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ git (1.2.5)
5
+ jeweler (1.8.3)
6
+ bundler (~> 1.0)
7
+ git (>= 1.2.5)
8
+ rake
9
+ rdoc
10
+ json (1.6.6)
11
+ progressbar (0.10.0)
12
+ rake (0.9.2.2)
13
+ rdoc (3.12)
14
+ json (~> 1.4)
15
+
16
+ PLATFORMS
17
+ ruby
18
+
19
+ DEPENDENCIES
20
+ bundler (>= 1.0.0)
21
+ jeweler
22
+ progressbar
data/README.markdown ADDED
@@ -0,0 +1,3 @@
1
+ # Dendrograms
2
+
3
+ A pure Ruby implementation of Clauset's Hierarchical Random Graph tools. Installs two CLI applications, `fitHRG` and `consensusHRG`.
data/Rakefile ADDED
@@ -0,0 +1,29 @@
1
require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "Dendrograms"
  gem.homepage = "http://github.com/doches/Dendrograms"
  gem.license = "MIT"
  gem.summary = %Q{Ruby implementation of Clauset's Hierarchical Random Graphs}
  gem.description = %Q{Ruby implementation of Clauset's Hierarchical Random Graphs}
  gem.email = "trevor@texasexpat.net"
  gem.authors = ["Trevor Fountain"]
  gem.version = "0.0.1"
  # Dependencies are taken from the Gemfile: the generated gemspec lists
  # bundler and jeweler even though they are never named in this file.
  # Declaring progressbar here as well made it appear twice in that gemspec,
  # so it is declared in the Gemfile only.
  #  gem.add_runtime_dependency 'jabber4r', '> 0.1'
  #  gem.add_development_dependency 'rspec', '> 1.2.3'
end
Jeweler::RubygemsDotOrgTasks.new
data/bin/consensusHRG ADDED
@@ -0,0 +1,75 @@
1
#!/usr/bin/env ruby
#
# Samples dendrograms starting from a fitted .dendro file, keeps every leaf
# cluster that appears in more than half of the samples, and prints the
# resulting consensus hierarchy to stdout as a Graphviz graph.

$LOAD_PATH.unshift File.join(File.dirname(__FILE__),"..","lib")

Description = "Re-implementation, basically, of Clauset's consensusHRG tool\nTakes a .dendro file and a wordmap; outputs a consensus hierarchy"
Usage = "ruby #{$0} file.dendro file.wordmap > file.consensus.dot"
Num_Args = 2

require 'Dendrograms'
include Dendrograms
require 'progressbar'

# Options are removed from ARGV as they are read, leaving only the positionals.
verbose = check_flag("-v","--verbose")
samples = check_opt("-s","--samples","300").to_i
spread = check_opt("-S","--spread","100").to_i

# Prints the description and usage, then exits, unless exactly Num_Args remain.
usage(Description, Usage, Num_Args)

STDERR.puts "#{samples} samples with a spread of #{spread} \n -> #{samples*spread} resamples"

dendro_file = ARGV.shift
wordmap_file = ARGV.shift

# The graph is located by naming convention. Without the expected suffix the
# substitution is a no-op and the dendrogram itself would be parsed as the
# graph, so refuse to continue instead of producing nonsense.
graph_file = dendro_file.gsub("-best.dendro",".pairs")
if graph_file == dendro_file
  STDERR.puts "Cannot derive the .pairs file: #{dendro_file} does not end in -best.dendro"
  exit(1)
end

# Wordmap lines are "word index"; the index is kept as a string because
# ConsensusNode#to_dot looks words up by the leaf's to_s.
@wordmap = {}
IO.foreach(wordmap_file) do |line|
  word, index = line.strip.split(/\s+/)
  next if index.nil? # blank or malformed line
  @wordmap[index] = word
end

graph = Graph.new(graph_file)
dendrogram = Dendrogram.new(graph, dendro_file)

progress = verbose ? nil : ProgressBar.new("Sampling",samples)
clusters = Hash.new(0) # "leaf_leaf_..." key => number of samples containing that cluster
STDERR.puts ["MCMC STEPS","LIKELIHOOD","TIME"].join("\t") if verbose
start = Time.now.to_i
samples.times do
  # Let the chain move `spread` steps between recorded samples.
  spread.times { dendrogram.sample! }

  # One canonical key per distinct leaf set in this sample.
  keys = dendrogram.clusters.map { |cluster| cluster.compact.sort.join("_") }.uniq
  keys.each { |key| clusters[key] += 1 }

  STDERR.puts [dendrogram.mcmc_steps, dendrogram.likelihood,"#{Time.now.to_i-start}s"].join("\t") if verbose
  progress.inc unless verbose
end
progress.finish unless verbose

# Keep only the clusters present in a strict majority of the samples.
clusters.reject! { |k,v| v <= samples/2.0 }

# Largest clusters first, so a parent is always created before its subsets.
keep = clusters.keys.map { |key| key.split("_").map { |x| x.to_i } }.sort { |b,a| a.size <=> b.size }
# The root holds every surviving leaf; sorting makes it identical to an
# already-present full cluster so that uniq! can drop the duplicate.
keep.unshift keep.flatten.uniq.sort
keep.uniq!

hnodes = [ConsensusNode.new(keep.shift)]
keep.each do |cluster|
  # Attach each cluster beneath the smallest existing node that contains it.
  lca = hnodes.select { |x| x.contains(cluster) }.min_by { |x| x.size }
  new_node = ConsensusNode.new(cluster)
  lca.add_child(new_node)
  hnodes.push new_node
end

puts "graph {"
hnodes[0].to_dot(@wordmap)
puts "}"
data/bin/fitHRG ADDED
@@ -0,0 +1,51 @@
1
#!/usr/bin/env ruby
#
# Fits a hierarchical random graph to a .pairs file by sampling dendrograms
# indefinitely, saving the dendrogram every time a higher likelihood is seen.

$LOAD_PATH.unshift File.join(File.dirname(__FILE__),"..","lib")

Description = "Re-implementation, basically, of Clauset's fitHRG tool. Takes a .pairs file and fits a HRG over the graph\nProduces a .dendro file with the fit HRG and a .info file with information about the run; these are updated as higher-likelihood dendrograms are found.\n\nIf you pass in an optional partial dendrogram, fitHRG will continue sampling from that saved point."
Usage = "ruby #{$0} file.pairs [file.dendro]"
Num_Args = 1

require 'Dendrograms'
include Dendrograms

# Flags are removed from ARGV as they are read.
verbose = check_flag("-v","--verbose")
verbose_saved = check_flag("-s", "--saved")

if ARGV.size < Num_Args
  STDERR.puts Description
  STDERR.puts " "
  STDERR.puts "Usage: #{Usage}"
  exit(1)
end

pairs_file = ARGV.shift
# Strip the .pairs suffix when there is one; otherwise append to the whole
# name. Substituting directly would leave both output names equal to the input
# for a file without that suffix, and saving would overwrite the graph.
base_name = pairs_file.sub(/\.pairs$/,"")
dendrogram_file = "#{base_name}-best.dendro"
info_file = "#{base_name}-best.info"

graph = Graph.new(pairs_file)
# A second argument names a saved dendrogram to continue from.
dendrogram = ARGV.empty? ? Dendrogram.new(graph) : Dendrogram.new(graph, ARGV.shift)

best_likelihood = dendrogram.sample!
best_steps = 0
start = Time.now.to_i
STDERR.puts ["MCMC","LIKELIHOOD","BEST LIKEL.","AT MCMC","TIME"].join("\t") if verbose
begin
  # Runs until the user interrupts it.
  loop do
    likelihood = dendrogram.sample!
    saved = likelihood > best_likelihood
    if saved
      best_likelihood = likelihood
      dendrogram.save(dendrogram_file,info_file)
      best_steps = dendrogram.mcmc_steps
    end

    # In verbose mode report every 1000 steps, plus every save when --saved is given.
    if verbose and ((saved and verbose_saved) or dendrogram.mcmc_steps % 1000 == 0)
      STDERR.puts [dendrogram.mcmc_steps, dendrogram.likelihood, best_likelihood, best_steps, "#{Time.now.to_i-start}s"].join("\t")
    end
  end
rescue Interrupt
  # Ctrl-C is the normal way to stop; finish with a summary, not a stack trace.
  STDERR.puts "Stopped after #{dendrogram.mcmc_steps} MCMC steps; best likelihood #{best_likelihood} at step #{best_steps}"
end
@@ -0,0 +1,4 @@
1
+ require 'Dendrograms/Graph'
2
+ require 'Dendrograms/Dendrogram'
3
+ require 'Dendrograms/Consensus'
4
+ require 'Dendrograms/cli'
@@ -0,0 +1,50 @@
1
+ module Dendrograms
2
+
3
# A node in a consensus hierarchy. It is created from a list of leaf ids and
# can later have some of those leaves regrouped under child ConsensusNodes.
class ConsensusNode
  attr_reader :index

  # Counter handing out a distinct index to every node created.
  @@index = 0
  # Leaf id => sequential number used in the LEAF_n names written by to_dot.
  @@leaves = {}

  # children: Array of leaf ids. The array is copied, so later changes made by
  # the caller do not affect this node.
  def initialize(children)
    @children = children.dup # direct children: leaf ids and, later, ConsensusNodes
    @leaves = children.dup   # every leaf below this node; never changes
    @index = @@index
    @@index += 1
  end

  # Returns the number assigned to +leaf+, assigning the next free one on first use.
  def ConsensusNode.leaf(leaf)
    @@leaves[leaf] ||= @@leaves.size
  end

  # Returns every leaf id beneath this node as a flat array.
  def children
    @children.map { |x| x.is_a?(ConsensusNode) ? x.children : x }.flatten
  end

  # Makes +node+ a direct child, removing from this node's direct children the
  # leaves that now hang below +node+.
  def add_child(node)
    @children = @children - node.children
    @children.push node
  end

  # True when every element of +set+ is a leaf of this node.
  def contains(set)
    (@leaves & set).size == set.size
  end

  # Number of leaves beneath this node.
  def size
    @leaves.size
  end

  # Writes this subtree as Graphviz statements, one per line.
  #
  # wordmap: Hash from leaf id (as a string) to label, or nil. A leaf with no
  #          entry is labelled with its id, the same way
  #          DendrogramNode.linkToLeaf treats a nil wordmap.
  # out:     object receiving the lines via #puts; defaults to standard output.
  def to_dot(wordmap = nil, out = $stdout)
    out.puts "\tINTERNAL_#{@index} [shape=point, label=\"\"];"
    @children.each do |child|
      if child.is_a?(ConsensusNode)
        out.puts "\tINTERNAL_#{@index} -- INTERNAL_#{child.index};"
        child.to_dot(wordmap, out)
      else
        label = (wordmap && wordmap[child.to_s]) || child
        leaf_number = ConsensusNode.leaf(child)
        out.puts "\tLEAF_#{leaf_number} [shape=none, label=\"#{label}\"];"
        out.puts "\tINTERNAL_#{@index} -- LEAF_#{leaf_number};"
      end
    end
  end
end
49
+
50
+ end
@@ -0,0 +1,337 @@
1
+ module Dendrograms
2
+
3
# Represents a single internal node of a dendrogram. Each child (left/right) is
# either another DendrogramNode or a leaf, i.e. a graph node id. Provides
# methods for transformation, for computing likelihood and for Graphviz output.
class DendrogramNode
  attr_accessor :index
  attr_reader :left, :right

  # Counter handing out a distinct default index to every node created.
  @@index = 0
  # Leaf id => sequential number used in the LEAF_n names written by linkToLeaf.
  @@leaves = {}

  # Stand-in for a connectedness of exactly 0 or 1, so the logarithms in
  # #likelihood stay finite.
  Epsilon = 0.00000000001

  def initialize(left, right)
    @left = left
    @right = right
    @index = @@index

    @@index += 1
    @child_cache = nil
  end

  # Replaces the left child and drops this node's cached leaf list.
  # Ancestors keep their own caches; refresh them with children(true) if needed.
  def left=(node)
    @child_cache = nil
    @left = node
  end

  # Replaces the right child and drops this node's cached leaf list.
  def right=(node)
    @child_cache = nil
    @right = node
  end

  # One tab-separated line: index, left child, right child. A child is written
  # as "<index> (D)" for a DendrogramNode and "<id> (G)" for a leaf.
  def to_s
    [@index,
     (@left.is_a?(DendrogramNode) ? "#{@left.index} (D)" : "#{@left} (G)"),
     (@right.is_a?(DendrogramNode) ? "#{@right.index} (D)" : "#{@right} (G)")].join("\t")
  end

  # Forgets which leaves have already been declared by linkToLeaf.
  def DendrogramNode.resetLeaves
    @@leaves = {}
  end

  # Label for +leaf+: its wordmap entry (looked up by the leaf's to_s) when
  # there is one, otherwise the leaf id itself.
  def DendrogramNode.leaf_label(leaf, wordmap)
    (wordmap && wordmap[leaf.to_s]) || leaf
  end

  # Returns the Graphviz statements linking +node+ (a node name) to +leaf+.
  # The leaf is declared the first time it is seen after resetLeaves.
  def DendrogramNode.linkToLeaf(node, leaf, wordmap)
    dot = []
    if @@leaves[leaf].nil?
      @@leaves[leaf] = @@leaves.size
      dot.push "LEAF_#{@@leaves[leaf]} [shape=none, label=\"#{DendrogramNode.leaf_label(leaf, wordmap)}\"];"
    end
    dot.push "#{node} -- LEAF_#{@@leaves[leaf]};"
    return dot
  end

  # Returns this node's declaration and the edges to its two children as one
  # tab-indented string. Edges use the same INTERNAL<index> name that dot_node
  # declares, so the declared attributes apply to the node that is drawn.
  def to_dot(graph, wordmap=nil, likelihood=false)
    name = "INTERNAL#{@index}"
    dot = dot_node(graph, wordmap, likelihood)

    [@left, @right].each do |child|
      if child.is_a?(DendrogramNode)
        dot.push "#{name} -- INTERNAL#{child.index};"
      else
        dot.concat DendrogramNode.linkToLeaf(name, child, wordmap)
      end
    end

    return "\t#{dot.join("\n\t")}"
  end

  # Returns a one-element array holding this node's Graphviz declaration.
  #
  # likelihood: false for a plain point; otherwise a threshold. The node is then
  #             labelled with its connectedness truncated to two decimals, with
  #             a blue font when that value is above the threshold and red otherwise.
  # decorate:   when false, always emit the plain point.
  # wordmap is accepted for signature compatibility and is not used.
  def dot_node(graph, wordmap=nil, likelihood=false, decorate=true)
    return ["INTERNAL#{@index} [shape=point, label=\"\"];"] unless decorate

    label = "\"\""
    shape = "point"
    color = "black"
    if likelihood != false
      theta = (connectedness(graph)[0]*100).to_i/100.0
      shape = "none"
      label = "\"#{theta}\""
      color = theta > likelihood ? "blue" : "red"
    end
    return ["INTERNAL#{@index} [shape=#{shape},label=#{label},fontcolor=#{color},color=red];"]
  end

  # Returns the Graphviz statements for the subtree rooted here as a flat array.
  # A node whose connectedness is below +likelihood+ is expanded into its two
  # children; any other node is collapsed, with all of its leaves attached
  # directly to it.
  #
  # likelihood: threshold, or false to expand every node.
  # wordmap:    Hash from leaf id (as a string) to label, or nil to label
  #             leaves with their ids.
  def hierarchy_dot(graph, wordmap, likelihood)
    name = "INTERNAL#{@index}"
    dot = dot_node(graph, wordmap, likelihood, false)

    if likelihood == false or connectedness(graph)[0] < likelihood
      [@left, @right].each do |child|
        if child.is_a?(DendrogramNode)
          dot.push "#{name} -- INTERNAL#{child.index};"
          dot.push child.hierarchy_dot(graph, wordmap, likelihood)
        else
          dot.push "#{name} -- LEAF#{child};"
          dot.push "LEAF#{child} [shape=none, label=\"#{DendrogramNode.leaf_label(child, wordmap)}\"];"
        end
      end
    else
      children.each do |leaf|
        dot.push "#{name} -- LEAF#{leaf};"
        dot.push "LEAF#{leaf} [shape=none, label=\"#{DendrogramNode.leaf_label(leaf, wordmap)}\"];"
      end
    end

    dot.flatten
  end

  # Returns the leaves beneath this node. The list is cached; pass true to
  # rebuild it from the children's (possibly cached) lists.
  def children(force = false)
    if force or @child_cache.nil?
      @child_cache = [@left.is_a?(DendrogramNode) ? @left.children() : @left,
                      @right.is_a?(DendrogramNode) ? @right.children() : @right].flatten
    end

    return @child_cache
  end

  # Returns [theta, max_links]: max_links is the number of (left leaf, right
  # leaf) pairs and theta the fraction of those pairs that graph.edges_between
  # reports as connected.
  def connectedness(graph)
    left_children = @left.is_a?(DendrogramNode) ? @left.children : [@left]
    right_children = @right.is_a?(DendrogramNode) ? @right.children : [@right]

    links = graph.edges_between(left_children, right_children).to_f
    max_links = (left_children.size * right_children.size)
    theta = links / max_links.to_f

    return [theta, max_links]
  end

  # Returns max_links * (theta*ln(theta) + (1-theta)*ln(1-theta)), with theta
  # moved off 0 and 1 by Epsilon. The value is never positive.
  def likelihood(graph)
    theta, max_links = *connectedness(graph)
    theta = Epsilon if theta <= 0.0
    theta = 1.0-Epsilon if theta >= 1.0
    h = -theta*Math.log(theta) - (1-theta)*Math.log(1-theta)
    return -h * max_links
  end

  # True when at least one child is itself a DendrogramNode.
  def mutable?
    @left.is_a?(DendrogramNode) or @right.is_a?(DendrogramNode)
  end

  # Picks a random rearrangement and returns it as a Hash:
  #   :child       - the internal child that will exchange a subtree with this node
  #   :do_left     - whether that child gives up its left (true) or right subtree
  #   :local_child - this node's other child, which moves down into :child
  # When both children are internal, either may be picked. Only meaningful on
  # a node for which mutable? is true.
  def get_mutation
    # Are we swapping children with the left or the right child?
    internal = [@left, @right].select { |c| c.is_a?(DendrogramNode) }
    child = internal.empty? ? @right : internal[rand(internal.size)]

    # Are we swapping the child's left or right child?
    do_left = rand > 0.5

    return {:child => child, :do_left => do_left, :local_child => (child == @left ? @right : @left)}
  end

  # Applies +mutation+ (a fresh get_mutation when none is given): the chosen
  # subtree of :child changes places with :local_child. :local_child is then
  # updated to the subtree that moved up, so passing the returned Hash to
  # mutate! again undoes the change.
  def mutate!(mutation = nil)
    mutation ||= get_mutation
    child = mutation[:child]
    moving_down = mutation[:local_child]

    if mutation[:do_left]
      moving_up = child.left
      child.left = moving_down
    else
      moving_up = child.right
      child.right = moving_down
    end

    # The subtree taken from the child fills the slot that was just vacated.
    if moving_down == @left
      @left = moving_up
    else
      @right = moving_up
    end
    mutation[:local_child] = moving_up

    # Refresh the child's leaf list first: this node's list is built from it.
    child.children(true)
    children(true)
    return mutation
  end
end
180
+
181
# save writes the run information with to_yaml and the constructor reads it back.
require 'yaml'

# Takes a Graph, builds a dendrogram, and provides methods to sample, compute likelihood, and save (with optional info)
class Dendrogram
  attr_reader :graph, :likelihood, :mcmc_steps, :root

  # graph:     object providing #nodes (leaf ids) and whatever DendrogramNode
  #            needs to compute connectedness.
  # tree_file: optional path to a file written by #save. When given, the tree is
  #            read from it and the step counter from the matching .info file;
  #            otherwise a random tree is built over graph.nodes.
  def initialize(graph, tree_file=nil)
    @graph = graph
    @nodes = []
    @likelihoods = []
    @likelihood = 0
    @mcmc_steps = 0

    if tree_file
      load_tree(tree_file)
      load_mcmc_status(tree_file)
    else
      build_random_tree
    end

    # Initialise likelihoods; entry i belongs to the node whose index is i
    @likelihoods = @nodes.map { |node| node.likelihood(@graph) }
    # Compute starting likelihood
    @likelihood = @likelihoods.inject(0) { |sum, x| sum + x }
  end

  # Returns the mean of exp(node likelihood) over all nodes, and also prints it to STDERR
  def mean_likelihood
    mean = @likelihoods.map { |x| Math.exp(x) }.inject(0) { |s,x| s + x } / @likelihoods.size.to_f
    STDERR.puts "Mean likelihood: #{mean}"
    return mean
  end

  # Returns the mean node connectedness
  def mean_theta
    @nodes.map { |x| x.connectedness(@graph)[0] }.inject(0) { |s,x| s + x } / @nodes.size.to_f
  end

  # Returns the median node connectedness (the upper middle value when the
  # number of nodes is even; nil when there are no nodes)
  def median_theta
    thetas = @nodes.map { |x| x.connectedness(@graph)[0] }.sort
    return thetas[thetas.size/2]
  end

  # Performs one sampling step: rearranges a randomly chosen mutable node and
  # keeps the change when the likelihood improves or when log(rand) is below
  # the change in likelihood; otherwise undoes it. Returns the likelihood
  # after the step. With no mutable node the step is counted and nothing changes.
  def sample!
    @mcmc_steps += 1
    candidates = @nodes.select { |node| node.mutable? }
    return @likelihood if candidates.empty?

    # Mutate tree
    target = candidates[rand(candidates.size)]
    mutation = target.mutate!
    # Only the node and the child it exchanged subtrees with have a new split.
    touched = [target, mutation[:child]]

    old_likelihood = @likelihood
    update_likelihood(touched)

    if not (@likelihood > old_likelihood or Math.log(rand) < @likelihood - old_likelihood)
      target.mutate!(mutation)
      update_likelihood(touched)
    end

    return @likelihood
  end

  # Returns one leaf list per internal node
  def clusters
    @nodes.map { |node| node.children }
  end

  # Recomputes the likelihood of each given DendrogramNode (leaves are ignored)
  # and adjusts the running total accordingly
  def update_likelihood(nodes)
    nodes.each do |node|
      if node.is_a?(DendrogramNode)
        @likelihood -= @likelihoods[node.index]
        @likelihoods[node.index] = node.likelihood(@graph)
        @likelihood += @likelihoods[node.index]
      end
    end
  end

  # Writes the tree (one node per line) to tree_file, the likelihood and step
  # count as YAML to info_file, and a Graphviz rendering next to tree_file with
  # a .dot extension.
  def save(tree_file, info_file)
    File.open(tree_file,'w') { |fout| fout.puts @nodes.map { |node| node.to_s }.join("\n") }
    File.open(info_file, 'w') { |fout| fout.puts({:likelihood => @likelihood, :mcmc => @mcmc_steps}.to_yaml) }

    extension = File.extname(tree_file)
    stem = extension.empty? ? tree_file : tree_file[0...-extension.size]
    dot_file = "#{stem}.dot"
    # Never let the rendering replace the tree that was just written.
    dot_file = "#{tree_file}.dot" if dot_file == tree_file
    to_dot(dot_file)
  end

  # Returns the whole dendrogram as Graphviz source; see DendrogramNode#to_dot
  def get_dot(wordmap=nil, likelihood=false)
    DendrogramNode.resetLeaves
    ["graph {",@nodes.map { |node| node.to_dot(@graph,wordmap,likelihood) }.join("\n"),"}"].join("\n")
  end

  # Returns the hierarchy below the root as Graphviz source; see DendrogramNode#hierarchy_dot
  def get_hierarchy_dot(wordmap=nil, likelihood=false)
    DendrogramNode.resetLeaves
    ["graph {", @root.hierarchy_dot(@graph, wordmap, likelihood).map { |x| "\t#{x}" }.join("\n"), "}"].join("\n")
  end

  # Writes get_dot to dot_file
  def to_dot(dot_file, wordmap=nil, likelihood=false)
    File.open(dot_file,'w') { |fout| fout.puts get_dot(wordmap,likelihood) }
  end

  private

  # Reads nodes in the format written by DendrogramNode#to_s. Lines that do not
  # match are skipped.
  def load_tree(tree_file)
    by_file_index = {}
    pending = [] # [node, left index or nil, right index or nil]; nil marks a leaf
    IO.foreach(tree_file) do |line|
      next unless line =~ /^(\d+)\t(\d+) \(([DG])\)\t(\d+) \(([DG])\)/
      index, left, ltype, right, rtype = $1.to_i, $2.to_i, $3, $4.to_i, $5

      node = DendrogramNode.new(left, right)
      # Nodes are renumbered by position, which is how @likelihoods is addressed.
      node.index = @nodes.size
      @nodes.push node
      by_file_index[index] = node
      pending.push [node, (ltype == "D" ? left : nil), (rtype == "D" ? right : nil)]
    end
    # Children may be listed after their parents, so link once everything is read
    pending.each do |node, left_index, right_index|
      node.left = by_file_index[left_index] if left_index
      node.right = by_file_index[right_index] if right_index
    end
    # The root is the node with the most leaves beneath it
    @root = @nodes.max_by { |node| node.children.size }
  end

  # Restores the step counter from the .info file next to tree_file, if any.
  # Best effort: any failure is reported and otherwise ignored.
  def load_mcmc_status(tree_file)
    info_file = tree_file.gsub(".dendro",".info")
    return unless File.exist?(info_file)

    # #save writes symbol keys, which the safe loader behind YAML.load_file
    # rejects on current Psych versions.
    # NOTE(review): the unrestricted loader is only appropriate because this
    # file is our own output - do not point it at untrusted files.
    status = YAML.respond_to?(:unsafe_load_file) ? YAML.unsafe_load_file(info_file) : YAML.load_file(info_file)
    steps = status.is_a?(Hash) ? status[:mcmc] : nil
    @mcmc_steps = steps if steps.is_a?(Integer)
  rescue StandardError
    STDERR.puts "Unable to load MCMC status from .info; carrying on."
  end

  # Builds a random tree by repeatedly joining two randomly chosen remaining
  # items (leaves or earlier joins) until one is left.
  def build_random_tree
    remaining = @graph.nodes.dup.sort_by { rand }

    while remaining.size > 1
      a = remaining.pop
      b = remaining.shift

      node = DendrogramNode.new(a,b)
      # DendrogramNode numbers itself from a process-wide counter; @likelihoods
      # is addressed by position, so renumber (matters for any dendrogram built
      # after the first one in a process).
      node.index = @nodes.size
      @nodes.push node
      remaining.push(node)
      remaining = remaining.sort_by { rand }
    end

    # Hold on to the last remaining node; it's the root
    @root = remaining.shift
  end
end
336
+
337
+ end
@@ -0,0 +1,40 @@
1
+ module Dendrograms
2
+
3
# Loads a graph from a .pairs file, and computes the number of edges between sets of nodes
class Graph
  # Node ids, in order of first appearance in the file.
  attr_reader :nodes

  # Load a .pairs file: one edge per line, given as two whitespace-separated
  # node ids. Ids are converted with to_i and edges are undirected. Lines with
  # fewer than two fields (blank lines, for instance) are ignored, so they no
  # longer add a nil node.
  def initialize(pairs)
    seen = {}
    @neighbours = {} # node id => { adjacent node id => true }

    IO.foreach(pairs) do |line|
      fields = line.strip.split(/\s+/)
      next if fields.size < 2

      from = fields[0].to_i
      to = fields[1].to_i
      seen[from] = true
      seen[to] = true
      (@neighbours[from] ||= {})[to] = true
      (@neighbours[to] ||= {})[from] = true
    end
    @nodes = seen.keys
  end

  # Compute the number of edges between two sets of nodes: the number of pairs
  # (a, b), a from set_a and b from set_b, that are joined by an edge
  def edges_between(set_a, set_b)
    count = 0
    set_a.each do |a|
      adjacent = @neighbours[a]
      next if adjacent.nil?
      set_b.each do |b|
        count += 1 if adjacent[b]
      end
    end
    return count
  end

  # Return a unique key for an edge between A and B.
  # No longer used for lookups inside this class; kept for existing callers.
  def edge_key(a,b)
    "#{a}_#{b}".to_sym
  end
end
39
+
40
+ end
@@ -0,0 +1,35 @@
1
# Command-line helpers shared by the executables. All of them work on ARGV
# directly and remove what they consume, so that only positional arguments
# remain afterwards.
module Dendrograms

  # Returns true when either spelling of a flag is present in ARGV, and
  # removes every occurrence of both spellings.
  def check_flag(short,long)
    spellings = [short, long]
    present = ARGV.any? { |arg| spellings.include?(arg) }
    ARGV.reject! { |arg| spellings.include?(arg) }
    return present
  end

  # Returns the value following the first of +short+ / +long+ found in ARGV
  # (short is looked for first) and removes the option and its value.
  # Returns +default+ when the option is absent, or when it is the last
  # argument and therefore has no value.
  def check_opt(short,long,default)
    [short, long].each do |opt|
      index = ARGV.index(opt)
      next if index.nil?

      value = ARGV[index+1]
      ARGV.delete_at(index)                   # the option itself
      ARGV.delete_at(index) unless value.nil? # its value, now at the same position
      return value.nil? ? default : value
    end
    return default
  end

  # Prints +description+ and +usage+ to STDERR and exits with status 1 unless
  # ARGV holds exactly +args+ entries.
  def usage(description, usage, args)
    if ARGV.size != args
      STDERR.puts description
      STDERR.puts " "
      STDERR.puts "Usage: #{usage}"
      exit(1)
    end
  end

end
@@ -0,0 +1,79 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Apply a wordmap to all files with a given prefix
4
+ #
5
+ # Usage: apply_wordmap path/to/wordmap prefix/files
6
+
7
+ require 'progressbar'
8
+
9
# Take the optional warning flag out first so that it is neither counted nor
# mistaken for a positional argument. `||` is required here: with `or` the
# assignment binds first and --warn would be ignored.
@debug = ARGV.include?("-w") || ARGV.include?("--warn")
ARGV.reject! { |x| x == "-w" || x == "--warn" }

if ARGV.size < 2
  STDERR.puts "Apply a wordmap to all files with a given prefix"
  STDERR.puts ""
  STDERR.puts "Usage: #{$0} path/to/wordmap prefix/files [-w|--warn]"
  # Missing arguments are an error, as in the other scripts of this project.
  exit(1)
end

wordmap_path = ARGV.shift
prefix = ARGV.shift
21
+
22
# Returns the contents of +file+ with quoted numeric labels replaced by their
# words from @wordmap (Integer index => word), or nil when the file's extension
# is not handled. Only .dot files are handled at present.
def process(file)
  ext = file.split(".").pop

  txt = IO.readlines(file).join("").strip
  case ext
  when "dot"
    STDERR.puts "#{file} (DOT)"
    # One pass over the text: every "<digits>" is looked up once, so a word
    # that itself looks like an index is not replaced a second time.
    return txt.gsub(/"(\d+)"/) do |quoted|
      digits = quoted[1...-1]
      # Only the canonical spelling of an index counts ("5", not "05").
      word = digits == digits.to_i.to_s ? @wordmap[digits.to_i] : nil
      word.nil? ? quoted : "\"#{word}\""
    end
  # Handlers for other formats, disabled in the original script:
  #
  # when "matrix"
  #   STDERR.puts "#{file} (MATRIX)"
  #   progress = ProgressBar.new("Applying",@wordmap.size)
  #   @wordmap.each_pair { |index,word| txt.gsub!(/^#{index}\:/,"#{word}:") if not word.nil?; progress.inc }
  #   progress.finish
  #   return txt
  # when "txt"
  #   STDERR.puts "#{file} (TXT)"
  #   progress = ProgressBar.new("Applying",@wordmap.size)
  #   @wordmap.each_pair { |index,word| txt.gsub!("<#{index}>",word) if not word.nil?; progress.inc }
  #   progress.finish
  #   return txt
  # when "graph"
  #   STDERR.puts "#{file} (GRAPH)"
  #   progress = ProgressBar.new("Applying",@wordmap.size*2)
  #   @wordmap.each_pair { |index,word| txt.gsub!(/^#{index}\t/,"#{word}\t") if not word.nil?; progress.inc }
  #   @wordmap.each_pair { |index,word| txt.gsub!(/\t#{index}\t/,"\t#{word}\t") if not word.nil?; progress.inc }
  #   progress.finish
  #   return txt
  else
    STDERR.puts "Unrecognised file extension \"#{ext}\"" if @debug
    return nil
  end
end
59
+
60
# Read the wordmap: "word index" per line. The first word seen for an index wins.
@wordmap = {}
IO.foreach(wordmap_path) do |line|
  word, index = line.strip.split(/\s+/)
  next if index.nil? # blank or one-column line
  index = index.to_i
  @wordmap[index] = word if @wordmap[index].nil?
end

# Write a ".human" sibling (name.human.ext) for every matching file that
# process can translate and that does not have one yet.
Dir.glob("#{prefix}*").each do |file|
  # The glob also matches directories, which cannot be read as files.
  next unless File.file?(file)

  parts = file.split(".").reject { |x| x == "human" }
  ext = parts.pop
  new_filename = parts.push("human").push(ext).join(".")
  next if File.exist?(new_filename)

  txt = process(file)
  next if txt.nil?

  File.open(new_filename,'w') { |fout| fout.puts txt }
end
@@ -0,0 +1,37 @@
1
# The gem ships its library as lib/Dendrograms.rb plus lib/Dendrograms/*.rb;
# load it the same way the executables in bin/ do.
$LOAD_PATH.unshift File.join(File.dirname(__FILE__),"..","lib")
require 'Dendrograms'
include Dendrograms

usage(
  "Takes a .dendro and outputs a clusterval-friendly yaml flat clustering",
  "ruby #{$0} file.dendro > file.yaml",
  1
)

# The graph and wordmap are taken from the dendrogram's directory: the first
# *.pairs and the first *.wordmap found there.
dendro_f = ARGV.shift
graph_f = Dir.glob(File.join(File.dirname(dendro_f),"*.pairs"))[0]
wordmap_f = Dir.glob(File.join(File.dirname(dendro_f),"*.wordmap"))[0]
if graph_f.nil? or wordmap_f.nil?
  STDERR.puts "Expected a .pairs and a .wordmap file next to #{dendro_f}"
  exit(1)
end

@graph = Graph.new(graph_f)
@dendrogram = Dendrogram.new(@graph, dendro_f)
# Wordmap lines are "word index"; keyed by the index as a string.
@wordmap = {}
IO.foreach(wordmap_f) do |line|
  word,index = *(line.strip.split(/\s+/))
  @wordmap[index] = word
end

# Subtrees more connected than the average node become one flat cluster.
@min = @dendrogram.mean_theta

@clusters = {}
26
# Walks the dendrogram from +node+ and records flat clusters in @clusters,
# keyed "0", "1", ... in order of discovery:
#   * a leaf (an Integer) becomes a cluster of its own;
#   * an internal node whose connectedness exceeds @min becomes one cluster
#     holding all of its leaves;
#   * any other internal node is split and both children are visited.
# Cluster members are the @wordmap entries for the leaf ids.
def identify_subtree_clusters(node)
  # Integer rather than Fixnum: Fixnum no longer exists in current Ruby.
  return @clusters[@clusters.size.to_s] = [@wordmap[node.to_s]] if node.is_a?(Integer)

  theta = node.connectedness(@graph)[0]
  if theta > @min
    @clusters[@clusters.size.to_s] = node.children.map { |x| @wordmap[x.to_s] }
  else
    return [identify_subtree_clusters(node.left), identify_subtree_clusters(node.right)]
  end
end
35
+
36
# Make sure to_yaml is available instead of relying on something else having
# loaded the YAML library already.
require 'yaml'

identify_subtree_clusters(@dendrogram.root)
puts @clusters.to_yaml
@@ -0,0 +1,28 @@
1
Description = "Takes a .dendro and outputs a .dot file"
Usage = "ruby #{$0} file.dendro > file.dot"
Num_Args = 1

if ARGV.size != Num_Args
  STDERR.puts Description
  STDERR.puts " "
  STDERR.puts "Usage: #{Usage}"
  exit(1)
end

# The gem ships its library as lib/Dendrograms.rb plus lib/Dendrograms/*.rb;
# load it the same way the executables in bin/ do.
$LOAD_PATH.unshift File.join(File.dirname(__FILE__),"..","lib")
require 'Dendrograms'
include Dendrograms

# The graph and wordmap are taken from the dendrogram's directory: the first
# *.pairs and the first *.wordmap found there.
dendro_f = ARGV.shift
graph_f = Dir.glob(File.join(File.dirname(dendro_f),"*.pairs"))[0]
wordmap_f = Dir.glob(File.join(File.dirname(dendro_f),"*.wordmap"))[0]
if graph_f.nil? or wordmap_f.nil?
  STDERR.puts "Expected a .pairs and a .wordmap file next to #{dendro_f}"
  exit(1)
end

@graph = Graph.new(graph_f)
@dendrogram = Dendrogram.new(@graph, dendro_f)
# Wordmap lines are "word index"; keyed by the index as a string.
@wordmap = {}
IO.foreach(wordmap_f) do |line|
  word,index = *(line.strip.split(/\s+/))
  @wordmap[index] = word
end

# Nodes are labelled with their connectedness, coloured relative to the mean.
puts @dendrogram.get_dot(@wordmap, @dendrogram.mean_theta)
@@ -0,0 +1,32 @@
1
Description = "Takes a .hrg and a -names.lut and outputs a .dendro"
Usage = "ruby #{$0} file.hrg file.lut > file.dendro"
Num_Args = 2

if ARGV.size != Num_Args
  STDERR.puts Description
  STDERR.puts " "
  STDERR.puts "Usage: #{Usage}"
  exit(1)
end

hrg_f = ARGV.shift
lut_f = ARGV.shift

# Lookup table from virtual id to real id. The first line of the file is
# skipped, as before (NOTE(review): presumably a column header - confirm).
names = {}
first_line = true
IO.foreach(lut_f) do |line|
  if first_line
    first_line = false
    next
  end
  virtual, real = *(line.strip.split(/\s+/))
  names[virtual] = real unless virtual.nil?
end

# Leaf ids (type G) are translated through the lookup table; internal node
# ids (type D) are copied unchanged. An id missing from the table is kept as
# it is and reported, rather than printed as an empty field.
translate = lambda do |id|
  STDERR.puts "No name for leaf #{id} in #{lut_f}; keeping the id" unless names.key?(id)
  names.fetch(id, id)
end

#[ 0 ] L= 46 (D) R= 29 (D) p= 0 e= 0 n= 49
IO.foreach(hrg_f) do |line|
  if line =~ /\[ (\d+) \] L= (\d+) \(([DG])\) R= (\d+) \(([DG])\)/
    index, left, left_type, right, right_type = $1, $2, $3, $4, $5
    left = translate.call(left) if left_type == "G"
    right = translate.call(right) if right_type == "G"
    puts "#{index}\t#{left} (#{left_type})\t#{right} (#{right_type})"
  end
end
@@ -0,0 +1,64 @@
1
+ # Takes a .hrg (and an optional wordmap) and outputs a .dot
2
+ #
3
+ # Usage: ruby #{$0} file.hrg [file.wordmap] > file.dot
4
+
5
if ARGV.empty?
  STDERR.puts "Takes a .hrg (and an optional wordmap) and outputs a .dot"
  STDERR.puts " "
  STDERR.puts "Usage: ruby #{$0} file.hrg [file.wordmap] > file.dot"
  exit(1)
end

hrg = ARGV.shift
wordmap_f = ARGV.shift # nil when no wordmap was given

# The names table is located by naming convention. Without the expected
# suffix the substitution is a no-op and the .hrg itself would be read as the
# table, so stop instead.
names_f = hrg.gsub("_best-dendro.hrg","-names.lut")
if names_f == hrg
  STDERR.puts "Cannot derive the -names.lut file: #{hrg} does not end in _best-dendro.hrg"
  exit(1)
end

# Virtual id => real id; lines mentioning "virtual" (the header) are skipped.
@map_names = true
@names = {}
IO.foreach(names_f) do |line|
  if not line =~ /virtual/
    virtual, real = *(line.strip.split(/\s+/))
    @names[virtual] = real
  end
end

# Optional wordmap: "word index" per line, keyed by the index as a string.
@wordmap = nil
if not wordmap_f.nil?
  @wordmap = {}
  IO.foreach(wordmap_f) do |line|
    word, index = *(line.strip.split(/\s+/))
    @wordmap[index] = word
  end
end
25
+
26
# Internal node id from the .hrg => sequential number used in the output.
@nodes = {}

# Returns the Graphviz name for internal node +index+. The first request for
# an index also prints that node's declaration to standard output.
def internal(index)
  number = @nodes[index]
  if number.nil?
    number = @nodes.size
    @nodes[index] = number
    puts "\tINTERNAL#{number} [shape=point,label=\"\"];"
  end

  "INTERNAL#{number}"
end
35
+
36
# Leaf id => sequential number used in the output.
@leaves = {}

# Returns the Graphviz name for leaf +index+. The first request for a leaf
# also prints its declaration to standard output.
#
# When @map_names is set the id is first translated through @names; the label
# is then looked up in @wordmap when one was loaded. An id missing from either
# table is used as it is, so unknown leaves stay distinct and keep a visible
# label instead of all collapsing into a single unlabelled node.
def leaf(index)
  index = @names.fetch(index, index) if @map_names
  if @leaves[index].nil?
    @leaves[index] = @leaves.size
    label = (@wordmap && @wordmap[index]) || index
    puts "\tLEAF#{@leaves[index]} [shape=none,label=\"#{label}\"];"
  end

  "LEAF#{@leaves[index]}"
end
48
+
49
# Returns the Graphviz name for a child reference from the .hrg file:
# type "D" refers to an internal node, anything else to a leaf.
def node(type,index)
  type == "D" ? internal(index) : leaf(index)
end
56
+
57
# Emit the graph: for every node line of the .hrg, declare the node and its
# two children as needed, then print the two edges.
puts "graph {"
node_line = /\[ (\d+) \] L= (\d+) \((D|G)\) R= (\d+) \((D|G)\)/
IO.foreach(hrg) do |line|
  fields = node_line.match(line)
  next if fields.nil?

  # Resolve all three names first: resolving prints any pending declaration,
  # and those have to come before the edges.
  parent = node("D", fields[1])
  left_name = node(fields[3], fields[2])
  right_name = node(fields[5], fields[4])
  puts "\t#{parent} -- #{left_name};"
  puts "\t#{parent} -- #{right_name};"
end
puts "}"
data/scripts/render.sh ADDED
@@ -0,0 +1,6 @@
1
#!/bin/bash
#
# Applies the wordmap in a directory to its dendrogram .dot file, renders the
# result to render.png and opens it.
#
# Usage: scripts/render.sh path/to/dir   (run from the project root)

if [ -z "$1" ]; then
  echo "Usage: $0 path/to/dir" >&2
  exit 1
fi

# Drop one trailing slash so the paths below are built the same way however
# the directory was given.
dir="${1%/}"

# -f: having no previous render to remove is not an error.
rm -f "$dir"/*-best.human.dot
# apply_wordmap globs "<prefix>*", so the prefix has to end in a slash to
# select the files inside the directory rather than the directory itself.
ruby scripts/apply_wordmap.rb "$dir"/*.wordmap "$dir/"
dot -Tpng "$dir"/*-best.human.dot -o render.png
open render.png
+ open render.png
metadata ADDED
@@ -0,0 +1,139 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: Dendrograms
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Trevor Fountain
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2012-04-03 00:00:00 Z
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ type: :runtime
22
+ requirement: &id001 !ruby/object:Gem::Requirement
23
+ none: false
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ hash: 3
28
+ segments:
29
+ - 0
30
+ version: "0"
31
+ name: progressbar
32
+ version_requirements: *id001
33
+ prerelease: false
34
+ - !ruby/object:Gem::Dependency
35
+ type: :development
36
+ requirement: &id002 !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ hash: 23
42
+ segments:
43
+ - 1
44
+ - 0
45
+ - 0
46
+ version: 1.0.0
47
+ name: bundler
48
+ version_requirements: *id002
49
+ prerelease: false
50
+ - !ruby/object:Gem::Dependency
51
+ type: :development
52
+ requirement: &id003 !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ hash: 3
58
+ segments:
59
+ - 0
60
+ version: "0"
61
+ name: jeweler
62
+ version_requirements: *id003
63
+ prerelease: false
64
+ - !ruby/object:Gem::Dependency
65
+ type: :runtime
66
+ requirement: &id004 !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ hash: 3
72
+ segments:
73
+ - 0
74
+ version: "0"
75
+ name: progressbar
76
+ version_requirements: *id004
77
+ prerelease: false
78
+ description: Ruby implementation of Clauset's Hierarchical Random Graphs
79
+ email: trevor@texasexpat.net
80
+ executables:
81
+ - fitHRG
82
+ - consensusHRG
83
+ extensions: []
84
+
85
+ extra_rdoc_files:
86
+ - README.markdown
87
+ files:
88
+ - Gemfile
89
+ - Gemfile.lock
90
+ - README.markdown
91
+ - Rakefile
92
+ - bin/consensusHRG
93
+ - bin/fitHRG
94
+ - lib/Dendrograms.rb
95
+ - lib/Dendrograms/Consensus.rb
96
+ - lib/Dendrograms/Dendrogram.rb
97
+ - lib/Dendrograms/Graph.rb
98
+ - lib/Dendrograms/cli.rb
99
+ - scripts/apply_wordmap.rb
100
+ - scripts/dendro2clusters.rb
101
+ - scripts/dendro2hdot.rb
102
+ - scripts/hrg2dendro.rb
103
+ - scripts/hrg2dot.rb
104
+ - scripts/render.sh
105
+ homepage: http://github.com/doches/Dendrograms
106
+ licenses:
107
+ - MIT
108
+ post_install_message:
109
+ rdoc_options: []
110
+
111
+ require_paths:
112
+ - lib
113
+ required_ruby_version: !ruby/object:Gem::Requirement
114
+ none: false
115
+ requirements:
116
+ - - ">="
117
+ - !ruby/object:Gem::Version
118
+ hash: 3
119
+ segments:
120
+ - 0
121
+ version: "0"
122
+ required_rubygems_version: !ruby/object:Gem::Requirement
123
+ none: false
124
+ requirements:
125
+ - - ">="
126
+ - !ruby/object:Gem::Version
127
+ hash: 3
128
+ segments:
129
+ - 0
130
+ version: "0"
131
+ requirements: []
132
+
133
+ rubyforge_project:
134
+ rubygems_version: 1.8.15
135
+ signing_key:
136
+ specification_version: 3
137
+ summary: Ruby implementation of Clauset's Hierarchical Random Graphs
138
+ test_files: []
139
+