Dendrograms 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile ADDED
@@ -0,0 +1,13 @@
1
# Bundler manifest for the Dendrograms gem.
#
# Use the canonical HTTPS endpoint: the :gemcutter symbol source is deprecated
# and resolves to a plain-HTTP URL.
source "https://rubygems.org"

# Add dependencies required to use your gem here.
# Example:
#   gem "activesupport", ">= 2.3.5"
gem "progressbar"

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "bundler", ">= 1.0.0"
  gem "jeweler"
end
data/Gemfile.lock ADDED
@@ -0,0 +1,22 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ git (1.2.5)
5
+ jeweler (1.8.3)
6
+ bundler (~> 1.0)
7
+ git (>= 1.2.5)
8
+ rake
9
+ rdoc
10
+ json (1.6.6)
11
+ progressbar (0.10.0)
12
+ rake (0.9.2.2)
13
+ rdoc (3.12)
14
+ json (~> 1.4)
15
+
16
+ PLATFORMS
17
+ ruby
18
+
19
+ DEPENDENCIES
20
+ bundler (>= 1.0.0)
21
+ jeweler
22
+ progressbar
data/README.markdown ADDED
@@ -0,0 +1,3 @@
1
+ # Dendrograms
2
+
3
+ A pure Ruby implementation of Clauset's Hierarchical Random Graph tools. Installs two CLI applications, `fitHRG` and `consensusHRG`.
data/Rakefile ADDED
@@ -0,0 +1,29 @@
1
require 'rubygems'
require 'bundler'

# Activate the gems listed in the Gemfile; on failure report the problem and
# exit with the status code Bundler supplies.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "Dendrograms"
  gem.homepage = "http://github.com/doches/Dendrograms"
  gem.license = "MIT"
  gem.summary = %Q{Ruby implementation of Clauset's Hierarchical Random Graphs}
  gem.description = %Q{Ruby implementation of Clauset's Hierarchical Random Graphs}
  gem.email = "trevor@texasexpat.net"
  gem.authors = ["Trevor Fountain"]
  gem.version = "0.0.1"
  # Dependencies are declared in the Gemfile only. Repeating `progressbar` here
  # with add_runtime_dependency made it appear twice in the generated gemspec.
  # Examples, should a dependency ever need to be spec-only:
  #   gem.add_runtime_dependency 'jabber4r', '> 0.1'
  #   gem.add_development_dependency 'rspec', '> 1.2.3'
end
Jeweler::RubygemsDotOrgTasks.new
data/bin/consensusHRG ADDED
@@ -0,0 +1,75 @@
1
#!/usr/bin/env ruby

$LOAD_PATH.unshift File.join(File.dirname(__FILE__),"..","lib")

Description = "Re-implementation, basically, of Clauset's consensusHRG tool\nTakes a .dendro file and a wordmap; outputs a consensus hierarchy"
Usage = "ruby #{$0} file.dendro file.wordmap > file.consensus.dot"
Num_Args = 2

require 'Dendrograms'
include Dendrograms
require 'progressbar'

# Switches are removed from ARGV before the positional arguments are counted.
verbose = check_flag("-v","--verbose")
samples = check_opt("-s","--samples","300").to_i
spread = check_opt("-S","--spread","100").to_i

unless ARGV.size == Num_Args
  STDERR.puts Description
  STDERR.puts " "
  STDERR.puts "Usage: #{Usage}"
  exit(1)
end

STDERR.puts "#{samples} samples with a spread of #{spread} \n -> #{samples*spread} resamples"

dendro_file, wordmap_file = ARGV.shift(2)
graph_file = dendro_file.gsub("-best.dendro",".pairs")

# Maps the index column (kept as a string) to the word column of the wordmap.
@wordmap = {}
IO.foreach(wordmap_file) do |line|
  fields = line.strip.split(/\s+/)
  @wordmap[fields[1]] = fields[0]
end

graph = Graph.new(graph_file)
dendrogram = Dendrogram.new(graph, dendro_file)

# The progress bar is only shown when the verbose table is not.
progress = verbose ? nil : ProgressBar.new("Sampling",samples)
clusters = {}
STDERR.puts ["MCMC STEPS","LIKELIHOOD","TIME"].join("\t") if verbose
start = Time.now.to_i
samples.times do
  # Let the chain move `spread` steps between recorded samples.
  spread.times { dendrogram.sample! }

  # Every distinct leaf set in the current dendrogram votes once.
  leaf_sets = dendrogram.clusters.map { |cluster| cluster.reject { |x| x.nil? }.sort.join("_") }
  leaf_sets.uniq.each { |key| clusters[key] = (clusters[key] || 0) + 1 }

  if verbose
    STDERR.puts [dendrogram.mcmc_steps, dendrogram.likelihood,"#{Time.now.to_i-start}s"].join("\t")
  else
    progress.inc
  end
end
progress.finish unless verbose

# Keep only the leaf sets seen in more than half of the samples.
clusters.reject! { |k,v| v <= samples/2.0 }

# Largest sets first; the union of all kept leaves is placed in front as the root.
keep = clusters.keys.map { |key| key.split("_").map { |x| x.to_i } }.sort { |b,a| a.size <=> b.size }
keep.unshift keep.flatten.uniq
keep.uniq!

# Hang each remaining set below the smallest existing node that contains it.
hnodes = [ConsensusNode.new(keep.shift)]
until keep.empty?
  cluster = keep.shift
  parent = hnodes.select { |x| x.contains(cluster) }.sort { |a,b| a.size <=> b.size }.first
  new_node = ConsensusNode.new(cluster)
  parent.add_child(new_node)
  hnodes.push new_node
end

puts "graph {"
hnodes.first.to_dot(@wordmap)
puts "}"
data/bin/fitHRG ADDED
@@ -0,0 +1,51 @@
1
#!/usr/bin/env ruby

$LOAD_PATH.unshift File.join(File.dirname(__FILE__),"..","lib")

Description = "Re-implementation, basically, of Clauset's fitHRG tool. Takes a .pairs file and fits a HRG over the graph\nProduces a .dendro file with the fit HRG and a .info file with information about the run; these are updated as higher-likelihood dendrograms are found.\n\nIf you pass in an optional partial dendrogram, fitHRG will continue sampling from that saved point."
Usage = "ruby #{$0} file.pairs [file.dendro]"
Num_Args = 1

require 'Dendrograms'
include Dendrograms

verbose = check_flag("-v","--verbose")
verbose_saved = check_flag("-s", "--saved")

if ARGV.size < Num_Args
  STDERR.puts Description
  STDERR.puts " "
  STDERR.puts "Usage: #{Usage}"
  exit(1)
end

pairs_file = ARGV.shift
# Derive the output names from the input name minus a trailing ".pairs".
# If the input has a different extension the suffix is appended to the full
# name, so the outputs can never be the input file itself.
base = pairs_file.sub(/\.pairs$/,"")
dendrogram_file = "#{base}-best.dendro"
info_file = "#{base}-best.info"

graph = Graph.new(pairs_file)
# Resume from a saved dendrogram when one is given, otherwise start fresh.
dendrogram = ARGV.empty? ? Dendrogram.new(graph) : Dendrogram.new(graph, ARGV.shift)

best_likelihood = dendrogram.sample!
best_steps = 0
start = Time.now.to_i
STDERR.puts ["MCMC","LIKELIHOOD","BEST LIKEL.","AT MCMC","TIME"].join("\t") if verbose

# Sample until interrupted, saving every dendrogram that beats the best so far.
loop do
  likelihood = dendrogram.sample!
  saved = likelihood > best_likelihood
  if saved
    best_likelihood = likelihood
    dendrogram.save(dendrogram_file,info_file)
    best_steps = dendrogram.mcmc_steps
  end

  # Report every 1000 steps, and on every save when --saved was given.
  report = (saved && verbose_saved) || dendrogram.mcmc_steps % 1000 == 0
  if verbose && report
    STDERR.puts [dendrogram.mcmc_steps, dendrogram.likelihood, best_likelihood, best_steps, "#{Time.now.to_i-start}s"].join("\t")
  end
end
@@ -0,0 +1,4 @@
1
+ require 'Dendrograms/Graph'
2
+ require 'Dendrograms/Dendrogram'
3
+ require 'Dendrograms/Consensus'
4
+ require 'Dendrograms/cli'
@@ -0,0 +1,50 @@
1
module Dendrograms

  # A node of a consensus hierarchy. It is created from a flat list of leaves;
  # sub-clusters are then attached with #add_child, which replaces the leaves
  # they cover with the child node.
  class ConsensusNode
    attr_reader :index

    # Counter handing out a distinct index to every node created.
    @@index = 0
    # leaf => dense number used in the LEAF_<n> identifiers of the DOT output.
    @@leaves = {}

    # children - Array of leaves covered by this node.
    def initialize(children)
      @children = children
      @index = @@index
      @@index += 1
      # The leaf set is fixed at construction; @children changes in #add_child.
      @leaves = children.dup
    end

    # Returns the number assigned to +leaf+, assigning the next free one on
    # first use.
    def ConsensusNode.leaf(leaf)
      @@leaves[leaf] ||= @@leaves.size
    end

    # Returns the flat list of leaves found below this node.
    def children
      @children.map { |x| x.is_a?(ConsensusNode) ? x.children : x }.flatten
    end

    # Attaches +node+ as a child; leaves that +node+ covers are removed from
    # this node's direct children.
    def add_child(node)
      @children = @children - node.children
      @children.push node
    end

    # Returns true when every element of +set+ is one of this node's leaves.
    # Duplicate entries in +set+ do not affect the answer.
    def contains(set)
      (set - @leaves).empty?
    end

    # Returns the number of leaves this node was created with.
    def size
      @leaves.size
    end

    # Writes this subtree as DOT statements.
    #
    # wordmap - Hash from leaf.to_s to the label shown for that leaf.
    # out     - IO-like object receiving the lines (defaults to $stdout).
    def to_dot(wordmap, out = $stdout)
      out.puts "\tINTERNAL_#{@index} [shape=point, label=\"\"];"
      @children.each do |child|
        if child.is_a?(ConsensusNode)
          out.puts "\tINTERNAL_#{@index} -- INTERNAL_#{child.index};"
          child.to_dot(wordmap, out)
        else
          out.puts "\tLEAF_#{ConsensusNode.leaf(child)} [shape=none, label=\"#{wordmap[child.to_s]}\"];"
          out.puts "\tINTERNAL_#{@index} -- LEAF_#{ConsensusNode.leaf(child)};"
        end
      end
    end
  end

end
@@ -0,0 +1,337 @@
1
+ module Dendrograms
2
+
3
# Represents a single node in a dendrogram. Provides methods for transformation, and for computing likelihood
class DendrogramNode
  attr_accessor :index, :left, :right

  # Counter handing out a distinct index to every node created.
  @@index = 0
  # leaf => dense number used for the LEAF_<n> identifiers written by linkToLeaf.
  @@leaves = {}

  # Bound that keeps theta strictly inside (0, 1) so both logarithms in
  # #likelihood are finite.
  Epsilon = 0.00000000001

  # left, right - each either another DendrogramNode or a leaf value.
  def initialize(left, right)
    @left = left
    @right = right
    @index = @@index

    @@index += 1
    @child_cache = nil
  end

  # Returns "index<TAB>left (D|G)<TAB>right (D|G)"; D marks a DendrogramNode
  # child (printed by index), G marks a leaf.
  def to_s
    [@index, describe_child(@left), describe_child(@right)].join("\t")
  end

  # Forgets the leaf numbering used by linkToLeaf.
  def DendrogramNode.resetLeaves
    @@leaves = {}
  end

  # Returns the DOT lines linking +node+ (a DOT identifier) to +leaf+. The
  # leaf declaration is included only the first time a leaf is seen.
  def DendrogramNode.linkToLeaf(node, leaf, wordmap)
    dot = []
    if @@leaves[leaf].nil?
      @@leaves[leaf] = @@leaves.size
      label = wordmap.nil? ? leaf : wordmap[leaf.to_s]
      dot.push "LEAF_#{@@leaves[leaf]} [shape=none, label=\"#{label}\"];"
    end
    dot.push "#{node} -- LEAF_#{@@leaves[leaf]};"
    return dot
  end

  # DOT identifier of this node. Used for both the declaration and the edges
  # so that the two always agree.
  def dot_id
    "INTERNAL#{@index}"
  end

  # Returns this node's declaration plus the edges to its two children as one
  # tab-indented, newline-joined string.
  def to_dot(graph, wordmap=nil, likelihood=false)
    dot = dot_node(graph, wordmap, likelihood)

    [@left, @right].each do |child|
      if child.is_a?(DendrogramNode)
        dot.push "#{dot_id} -- #{child.dot_id};"
      else
        dot.concat DendrogramNode.linkToLeaf(dot_id, child, wordmap)
      end
    end

    return "\t#{dot.join("\n\t")}"
  end

  # Returns a one-element Array holding this node's DOT declaration.
  #
  # likelihood - false for a plain point; otherwise a threshold: the node is
  #              labelled with its theta (truncated to two decimals), in blue
  #              when above the threshold and red otherwise.
  # decorate   - when false a plain point is always emitted.
  def dot_node(graph, wordmap=nil, likelihood=false, decorate=true)
    label = "\"\""
    shape = "point"
    color = "black"
    if likelihood != false
      theta = connectedness(graph)[0]
      theta = (theta*100).to_i/100.0
      shape = "none"
      label = "\"#{theta}\""
      color = theta > likelihood ? "blue" : "red"
    end
    if decorate
      return ["#{dot_id} [shape=#{shape},label=#{label},fontcolor=#{color},color=red];"]
    else
      return ["#{dot_id} [shape=point, label=\"\"];"]
    end
  end

  # Returns the DOT lines for the hierarchy below this node. A node whose
  # theta is below +likelihood+ is expanded child by child; any other node is
  # drawn as a flat cluster linking straight to all of its leaves.
  #
  # wordmap    - Hash from leaf.to_s to label, or nil to label leaves with
  #              their own value.
  # likelihood - theta threshold; false expands every node.
  def hierarchy_dot(graph, wordmap, likelihood)
    dot = dot_node(graph, wordmap, likelihood, false)
    theta = connectedness(graph)[0]

    if likelihood == false || theta < likelihood
      [@left, @right].each do |child|
        if child.is_a?(DendrogramNode)
          dot.push "#{dot_id} -- #{child.dot_id};"
          dot.push child.hierarchy_dot(graph, wordmap, likelihood)
        else
          dot.push "#{dot_id} -- LEAF#{child};"
          dot.push "LEAF#{child} [shape=none, label=\"#{leaf_label(child, wordmap)}\"];"
        end
      end
    else
      dot.push children.map { |x| ["#{dot_id} -- LEAF#{x};", "LEAF#{x} [shape=none, label=\"#{leaf_label(x, wordmap)}\"];"] }
    end

    dot.flatten
  end

  # Returns the flat list of leaves below this node. The list is cached; pass
  # force = true to rebuild it after the node's children have changed.
  def children(force = false)
    if force or @child_cache.nil?
      @child_cache = [@left.is_a?(DendrogramNode) ? @left.children() : @left,
                      @right.is_a?(DendrogramNode) ? @right.children() : @right].flatten
    end

    return @child_cache
  end

  # Returns [theta, max_links]: the fraction of possible left-to-right links
  # that graph.edges_between reports, and the number of possible links.
  def connectedness(graph)
    left_children = @left.is_a?(DendrogramNode) ? @left.children : [@left]
    right_children = @right.is_a?(DendrogramNode) ? @right.children : [@right]

    links = graph.edges_between(left_children, right_children).to_f
    max_links = (left_children.size * right_children.size)
    theta = links / max_links.to_f

    return [theta, max_links]
  end

  # Returns this node's log-likelihood contribution: minus the binary entropy
  # of theta, multiplied by max_links.
  def likelihood(graph)
    theta, max_links = *connectedness(graph)
    theta = Epsilon if theta <= 0.0
    theta = 1.0-Epsilon if theta >= 1.0
    # l = (theta**links) * (1-theta)**(max_links-links)
    h = -theta*Math.log(theta) - (1-theta)*Math.log(1-theta)
    return -h * max_links
  end

  # True when at least one child is a DendrogramNode, i.e. a subtree swap is
  # possible at this node.
  def mutable?
    @left.is_a?(DendrogramNode) or @right.is_a?(DendrogramNode)
  end

  # Picks a random subtree swap for this node. Only call on a #mutable? node.
  #
  # Returns a Hash with
  #   :child       - the internal child whose own child is swapped out
  #   :do_left     - true to swap :child's left child, false for its right
  #   :local_child - this node's other child, which takes the vacated place
  def get_mutation
    # Are we swapping children with the left or the right child? When both
    # are internal nodes the choice is random, so both are reachable.
    left_internal = @left.is_a?(DendrogramNode)
    right_internal = @right.is_a?(DendrogramNode)
    child = if left_internal && right_internal
              rand > 0.5 ? @left : @right
            elsif left_internal
              @left
            else
              @right
            end

    # Are we swapping the child's left or right child?
    do_left = rand > 0.5

    return {:child => child, :do_left => do_left, :local_child => (child.equal?(@left) ? @right : @left)}
  end

  # Applies +mutation+ (default: a fresh #get_mutation) and returns it with
  # :local_child updated, so that passing the returned Hash to mutate! again
  # undoes the swap.
  def mutate!(mutation = nil)
    mutation ||= get_mutation
    child = mutation[:child]
    local = mutation[:local_child]

    # Move our other child into the chosen slot of :child ...
    if mutation[:do_left]
      swapped = child.left
      child.left = local
    else
      swapped = child.right
      child.right = local
    end

    # ... and take the displaced subtree into the slot our child vacated.
    if local == @left
      @left = swapped
    else
      @right = swapped
    end
    mutation[:local_child] = swapped

    # Both leaf caches are stale now; rebuild the child's first since ours
    # is built from it.
    child.children(true)
    children(true)
    return mutation
  end

  private

  # "index (D)" for an internal child, "value (G)" for a leaf.
  def describe_child(child)
    child.is_a?(DendrogramNode) ? "#{child.index} (D)" : "#{child} (G)"
  end

  # Label for a leaf: the wordmap entry, or the leaf itself without a wordmap.
  def leaf_label(leaf, wordmap)
    wordmap.nil? ? leaf : wordmap[leaf.to_s]
  end
end
180
+
181
require 'yaml'

# Takes a Graph, builds a dendrogram, and provides methods to sample, compute likelihood, and save (with optional info)
class Dendrogram
  attr_reader :graph, :likelihood, :mcmc_steps, :root

  # graph     - object providing #nodes and #edges_between.
  # tree_file - optional path to a saved dendrogram (one node per line, in the
  #             format written by #save). When nil a random dendrogram is
  #             built over graph.nodes.
  def initialize(graph, tree_file=nil)
    @graph = graph
    @nodes = []
    @likelihoods = []
    @likelihood = 0
    @mcmc_steps = 0

    if tree_file
      load_tree(tree_file)
      load_mcmc_steps(tree_file)
    else
      build_random_tree
    end

    # update_likelihood addresses @likelihoods by node.index, so every node's
    # index must equal its position in @nodes, whatever the node counter was
    # when the node was created.
    @nodes.each_with_index { |node, position| node.index = position }

    # Initialise likelihoods
    @nodes.each_with_index { |node, index| @likelihoods[index] = node.likelihood(@graph) }
    # Compute starting likelihood
    @likelihood = @likelihoods.inject(0) { |s,x| s + x }
  end

  # Returns the mean of exp(node likelihood) over all nodes, and also prints
  # it to STDERR.
  def mean_likelihood
    mean = @likelihoods.map { |x| Math.exp(x) }.inject(0) { |s,x| s + x } / @likelihoods.size.to_f
    STDERR.puts "Mean likelihood: #{mean}"
    return mean
  end

  # Returns the mean node connectedness (theta)
  def mean_theta
    @nodes.map { |x| x.connectedness(@graph)[0] }.inject(0) { |s,x| s + x } / @nodes.size.to_f
  end

  # Returns the median node connectedness (the upper median for an even
  # number of nodes)
  def median_theta
    v = @nodes.map { |x| x.connectedness(@graph)[0] }.sort
    return v[v.size/2]
  end

  # Performs one MCMC step: mutate a random mutable node, keep the change if
  # the likelihood improved or with probability exp(new - old) otherwise, and
  # undo it if rejected. Returns the resulting likelihood.
  def sample!
    # With no mutable node there is nothing to propose; count the step and
    # return instead of searching forever.
    unless @nodes.any? { |node| node.mutable? }
      @mcmc_steps += 1
      return @likelihood
    end

    mutate = @nodes[rand(@nodes.size)]
    mutate = @nodes[rand(@nodes.size)] until mutate.mutable?

    # Mutate tree
    mutation = mutate.mutate!

    old_likelihood = @likelihood
    update_likelihood([mutate, mutate.left, mutate.right])

    accepted = @likelihood > old_likelihood || Math.log(rand) < @likelihood - old_likelihood
    unless accepted
      # Applying the returned mutation again reverses the swap.
      mutate.mutate!(mutation)
      update_likelihood([mutate, mutate.left, mutate.right])
    end
    @mcmc_steps += 1

    return @likelihood
  end

  # Returns the leaf list of every node.
  def clusters
    @nodes.map { |node| node.children }
  end

  # Recompute the likelihood of the given nodes and fold the change into the
  # total. Entries that are not DendrogramNodes (leaves) are ignored.
  def update_likelihood(nodes)
    nodes.each do |node|
      if node.is_a?(DendrogramNode)
        @likelihood -= @likelihoods[node.index]
        @likelihoods[node.index] = node.likelihood(@graph)
        @likelihood += @likelihoods[node.index]
      end
    end
  end

  # Writes the nodes to tree_file, the likelihood and step count (as YAML) to
  # info_file, and a DOT rendering next to tree_file with the extension
  # replaced by ".dot".
  def save(tree_file, info_file)
    File.open(tree_file,'w') do |fout|
      fout.puts @nodes.map { |node| node.to_s }.join("\n")
    end

    File.open(info_file, 'w') do |fout|
      fout.puts({:likelihood => @likelihood, :mcmc => @mcmc_steps}.to_yaml)
    end

    to_dot(tree_file.gsub(/\.[^\.]+$/,".dot"))
  end

  # Returns the whole dendrogram as a DOT graph string.
  def get_dot(wordmap=nil, likelihood=false)
    DendrogramNode.resetLeaves
    ["graph {",@nodes.map { |node| node.to_dot(@graph,wordmap,likelihood) }.join("\n"),"}"].join("\n")
  end

  # Returns the hierarchy below the root, as produced by
  # DendrogramNode#hierarchy_dot, wrapped in a DOT graph.
  def get_hierarchy_dot(wordmap=nil, likelihood=false)
    DendrogramNode.resetLeaves
    ["graph {", @root.hierarchy_dot(@graph, wordmap, likelihood).map { |x| "\t#{x}" }.join("\n"), "}"].join("\n")
  end

  # Writes #get_dot to dot_file.
  def to_dot(dot_file, wordmap=nil, likelihood=false)
    File.open(dot_file,'w') do |fout|
      fout.puts get_dot(wordmap,likelihood)
    end
  end

  private

  # Reads the nodes from tree_file, links them up and picks the root.
  def load_tree(tree_file)
    by_file_index = {}
    IO.foreach(tree_file) do |line|
      next unless line =~ /^(\d+)\t(\d+) \(([DG])\)\t(\d+) \(([DG])\)/
      index, left, ltype, right, rtype = $1.to_i, $2.to_i, $3, $4.to_i, $5

      node = DendrogramNode.new(left, right)
      node.index = index
      # Wrap references to other nodes in an Array so they can be told
      # apart from leaf values until every node has been read.
      node.left = [left] if ltype == "D"
      node.right = [right] if rtype == "D"
      @nodes.push node
      by_file_index[index] = node
    end

    # Update mappings
    @nodes.each do |node|
      node.left = by_file_index[node.left[0]] if node.left.is_a?(Array)
      node.right = by_file_index[node.right[0]] if node.right.is_a?(Array)
    end

    # Find root: the node with the most leaves below it
    @root = @nodes.max_by { |node| node.children.size }
  end

  # Restores the MCMC step count from the .info file next to tree_file, if
  # there is one. Failures are reported and otherwise ignored.
  def load_mcmc_steps(tree_file)
    info_file = tree_file.gsub(".dendro",".info")
    return unless File.exist?(info_file)

    status = YAML.load_file(info_file)
    steps = status.is_a?(Hash) ? status[:mcmc] : nil
    raise ArgumentError, "no :mcmc entry in #{info_file}" unless steps.is_a?(Integer)
    @mcmc_steps = steps
  rescue StandardError
    STDERR.puts "Unable to load MCMC status from .info; carrying on."
  end

  # Incrementally joins randomly chosen items (graph nodes or already built
  # subtrees) until a single item, the root, remains.
  def build_random_tree
    remaining = @graph.nodes.dup.sort_by { rand }

    while remaining.size > 1
      a = remaining.pop
      b = remaining.shift

      node = DendrogramNode.new(a,b)
      @nodes.push node
      remaining.push(node)
      remaining = remaining.sort_by { rand }
    end

    # Hold on to the last remaining node; it's the root
    @root = remaining.shift
  end
end
336
+
337
+ end
@@ -0,0 +1,40 @@
1
module Dendrograms

  # Loads a graph from a .pairs file, and computes the number of edges between sets of nodes
  class Graph
    # Array of the distinct node ids seen in the file.
    attr_reader :nodes

    # Load a .pairs file: one edge per line, written as two whitespace-separated
    # integer node ids. Edges are stored in both directions.
    #
    # Blank lines are skipped. A line with a single id registers that node
    # without adding an edge.
    def initialize(pairs)
      @nodes = {}
      @edges = {}

      IO.foreach(pairs) do |line|
        from, to = *(line.strip.split(/\s+/).map { |x| x.to_i })
        next if from.nil?
        @nodes[from] = true
        next if to.nil?
        @nodes[to] = true
        @edges[edge_key(from,to)] = true
        @edges[edge_key(to, from)] = true
      end
      @nodes = @nodes.keys
    end

    # Compute the number of edges between two sets of nodes
    def edges_between(set_a, set_b)
      count = 0
      set_a.each do |a|
        set_b.each do |b|
          count += 1 if @edges[edge_key(a,b)]
        end
      end
      return count
    end

    # Return a unique key for an edge between A and B
    def edge_key(a,b)
      "#{a}_#{b}".to_sym
    end
  end

end
@@ -0,0 +1,35 @@
1
module Dendrograms

  # Removes every occurrence of the switch (short or long form) from +argv+.
  #
  # Returns true when the switch was present, false otherwise.
  def check_flag(short, long, argv = ARGV)
    present = false
    [short, long].each do |opt|
      # Array#delete removes all matches and returns nil when there were none.
      present = true unless argv.delete(opt).nil?
    end
    return present
  end

  # Removes the first occurrence of an option (short form tried first, then
  # long) together with the argument that follows it from +argv+.
  #
  # Returns the option's argument, or +default+ when the option is absent or
  # is the last element and so has no argument.
  def check_opt(short, long, default, argv = ARGV)
    [short, long].each do |opt|
      index = argv.index(opt)
      next if index.nil?

      argv.delete_at(index)          # the switch itself
      value = argv.delete_at(index)  # its argument; nil when the switch was last
      return value.nil? ? default : value
    end
    return default
  end

  # Prints +description+ and +usage+ to STDERR and exits with status 1 unless
  # +argv+ holds exactly +args+ elements.
  def usage(description, usage, args, argv = ARGV)
    if argv.size != args
      STDERR.puts description
      STDERR.puts " "
      STDERR.puts "Usage: #{usage}"
      exit(1)
    end
  end

end
@@ -0,0 +1,79 @@
1
#!/usr/bin/env ruby
#
# Apply a wordmap to all files with a given prefix
#
# Usage: apply_wordmap path/to/wordmap prefix/files [-w|--warn]

require 'progressbar'

# Pull the optional switch out first so it is not counted as a positional
# argument. `||` is required here: with `or`, the assignment binds tighter
# and --warn would be ignored.
@debug = ARGV.include?("-w") || ARGV.include?("--warn")
ARGV.reject! { |x| x == "-w" || x == "--warn" }

if ARGV.size < 2
  STDERR.puts "Apply a wordmap to all files with a given prefix"
  STDERR.puts ""
  STDERR.puts "Usage: #{$0} path/to/wordmap prefix/files [-w|--warn]"
  # Missing arguments are an error, so exit non-zero.
  exit(1)
end

wordmap_path = ARGV.shift
prefix = ARGV.shift
21
+
22
# Applies @wordmap to the contents of +file+.
#
# Only files whose name ends in ".dot" are handled: every quoted index
# ("<index>") is replaced by the quoted word mapped to it. Returns the
# rewritten text, or nil for any other extension (with a notice on STDERR
# when @debug is set).
def process(file)
  ext = file.split(".").last
  txt = IO.read(file).strip

  unless ext == "dot"
    STDERR.puts "Unrecognised file extension \"#{ext}\"" if @debug
    return nil
  end

  STDERR.puts "#{file} (DOT)"
  progress = ProgressBar.new("Applying",@wordmap.size)
  @wordmap.each_pair do |index, word|
    txt.gsub!("\"#{index}\"","\"#{word}\"") unless word.nil?
    progress.inc
  end
  progress.finish
  txt

  # Disabled handlers for other extensions, kept for reference:
  #
  # when "matrix"
  #   STDERR.puts "#{file} (MATRIX)"
  #   progress = ProgressBar.new("Applying",@wordmap.size)
  #   @wordmap.each_pair { |index,word| txt.gsub!(/^#{index}\:/,"#{word}:") if not word.nil?; progress.inc }
  #   progress.finish
  #   return txt
  # when "txt"
  #   STDERR.puts "#{file} (TXT)"
  #   progress = ProgressBar.new("Applying",@wordmap.size)
  #   @wordmap.each_pair { |index,word| txt.gsub!("<#{index}>",word) if not word.nil?; progress.inc }
  #   progress.finish
  #   return txt
  # when "graph"
  #   STDERR.puts "#{file} (GRAPH)"
  #   progress = ProgressBar.new("Applying",@wordmap.size*2)
  #   @wordmap.each_pair { |index,word| txt.gsub!(/^#{index}\t/,"#{word}\t") if not word.nil?; progress.inc }
  #   @wordmap.each_pair { |index,word| txt.gsub!(/\t#{index}\t/,"\t#{word}\t") if not word.nil?; progress.inc }
  #   progress.finish
  #   return txt
end
59
+
60
# index (Integer) => word. When an index occurs more than once the first
# word wins.
@wordmap = {}
IO.foreach(wordmap_path) do |line|
  word,index = *(line.strip.split(/\s+/))
  index = index.to_i
  @wordmap[index] ||= word
end

# Write "<name>.human.<ext>" next to every matching file, unless it already
# exists. Files that already carry the "human" part map onto themselves and
# are therefore skipped.
Dir.glob("#{prefix}*").each do |file|
  parts = file.split(".").reject { |x| x == "human" }
  ext = parts.pop
  new_filename = parts.push("human").push(ext).join(".")
  # File.exist? rather than File.exists?, which was removed in Ruby 3.2.
  next if File.exist?(new_filename)

  txt = process(file)
  next if txt.nil?

  File.open(new_filename,'w') { |fout| fout.puts txt }
end
@@ -0,0 +1,37 @@
1
# Takes a .dendro and outputs a clusterval-friendly yaml flat clustering.
#
# The .pairs and .wordmap files are looked up in the directory of the .dendro
# (the first match of each is used).

# Load the library the same way the executables in bin/ do; the package has no
# lib/cli, lib/Dendrogram or lib/Graph files.
$LOAD_PATH.unshift File.join(File.dirname(__FILE__),"..","lib")

require 'yaml'
require 'Dendrograms'
include Dendrograms

usage(
  "Takes a .dendro and outputs a clusterval-friendly yaml flat clustering",
  "ruby #{$0} file.dendro > file.yaml",
  1
)

dendro_f = ARGV.shift
dir = File.dirname(dendro_f)
graph_f = Dir.glob(File.join(dir,"*.pairs"))[0]
wordmap_f = Dir.glob(File.join(dir,"*.wordmap"))[0]
if graph_f.nil? || wordmap_f.nil?
  STDERR.puts "Expected a .pairs and a .wordmap file in #{dir}"
  exit(1)
end

@graph = Graph.new(graph_f)
@dendrogram = Dendrogram.new(@graph, dendro_f)
# index (String) => word
@wordmap = {}
IO.foreach(wordmap_f) do |line|
  word,index = *(line.strip.split(/\s+/))
  @wordmap[index] = word
end

# Subtrees whose connectedness exceeds the mean become one flat cluster.
@min = @dendrogram.mean_theta

@clusters = {}

# Walks the dendrogram from +node+ downwards, adding to @clusters (keyed by a
# running number as a String) a single-word cluster for every leaf reached and
# one cluster for every subtree whose theta exceeds @min.
def identify_subtree_clusters(node)
  # Anything that is not a DendrogramNode is a leaf. (The original test was
  # is_a?(Fixnum), a constant removed in Ruby 3.2.)
  unless node.is_a?(DendrogramNode)
    return @clusters[@clusters.size.to_s] = [@wordmap[node.to_s]]
  end

  theta = node.connectedness(@graph)[0]
  if theta > @min
    @clusters[@clusters.size.to_s] = node.children.map { |x| @wordmap[x.to_s] }
  else
    [identify_subtree_clusters(node.left), identify_subtree_clusters(node.right)]
  end
end

identify_subtree_clusters(@dendrogram.root)
puts @clusters.to_yaml
@@ -0,0 +1,28 @@
1
Description = "Takes a .dendro and outputs a .dot file"
Usage = "ruby #{$0} file.dendro > file.dot"
Num_Args = 1

if ARGV.size != Num_Args
  STDERR.puts Description
  STDERR.puts " "
  STDERR.puts "Usage: #{Usage}"
  exit(1)
end

# Load the library the same way the executables in bin/ do; the package has no
# lib/Dendrogram or lib/Graph files.
$LOAD_PATH.unshift File.join(File.dirname(__FILE__),"..","lib")
require 'Dendrograms'
include Dendrograms

# The .pairs and .wordmap files are taken from the directory of the .dendro
# (first match of each).
dendro_f = ARGV.shift
dir = File.dirname(dendro_f)
graph_f = Dir.glob(File.join(dir,"*.pairs"))[0]
wordmap_f = Dir.glob(File.join(dir,"*.wordmap"))[0]
if graph_f.nil? || wordmap_f.nil?
  STDERR.puts "Expected a .pairs and a .wordmap file in #{dir}"
  exit(1)
end

@graph = Graph.new(graph_f)
@dendrogram = Dendrogram.new(@graph, dendro_f)
# index (String) => word
@wordmap = {}
IO.foreach(wordmap_f) do |line|
  word,index = *(line.strip.split(/\s+/))
  @wordmap[index] = word
end

# Nodes are coloured relative to the mean connectedness of the dendrogram.
puts @dendrogram.get_dot(@wordmap, @dendrogram.mean_theta)
@@ -0,0 +1,32 @@
1
Description = "Takes a .hrg and a -names.lut and outputs a .dendro"
Usage = "ruby #{$0} file.hrg file.lut > file.dendro"
Num_Args = 2

unless ARGV.size == Num_Args
  STDERR.puts Description
  STDERR.puts " "
  STDERR.puts "Usage: #{Usage}"
  exit(1)
end

hrg_f, lut_f = ARGV.shift(2)

# virtual id => real id. The first line of the lookup table is skipped; the
# table only comes into existence once that line has been seen.
names = nil
IO.foreach(lut_f) do |line|
  if names
    virtual, real = *(line.strip.split(/\s+/))
    names[virtual] = real
  else
    names = {}
  end
end

# Input lines look like:
#[ 0 ] L= 46 (D) R= 29 (D) p= 0 e= 0 n= 49
IO.foreach(hrg_f) do |line|
  next unless line =~ /\[ (\d+) \] L= (\d+) \(([D|G])\) R= (\d+) \(([D|G])\)/

  index, left, left_type, right, right_type = $1, $2, $3, $4, $5
  # Leaves (G) are translated to their real ids; internal nodes (D) keep theirs.
  left = names[left] if left_type == "G"
  right = names[right] if right_type == "G"
  puts [index, "#{left} (#{left_type})", "#{right} (#{right_type})"].join("\t")
end
@@ -0,0 +1,64 @@
1
# Takes a .hrg (and an optional wordmap) and outputs a .dot
#
# Usage: ruby #{$0} file.hrg [file.wordmap] > file.dot

hrg = ARGV.shift
# Without the .hrg argument there is nothing to do; say so instead of failing
# on nil further down.
if hrg.nil?
  STDERR.puts "Takes a .hrg (and an optional wordmap) and outputs a .dot"
  STDERR.puts " "
  STDERR.puts "Usage: ruby #{$0} file.hrg [file.wordmap] > file.dot"
  exit(1)
end
wordmap_f = ARGV.shift # nil when no wordmap was given

# virtual id => real id, read from the -names.lut file that sits next to the
# .hrg. Lines containing "virtual" are skipped.
@map_names = true
@names = {}
IO.foreach(hrg.gsub("_best-dendro.hrg","-names.lut")) do |line|
  next if line =~ /virtual/
  virtual, real = *(line.strip.split(/\s+/))
  @names[virtual] = real
end

# index (String) => word, or nil when no wordmap was given
@wordmap = nil
unless wordmap_f.nil?
  @wordmap = {}
  IO.foreach(wordmap_f) do |line|
    word, index = *(line.strip.split(/\s+/))
    @wordmap[index] = word
  end
end

# Returns the DOT identifier of the internal node +index+, printing its
# declaration the first time the node is seen.
@nodes = {}
def internal(index)
  if @nodes[index].nil?
    @nodes[index] = @nodes.size
    puts "\tINTERNAL#{@nodes[index]} [shape=point,label=\"\"];"
  end

  "INTERNAL#{@nodes[index]}"
end

# Returns the DOT identifier of the leaf +index+, printing its declaration the
# first time the leaf is seen. The index is translated through @names (when
# @map_names is set) and the label through @wordmap (when one was given).
@leaves = {}
def leaf(index)
  index = @names[index] if @map_names
  if @leaves[index].nil?
    @leaves[index] = @leaves.size
    label = index
    label = @wordmap[label] unless @wordmap.nil?
    puts "\tLEAF#{@leaves[index]} [shape=none,label=\"#{label}\"];"
  end

  "LEAF#{@leaves[index]}"
end

# Dispatches on the node type: "D" is an internal node, anything else a leaf.
def node(type,index)
  type == "D" ? internal(index) : leaf(index)
end

puts "graph {"
IO.foreach(hrg) do |line|
  next unless line =~ /\[ (\d+) \] L= (\d+) \((D|G)\) R= (\d+) \((D|G)\)/

  index, left, left_type, right, right_type = $1, $2, $3, $4, $5
  dnode = node("D", index)
  # Declare both children first, then print the two edges.
  [[left_type, left], [right_type, right]].map { |pair| node(pair[0],pair[1]) }.each { |x| puts "\t#{dnode} -- #{x};" }
end
puts "}"
data/scripts/render.sh ADDED
@@ -0,0 +1,6 @@
1
#!/bin/bash
#
# Apply the wordmap to the files in a directory and render the resulting
# *-best.human.dot to render.png.
#
# Usage: scripts/render.sh path/to/dir

if [ -z "$1" ]; then
  echo "Usage: $0 path/to/dir" >&2
  exit 1
fi

# -f: there is nothing to remove on the first run.
rm -f "$1"/*-best.human.dot
# apply_wordmap globs "<prefix>*", so pass the directory with a trailing slash
# to match the files inside it.
ruby scripts/apply_wordmap.rb "$1"/*.wordmap "$1/"
dot -Tpng "$1"/*-best.human.dot -o render.png
open render.png
metadata ADDED
@@ -0,0 +1,139 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: Dendrograms
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Trevor Fountain
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2012-04-03 00:00:00 Z
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ type: :runtime
22
+ requirement: &id001 !ruby/object:Gem::Requirement
23
+ none: false
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ hash: 3
28
+ segments:
29
+ - 0
30
+ version: "0"
31
+ name: progressbar
32
+ version_requirements: *id001
33
+ prerelease: false
34
+ - !ruby/object:Gem::Dependency
35
+ type: :development
36
+ requirement: &id002 !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ hash: 23
42
+ segments:
43
+ - 1
44
+ - 0
45
+ - 0
46
+ version: 1.0.0
47
+ name: bundler
48
+ version_requirements: *id002
49
+ prerelease: false
50
+ - !ruby/object:Gem::Dependency
51
+ type: :development
52
+ requirement: &id003 !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ hash: 3
58
+ segments:
59
+ - 0
60
+ version: "0"
61
+ name: jeweler
62
+ version_requirements: *id003
63
+ prerelease: false
64
+ - !ruby/object:Gem::Dependency
65
+ type: :runtime
66
+ requirement: &id004 !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ hash: 3
72
+ segments:
73
+ - 0
74
+ version: "0"
75
+ name: progressbar
76
+ version_requirements: *id004
77
+ prerelease: false
78
+ description: Ruby implementation of Clauset's Hierarchical Random Graphs
79
+ email: trevor@texasexpat.net
80
+ executables:
81
+ - fitHRG
82
+ - consensusHRG
83
+ extensions: []
84
+
85
+ extra_rdoc_files:
86
+ - README.markdown
87
+ files:
88
+ - Gemfile
89
+ - Gemfile.lock
90
+ - README.markdown
91
+ - Rakefile
92
+ - bin/consensusHRG
93
+ - bin/fitHRG
94
+ - lib/Dendrograms.rb
95
+ - lib/Dendrograms/Consensus.rb
96
+ - lib/Dendrograms/Dendrogram.rb
97
+ - lib/Dendrograms/Graph.rb
98
+ - lib/Dendrograms/cli.rb
99
+ - scripts/apply_wordmap.rb
100
+ - scripts/dendro2clusters.rb
101
+ - scripts/dendro2hdot.rb
102
+ - scripts/hrg2dendro.rb
103
+ - scripts/hrg2dot.rb
104
+ - scripts/render.sh
105
+ homepage: http://github.com/doches/Dendrograms
106
+ licenses:
107
+ - MIT
108
+ post_install_message:
109
+ rdoc_options: []
110
+
111
+ require_paths:
112
+ - lib
113
+ required_ruby_version: !ruby/object:Gem::Requirement
114
+ none: false
115
+ requirements:
116
+ - - ">="
117
+ - !ruby/object:Gem::Version
118
+ hash: 3
119
+ segments:
120
+ - 0
121
+ version: "0"
122
+ required_rubygems_version: !ruby/object:Gem::Requirement
123
+ none: false
124
+ requirements:
125
+ - - ">="
126
+ - !ruby/object:Gem::Version
127
+ hash: 3
128
+ segments:
129
+ - 0
130
+ version: "0"
131
+ requirements: []
132
+
133
+ rubyforge_project:
134
+ rubygems_version: 1.8.15
135
+ signing_key:
136
+ specification_version: 3
137
+ summary: Ruby implementation of Clauset's Hierarchical Random Graphs
138
+ test_files: []
139
+