clusta 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +66 -0
- data/VERSION +1 -1
- data/bin/clusta +1 -28
- data/lib/clusta.rb +12 -3
- data/lib/clusta/geometry.rb +53 -8
- data/lib/clusta/geometry/all.rb +3 -0
- data/lib/clusta/geometry/assortativity.rb +2 -2
- data/lib/clusta/geometry/degree.rb +3 -1
- data/lib/clusta/geometry/{edge_degree_pair.rb → degree_pair.rb} +3 -3
- data/lib/clusta/geometry/directed/degree.rb +3 -1
- data/lib/clusta/geometry/directed/{edge_degree_pair.rb → degree_pair.rb} +4 -3
- data/lib/clusta/geometry/directed/edge.rb +4 -2
- data/lib/clusta/geometry/directed/{arrow.rb → neighbor.rb} +1 -1
- data/lib/clusta/geometry/directed/neighborhood.rb +31 -0
- data/lib/clusta/geometry/edge.rb +6 -4
- data/lib/clusta/geometry/element.rb +10 -117
- data/lib/clusta/geometry/{arrow.rb → neighbor.rb} +3 -3
- data/lib/clusta/geometry/neighborhood.rb +41 -0
- data/lib/clusta/geometry/vertex.rb +4 -1
- data/lib/clusta/runner.rb +101 -4
- data/lib/clusta/schema.rb +100 -0
- data/lib/clusta/serialization.rb +63 -0
- data/lib/clusta/serialization/json.rb +86 -0
- data/lib/clusta/serialization/tsv.rb +81 -0
- data/lib/clusta/transforms.rb +59 -26
- data/lib/clusta/transforms/{edge_degree_pairs_to_assortativities.rb → degree_pairs_to_assortativities.rb} +7 -3
- data/lib/clusta/transforms/edges_to_degrees.rb +5 -0
- data/lib/clusta/transforms/{edges_to_vertex_arrows.rb → edges_to_neighborhoods.rb} +11 -6
- data/lib/clusta/transforms/import.rb +6 -0
- data/lib/clusta/transforms/neighborhoods_to_degree_pairs.rb +70 -0
- data/lib/clusta/transforms/pm3d.rb +46 -0
- data/lib/clusta/transforms/prune_edges.rb +34 -0
- data/spec/clusta/schema_spec.rb +36 -0
- data/spec/clusta/serialization/json_spec.rb +133 -0
- data/spec/clusta/serialization/tsv_spec.rb +133 -0
- data/spec/clusta/serialization_spec.rb +27 -0
- data/spec/clusta/transforms/degree_pairs_to_assortativities_spec.rb +13 -0
- data/spec/clusta/transforms/{edges_to_vertex_arrows_spec.rb → edges_to_neighborhoods_spec.rb} +5 -5
- data/spec/clusta/transforms/import_spec.rb +9 -0
- data/spec/clusta/transforms/neighborhoods_to_degree_pairs_spec.rb +21 -0
- data/spec/clusta/transforms/prune_edges_spec.rb +22 -0
- data/spec/data/assortativities/directed.tsv +4 -0
- data/spec/data/assortativities/undirected.tsv +7 -0
- data/spec/data/degree_pairs/directed.tsv +10 -0
- data/spec/data/degree_pairs/undirected.tsv +18 -0
- data/spec/data/external/vertices.tsv +9 -0
- data/spec/data/imports/vertices.labeled.tsv +9 -0
- data/spec/data/neighborhoods/directed.unweighted.tsv +7 -0
- data/spec/data/neighborhoods/directed.weighted.tsv +7 -0
- data/spec/data/neighborhoods/undirected.unweighted.tsv +9 -0
- data/spec/data/neighborhoods/undirected.weighted.tsv +9 -0
- data/spec/data/pruned_edges/directed.unweighted.tsv +1 -0
- data/spec/data/pruned_edges/directed.weighted.tsv +3 -0
- data/spec/data/pruned_edges/undirected.unweighted.tsv +1 -0
- data/spec/data/pruned_edges/undirected.weighted.tsv +3 -0
- data/spec/support/transforms_spec_helper.rb +5 -1
- metadata +47 -23
- data/lib/clusta/geometry/directed/vertex_arrows.rb +0 -25
- data/lib/clusta/geometry/vertex_arrows.rb +0 -45
- data/lib/clusta/transforms/vertex_arrows_to_edge_degree_pairs.rb +0 -63
- data/spec/clusta/geometry/element_spec.rb +0 -191
- data/spec/data/vertex_arrows/directed.unweighted.tsv +0 -7
- data/spec/data/vertex_arrows/directed.weighted.tsv +0 -7
- data/spec/data/vertex_arrows/undirected.unweighted.tsv +0 -9
- data/spec/data/vertex_arrows/undirected.weighted.tsv +0 -9
data/README.rdoc
CHANGED
@@ -0,0 +1,66 @@
|
|
1
|
+
= Clusta
|
2
|
+
|
3
|
+
Clusta is a Ruby gem for network analysis built on top of
|
4
|
+
Wukong[http://github.com/mrflip/wukong].
|
5
|
+
|
6
|
+
Wukong lets you write Ruby scripts that run on your laptop as well as
|
7
|
+
on a Hadoop cluster.
|
8
|
+
|
9
|
+
Clusta is:
|
10
|
+
|
11
|
+
- classes that make describing the geometry of networks easy
|
12
|
+
- network algorithms written with these classes to use Wukong
|
13
|
+
- a shim command-line program for running these algorithms
|
14
|
+
|
15
|
+
Start with a file containing edges:
|
16
|
+
|
17
|
+
Edge 1 2
|
18
|
+
Edge 2 3
|
19
|
+
Edge 1 4
|
20
|
+
Edge 4 5
|
21
|
+
Edge 5 6
|
22
|
+
Edge 5 7
|
23
|
+
Edge 6 8
|
24
|
+
Edge 7 8
|
25
|
+
Edge 8 9
|
26
|
+
|
27
|
+
Run this through a transformation named +edges_to_degrees+:
|
28
|
+
|
29
|
+
$ clusta --transform=edges_to_degrees /local/edges.tsv -
|
30
|
+
Degree 1 2
|
31
|
+
Degree 2 2
|
32
|
+
Degree 3 1
|
33
|
+
Degree 4 2
|
34
|
+
Degree 5 3
|
35
|
+
Degree 6 2
|
36
|
+
Degree 7 2
|
37
|
+
Degree 8 3
|
38
|
+
Degree 9 1
|
39
|
+
|
40
|
+
Chain transformations together:
|
41
|
+
|
42
|
+
$ clusta --transform=edges_to_neighborhoods /local/edges.tsv - | clusta --transform=neighborhoods_to_degree_pairs - - | clusta --transform=degree_pairs_to_assortativities - -
|
43
|
+
Assortativity 1 2 1
|
44
|
+
Assortativity 1 3 1
|
45
|
+
Assortativity 2 1 1
|
46
|
+
Assortativity 2 2 4
|
47
|
+
Assortativity 2 3 5
|
48
|
+
Assortativity 3 1 1
|
49
|
+
Assortativity 3 2 5
|
50
|
+
|
51
|
+
And then leverage Wukong when you're ready:
|
52
|
+
|
53
|
+
$ clusta --run=hadoop --transform=edges_to_neighborhoods /hdfs/edges.tsv /hdfs/neighborhoods.tsv
|
54
|
+
I, [2012-03-03T21:00:39.992750 #25835] INFO -- : Launching hadoop!
|
55
|
+
I, [2012-03-03T21:00:39.992979 #25835] INFO -- : Running
|
56
|
+
|
57
|
+
/usr/lib/hadoop/bin/hadoop \
|
58
|
+
jar /usr/lib/hadoop/contrib/streaming/hadoop-*streaming*.jar \
|
59
|
+
-D mapred.job.name='clusta---spec/data/edges/undirected.unweighted.tsv----' \
|
60
|
+
-mapper '/usr/bin/ruby1.9.1 clusta --map --log_interval=10000 --log_seconds=30 --transform=edges_to_degrees' \
|
61
|
+
-reducer '/usr/bin/ruby1.9.1 clusta --reduce --log_interval=10000 --log_seconds=30 --transform=edges_to_degrees' \
|
62
|
+
-input 'spec/data/edges/undirected.unweighted.tsv' \
|
63
|
+
-output '-' \
|
64
|
+
-file '/home/user/projects/networks/clusta/bin/clusta'
|
65
|
+
...
|
66
|
+
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.
|
1
|
+
0.0.2
|
data/bin/clusta
CHANGED
@@ -4,32 +4,5 @@ $: << File.expand_path('../lib', File.dirname(__FILE__)) unless $:.include?(File
|
|
4
4
|
|
5
5
|
require 'clusta'
|
6
6
|
|
7
|
-
|
8
|
-
"usage: #{File.basename(__FILE__)} --transform=TRANSFORM_NAME [ARGS ...]"
|
9
|
-
end
|
7
|
+
Clusta::Runner.new(File.basename(__FILE__), ARGV.dup).run! if $0 == __FILE__
|
10
8
|
|
11
|
-
def extract_transform_arg
|
12
|
-
transform_arg = ARGV.find_all { |arg| arg =~ Clusta::Transforms::ARG_REGEXP }.first
|
13
|
-
if transform_arg.nil?
|
14
|
-
$stderr.puts(usage)
|
15
|
-
exit(1)
|
16
|
-
end
|
17
|
-
# ARGV.delete_if { |arg| arg =~ Clusta::Transforms::ARG_REGEXP }
|
18
|
-
transform_arg
|
19
|
-
end
|
20
|
-
|
21
|
-
def add_default_run_arg
|
22
|
-
ARGV.unshift('--run=local') unless ARGV.detect { |arg| arg =~ /--run/ }
|
23
|
-
end
|
24
|
-
|
25
|
-
if $0 == __FILE__
|
26
|
-
begin
|
27
|
-
add_default_run_arg
|
28
|
-
transform = Clusta::Transforms.from_arg(extract_transform_arg)
|
29
|
-
script = Clusta::Transforms.script_for(transform)
|
30
|
-
script.run
|
31
|
-
rescue Clusta::Error => e
|
32
|
-
$stderr.puts e.message
|
33
|
-
exit(1)
|
34
|
-
end
|
35
|
-
end
|
data/lib/clusta.rb
CHANGED
@@ -14,11 +14,20 @@ module Clusta
|
|
14
14
|
File.basename(path).gsub(/\.rb$/, '')
|
15
15
|
end
|
16
16
|
|
17
|
+
def self.require_path path
|
18
|
+
File.join(File.dirname(path), File.basename(path).gsub(/\.rb$/, ''))
|
19
|
+
end
|
20
|
+
|
17
21
|
Error = Class.new(StandardError)
|
22
|
+
ArgumentError = Class.new(Error)
|
18
23
|
DirectednessMismatchError = Class.new(Error)
|
19
24
|
AmbiguousArgumentsError = Class.new(Error)
|
25
|
+
NotImplementedError = Class.new(Error)
|
26
|
+
SortError = Class.new(Error)
|
20
27
|
|
21
|
-
autoload :Geometry,
|
22
|
-
autoload :Transforms,
|
23
|
-
|
28
|
+
autoload :Geometry, 'clusta/geometry'
|
29
|
+
autoload :Transforms, 'clusta/transforms'
|
30
|
+
autoload :Runner, 'clusta/runner'
|
31
|
+
autoload :Schema, 'clusta/schema'
|
32
|
+
autoload :Serialization, 'clusta/serialization'
|
24
33
|
end
|
data/lib/clusta/geometry.rb
CHANGED
@@ -2,24 +2,69 @@ module Clusta
|
|
2
2
|
|
3
3
|
module Geometry
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
5
|
+
def self.names
|
6
|
+
@names ||= {}
|
7
|
+
end
|
8
8
|
|
9
|
-
def self.
|
10
|
-
|
11
|
-
|
9
|
+
def self.register_element klass, name=nil
|
10
|
+
if name
|
11
|
+
Wukong::RESOURCE_CLASS_MAP[name] = klass
|
12
|
+
else
|
13
|
+
klass.all_stream_names.each do |name|
|
14
|
+
Wukong::RESOURCE_CLASS_MAP[name] = klass
|
15
|
+
end
|
16
|
+
end
|
12
17
|
end
|
13
18
|
|
19
|
+
def self.from_name name
|
20
|
+
begin
|
21
|
+
const_get(Clusta.classify(name))
|
22
|
+
rescue NameError => e
|
23
|
+
raise Error.new("No such transform: '#{name}'")
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
14
27
|
Dir[File.join(File.dirname(__FILE__), "geometry/*.rb")].each do |path|
|
15
28
|
require_name = Clusta.require_name(path)
|
16
|
-
|
29
|
+
autoload Clusta.classify(require_name), "clusta/geometry/#{require_name}"
|
30
|
+
names[require_name] ||= {} unless require_name == 'all'
|
17
31
|
end
|
18
32
|
|
19
33
|
Dir[File.join(File.dirname(__FILE__), "geometry/directed/*.rb")].each do |path|
|
20
34
|
require_name = Clusta.require_name(path)
|
21
|
-
|
35
|
+
autoload ("Directed" + Clusta.classify(require_name)), "clusta/geometry/directed/#{require_name}"
|
36
|
+
names[require_name] ||= {}
|
37
|
+
names[require_name][:directed] = true
|
22
38
|
end
|
23
39
|
|
40
|
+
def self.listing
|
41
|
+
[].tap do |out|
|
42
|
+
out << "Known geometries:"
|
43
|
+
out << ''
|
44
|
+
names.keys.sort.each do |element_name|
|
45
|
+
element = from_name(element_name)
|
46
|
+
if names[element_name][:directed]
|
47
|
+
directed_element = from_name("directed_#{element_name}")
|
48
|
+
else
|
49
|
+
directed_element = nil
|
50
|
+
end
|
51
|
+
|
52
|
+
out << " #{element}"
|
53
|
+
stream_names = element.all_stream_names.sort
|
54
|
+
stream_names.concat(directed_element.all_stream_names.sort) if directed_element
|
55
|
+
out << " streams as: #{stream_names.uniq.join(', ')}"
|
56
|
+
out << ''
|
57
|
+
end
|
58
|
+
end.join("\n")
|
59
|
+
end
|
60
|
+
|
61
|
+
def self.load_from path
|
62
|
+
class_eval(File.read(path), path)
|
63
|
+
require_name = Clusta.require_name(path)
|
64
|
+
names[require_name] ||= {}
|
65
|
+
names[require_name][:directed] = true if require_name =~ /^directed_/
|
66
|
+
end
|
67
|
+
|
68
|
+
|
24
69
|
end
|
25
70
|
end
|
@@ -4,8 +4,8 @@ module Clusta
|
|
4
4
|
|
5
5
|
class Assortativity < Element
|
6
6
|
|
7
|
-
|
8
|
-
|
7
|
+
key :source_degree_value, :type => :int
|
8
|
+
key :target_degree_value, :type => :int
|
9
9
|
field :count, :type => :int
|
10
10
|
|
11
11
|
def directed?
|
@@ -1,9 +1,9 @@
|
|
1
1
|
module Clusta
|
2
2
|
module Geometry
|
3
3
|
|
4
|
-
class
|
5
|
-
|
6
|
-
|
4
|
+
class DegreePair < Element
|
5
|
+
key :source_label
|
6
|
+
key :target_label
|
7
7
|
field :source_degree_value, :type => :int
|
8
8
|
field :target_degree_value, :type => :int
|
9
9
|
|
@@ -1,9 +1,10 @@
|
|
1
1
|
module Clusta
|
2
2
|
module Geometry
|
3
3
|
|
4
|
-
class
|
5
|
-
|
6
|
-
|
4
|
+
class DirectedDegreePair < Element
|
5
|
+
|
6
|
+
key :source_label
|
7
|
+
key :target_label
|
7
8
|
field :source_in_degree_value, :type => :int
|
8
9
|
field :source_out_degree_value, :type => :int
|
9
10
|
field :target_in_degree_value, :type => :int
|
@@ -2,6 +2,8 @@ module Clusta
|
|
2
2
|
module Geometry
|
3
3
|
|
4
4
|
class DirectedEdge < Edge
|
5
|
+
|
6
|
+
abbreviate 'DE'
|
5
7
|
|
6
8
|
def directed?
|
7
9
|
true
|
@@ -15,8 +17,8 @@ module Clusta
|
|
15
17
|
DirectedDegree.new(target_label, 1, 0)
|
16
18
|
end
|
17
19
|
|
18
|
-
def
|
19
|
-
|
20
|
+
def neighbor
|
21
|
+
DirectedNeighbor.new(target_label, weight)
|
20
22
|
end
|
21
23
|
|
22
24
|
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Clusta
|
2
|
+
module Geometry
|
3
|
+
|
4
|
+
class DirectedNeighborhood < Neighborhood
|
5
|
+
|
6
|
+
def directed?
|
7
|
+
true
|
8
|
+
end
|
9
|
+
|
10
|
+
def degree_pairs
|
11
|
+
neighbors.map do |neighbor|
|
12
|
+
# This vertex's in-degree is not known to us; we just have
|
13
|
+
# its out-degree based on the size of this neighborhood.
|
14
|
+
#
|
15
|
+
# We don't know anything about each neighbor's degree other than that
|
16
|
+
# its in-degree is at least 1 b/c it's in this vertex's
|
17
|
+
# neighborhood.
|
18
|
+
DirectedDegreePair.new(label, neighbor.label, 0, size, 1, 0)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
def reversed_degree_pairs
|
23
|
+
neighbors.map do |neighbor|
|
24
|
+
DirectedDegreePair.new(neighbor.label, label, 1, 0, 0, size)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
31
|
+
end
|
data/lib/clusta/geometry/edge.rb
CHANGED
@@ -3,8 +3,10 @@ module Clusta
|
|
3
3
|
|
4
4
|
class Edge < Element
|
5
5
|
|
6
|
-
|
7
|
-
|
6
|
+
abbreviate 'E'
|
7
|
+
|
8
|
+
key :source_label
|
9
|
+
key :target_label
|
8
10
|
field :weight, :optional => true
|
9
11
|
|
10
12
|
def weighted?
|
@@ -48,8 +50,8 @@ module Clusta
|
|
48
50
|
self.class.new(target_label, source_label, weight)
|
49
51
|
end
|
50
52
|
|
51
|
-
def
|
52
|
-
|
53
|
+
def neighbor
|
54
|
+
Neighbor.new(target_label, weight)
|
53
55
|
end
|
54
56
|
|
55
57
|
end
|
@@ -3,129 +3,22 @@ module Clusta
|
|
3
3
|
|
4
4
|
class Element
|
5
5
|
|
6
|
-
|
6
|
+
include Clusta::Schema
|
7
|
+
include Clusta::Serialization
|
7
8
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
subclass.instance_variable_set("@fields", @fields.dup)
|
13
|
-
super
|
14
|
-
end
|
15
|
-
|
16
|
-
def self.field_names
|
17
|
-
@fields.map { |field| field[:name].to_s }
|
18
|
-
end
|
19
|
-
|
20
|
-
def self.has_optional_field?
|
21
|
-
@fields.any? { |field| field[:optional] }
|
22
|
-
end
|
23
|
-
|
24
|
-
def self.optional_field
|
25
|
-
@fields.detect { |field| field[:optional] }
|
26
|
-
end
|
27
|
-
|
28
|
-
def self.from_string string
|
29
|
-
return string unless string.is_a?(String)
|
30
|
-
args = string.split(';')
|
31
|
-
klass_name = args.shift
|
32
|
-
raise ArgumentError.new("Elements instantiated from a string must match the format 'klass;[field1;[field2;]...]'") unless klass_name
|
33
|
-
Wukong.class_from_resource(klass_name).new(*args)
|
34
|
-
end
|
35
|
-
|
36
|
-
def self.field name, options={}
|
37
|
-
raise AmbiguousArgumentsError.new("Cannot define a second optional field #{name} because field #{optional_field[:name]} is already optional.") if has_optional_field?
|
38
|
-
attr_reader name
|
39
|
-
case options[:type]
|
40
|
-
when :int
|
41
|
-
define_method "#{name}=" do |val|
|
42
|
-
instance_variable_set("@#{name}", val.to_i)
|
43
|
-
end
|
44
|
-
when :float
|
45
|
-
define_method "#{name}=" do |val|
|
46
|
-
instance_variable_set("@#{name}", val.to_f)
|
47
|
-
end
|
48
|
-
when :geometry
|
49
|
-
define_method "#{name}=" do |val|
|
50
|
-
instance_variable_set("@#{name}", self.class.from_string(val))
|
51
|
-
end
|
52
|
-
else
|
53
|
-
define_method "#{name}=" do |val|
|
54
|
-
instance_variable_set("@#{name}", val)
|
55
|
-
end
|
56
|
-
end
|
57
|
-
@fields << options.merge(:name => name)
|
58
|
-
end
|
59
|
-
|
60
|
-
def fields
|
61
|
-
self.class.fields
|
62
|
-
end
|
63
|
-
|
64
|
-
def self.input_fields name
|
65
|
-
alias_method name, :input_fields
|
9
|
+
if defined?(Settings) && Settings[:serialize] == 'json'
|
10
|
+
include Clusta::Serialization::JSON
|
11
|
+
else
|
12
|
+
include Clusta::Serialization::TSV
|
66
13
|
end
|
67
14
|
|
68
|
-
def self.
|
69
|
-
|
70
|
-
|
71
|
-
else
|
72
|
-
to_s.split("::").last
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
def stream_name
|
77
|
-
self.class.stream_name
|
78
|
-
end
|
79
|
-
|
80
|
-
def initialize *args
|
81
|
-
self.class.fields.each_with_index do |field, index|
|
82
|
-
suffix = case index.to_s
|
83
|
-
when /1$/ then 'st'
|
84
|
-
when /2$/ then 'nd'
|
85
|
-
when /3$/ then 'rd'
|
86
|
-
else 'th'
|
87
|
-
end
|
88
|
-
case
|
89
|
-
when field[:optional]
|
90
|
-
self.send("#{field[:name]}=", args[index]) if args[index]
|
91
|
-
when args[index].nil?
|
92
|
-
raise ArgumentError.new("A #{self.class} requires a non-nil value for #{field[:name]} as its #{index}#{suffix} argument.")
|
93
|
-
else
|
94
|
-
self.send("#{field[:name]}=", args[index])
|
95
|
-
end
|
96
|
-
end
|
97
|
-
self.set_input_fields(*(args[self.class.fields.size..-1] || []))
|
98
|
-
end
|
99
|
-
|
100
|
-
def set_input_fields *input_fields
|
101
|
-
self.input_fields = input_fields.map do |field|
|
102
|
-
if field =~ /^[A-Z].*;/
|
103
|
-
self.class.from_string(field)
|
104
|
-
else
|
105
|
-
field
|
106
|
-
end
|
107
|
-
end
|
108
|
-
end
|
109
|
-
|
110
|
-
def output_fields
|
111
|
-
input_fields.map(&:to_s)
|
112
|
-
end
|
113
|
-
|
114
|
-
def to_flat
|
115
|
-
[stream_name].tap do |record|
|
116
|
-
fields.each do |field|
|
117
|
-
value = send(field[:name])
|
118
|
-
record << value.to_s unless value.nil? && field[:optional]
|
119
|
-
end
|
120
|
-
end.concat(output_fields)
|
121
|
-
end
|
122
|
-
|
123
|
-
def to_s
|
124
|
-
to_flat.join(';')
|
15
|
+
def self.inherited subclass
|
16
|
+
Clusta::Geometry.register_element(subclass)
|
17
|
+
super
|
125
18
|
end
|
126
19
|
|
127
20
|
end
|
128
|
-
|
21
|
+
|
129
22
|
end
|
130
23
|
end
|
131
24
|
|