clusta 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +66 -0
- data/VERSION +1 -1
- data/bin/clusta +1 -28
- data/lib/clusta.rb +12 -3
- data/lib/clusta/geometry.rb +53 -8
- data/lib/clusta/geometry/all.rb +3 -0
- data/lib/clusta/geometry/assortativity.rb +2 -2
- data/lib/clusta/geometry/degree.rb +3 -1
- data/lib/clusta/geometry/{edge_degree_pair.rb → degree_pair.rb} +3 -3
- data/lib/clusta/geometry/directed/degree.rb +3 -1
- data/lib/clusta/geometry/directed/{edge_degree_pair.rb → degree_pair.rb} +4 -3
- data/lib/clusta/geometry/directed/edge.rb +4 -2
- data/lib/clusta/geometry/directed/{arrow.rb → neighbor.rb} +1 -1
- data/lib/clusta/geometry/directed/neighborhood.rb +31 -0
- data/lib/clusta/geometry/edge.rb +6 -4
- data/lib/clusta/geometry/element.rb +10 -117
- data/lib/clusta/geometry/{arrow.rb → neighbor.rb} +3 -3
- data/lib/clusta/geometry/neighborhood.rb +41 -0
- data/lib/clusta/geometry/vertex.rb +4 -1
- data/lib/clusta/runner.rb +101 -4
- data/lib/clusta/schema.rb +100 -0
- data/lib/clusta/serialization.rb +63 -0
- data/lib/clusta/serialization/json.rb +86 -0
- data/lib/clusta/serialization/tsv.rb +81 -0
- data/lib/clusta/transforms.rb +59 -26
- data/lib/clusta/transforms/{edge_degree_pairs_to_assortativities.rb → degree_pairs_to_assortativities.rb} +7 -3
- data/lib/clusta/transforms/edges_to_degrees.rb +5 -0
- data/lib/clusta/transforms/{edges_to_vertex_arrows.rb → edges_to_neighborhoods.rb} +11 -6
- data/lib/clusta/transforms/import.rb +6 -0
- data/lib/clusta/transforms/neighborhoods_to_degree_pairs.rb +70 -0
- data/lib/clusta/transforms/pm3d.rb +46 -0
- data/lib/clusta/transforms/prune_edges.rb +34 -0
- data/spec/clusta/schema_spec.rb +36 -0
- data/spec/clusta/serialization/json_spec.rb +133 -0
- data/spec/clusta/serialization/tsv_spec.rb +133 -0
- data/spec/clusta/serialization_spec.rb +27 -0
- data/spec/clusta/transforms/degree_pairs_to_assortativities_spec.rb +13 -0
- data/spec/clusta/transforms/{edges_to_vertex_arrows_spec.rb → edges_to_neighborhoods_spec.rb} +5 -5
- data/spec/clusta/transforms/import_spec.rb +9 -0
- data/spec/clusta/transforms/neighborhoods_to_degree_pairs_spec.rb +21 -0
- data/spec/clusta/transforms/prune_edges_spec.rb +22 -0
- data/spec/data/assortativities/directed.tsv +4 -0
- data/spec/data/assortativities/undirected.tsv +7 -0
- data/spec/data/degree_pairs/directed.tsv +10 -0
- data/spec/data/degree_pairs/undirected.tsv +18 -0
- data/spec/data/external/vertices.tsv +9 -0
- data/spec/data/imports/vertices.labeled.tsv +9 -0
- data/spec/data/neighborhoods/directed.unweighted.tsv +7 -0
- data/spec/data/neighborhoods/directed.weighted.tsv +7 -0
- data/spec/data/neighborhoods/undirected.unweighted.tsv +9 -0
- data/spec/data/neighborhoods/undirected.weighted.tsv +9 -0
- data/spec/data/pruned_edges/directed.unweighted.tsv +1 -0
- data/spec/data/pruned_edges/directed.weighted.tsv +3 -0
- data/spec/data/pruned_edges/undirected.unweighted.tsv +1 -0
- data/spec/data/pruned_edges/undirected.weighted.tsv +3 -0
- data/spec/support/transforms_spec_helper.rb +5 -1
- metadata +47 -23
- data/lib/clusta/geometry/directed/vertex_arrows.rb +0 -25
- data/lib/clusta/geometry/vertex_arrows.rb +0 -45
- data/lib/clusta/transforms/vertex_arrows_to_edge_degree_pairs.rb +0 -63
- data/spec/clusta/geometry/element_spec.rb +0 -191
- data/spec/data/vertex_arrows/directed.unweighted.tsv +0 -7
- data/spec/data/vertex_arrows/directed.weighted.tsv +0 -7
- data/spec/data/vertex_arrows/undirected.unweighted.tsv +0 -9
- data/spec/data/vertex_arrows/undirected.weighted.tsv +0 -9
data/README.rdoc
CHANGED
@@ -0,0 +1,66 @@
|
|
1
|
+
= Clusta
|
2
|
+
|
3
|
+
Clusta is a Ruby gem for network analysis built on top of
|
4
|
+
Wukong[http://github.com/mrflip/wukong].
|
5
|
+
|
6
|
+
Wukong lets you write Ruby scripts that run on your laptop as well as
|
7
|
+
on a Hadoop cluster.
|
8
|
+
|
9
|
+
Clusta is:
|
10
|
+
|
11
|
+
- classes that make describing the geometry of networks easy
|
12
|
+
- network algorithms written with these classes to use Wukong
|
13
|
+
- a shim command-line program for running these algorithms
|
14
|
+
|
15
|
+
Start with a file containing edges:
|
16
|
+
|
17
|
+
Edge 1 2
|
18
|
+
Edge 2 3
|
19
|
+
Edge 1 4
|
20
|
+
Edge 4 5
|
21
|
+
Edge 5 6
|
22
|
+
Edge 5 7
|
23
|
+
Edge 6 8
|
24
|
+
Edge 7 8
|
25
|
+
Edge 8 9
|
26
|
+
|
27
|
+
Run this through a transformation named +edges_to_degrees+:
|
28
|
+
|
29
|
+
$ clusta --transform=edges_to_degrees /local/edges.tsv -
|
30
|
+
Degree 1 2
|
31
|
+
Degree 2 2
|
32
|
+
Degree 3 1
|
33
|
+
Degree 4 2
|
34
|
+
Degree 5 3
|
35
|
+
Degree 6 2
|
36
|
+
Degree 7 2
|
37
|
+
Degree 8 3
|
38
|
+
Degree 9 1
|
39
|
+
|
40
|
+
Chain transformations together:
|
41
|
+
|
42
|
+
$ clusta --transform=edges_to_neighborhoods /local/edges.tsv - | clusta --transform=neighborhoods_to_degree_pairs - - | clusta --transform=degree_pairs_to_assortativities - -
|
43
|
+
Assortativity 1 2 1
|
44
|
+
Assortativity 1 3 1
|
45
|
+
Assortativity 2 1 1
|
46
|
+
Assortativity 2 2 4
|
47
|
+
Assortativity 2 3 5
|
48
|
+
Assortativity 3 1 1
|
49
|
+
Assortativity 3 2 5
|
50
|
+
|
51
|
+
And then leverage Wukong when you're ready:
|
52
|
+
|
53
|
+
$ clusta --run=hadoop --transform=edges_to_neighborhoods /hdfs/edges.tsv /hdfs/neighborhoods.tsv
|
54
|
+
I, [2012-03-03T21:00:39.992750 #25835] INFO -- : Launching hadoop!
|
55
|
+
I, [2012-03-03T21:00:39.992979 #25835] INFO -- : Running
|
56
|
+
|
57
|
+
/usr/lib/hadoop/bin/hadoop \
|
58
|
+
jar /usr/lib/hadoop/contrib/streaming/hadoop-*streaming*.jar \
|
59
|
+
-D mapred.job.name='clusta---spec/data/edges/undirected.unweighted.tsv----' \
|
60
|
+
-mapper '/usr/bin/ruby1.9.1 clusta --map --log_interval=10000 --log_seconds=30 --transform=edges_to_degrees' \
|
61
|
+
-reducer '/usr/bin/ruby1.9.1 clusta --reduce --log_interval=10000 --log_seconds=30 --transform=edges_to_degrees' \
|
62
|
+
-input 'spec/data/edges/undirected.unweighted.tsv' \
|
63
|
+
-output '-' \
|
64
|
+
-file '/home/user/projects/networks/clusta/bin/clusta'
|
65
|
+
...
|
66
|
+
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.
|
1
|
+
0.0.2
|
data/bin/clusta
CHANGED
@@ -4,32 +4,5 @@ $: << File.expand_path('../lib', File.dirname(__FILE__)) unless $:.include?(File
|
|
4
4
|
|
5
5
|
require 'clusta'
|
6
6
|
|
7
|
-
|
8
|
-
"usage: #{File.basename(__FILE__)} --transform=TRANSFORM_NAME [ARGS ...]"
|
9
|
-
end
|
7
|
+
Clusta::Runner.new(File.basename(__FILE__), ARGV.dup).run! if $0 == __FILE__
|
10
8
|
|
11
|
-
def extract_transform_arg
|
12
|
-
transform_arg = ARGV.find_all { |arg| arg =~ Clusta::Transforms::ARG_REGEXP }.first
|
13
|
-
if transform_arg.nil?
|
14
|
-
$stderr.puts(usage)
|
15
|
-
exit(1)
|
16
|
-
end
|
17
|
-
# ARGV.delete_if { |arg| arg =~ Clusta::Transforms::ARG_REGEXP }
|
18
|
-
transform_arg
|
19
|
-
end
|
20
|
-
|
21
|
-
def add_default_run_arg
|
22
|
-
ARGV.unshift('--run=local') unless ARGV.detect { |arg| arg =~ /--run/ }
|
23
|
-
end
|
24
|
-
|
25
|
-
if $0 == __FILE__
|
26
|
-
begin
|
27
|
-
add_default_run_arg
|
28
|
-
transform = Clusta::Transforms.from_arg(extract_transform_arg)
|
29
|
-
script = Clusta::Transforms.script_for(transform)
|
30
|
-
script.run
|
31
|
-
rescue Clusta::Error => e
|
32
|
-
$stderr.puts e.message
|
33
|
-
exit(1)
|
34
|
-
end
|
35
|
-
end
|
data/lib/clusta.rb
CHANGED
@@ -14,11 +14,20 @@ module Clusta
|
|
14
14
|
File.basename(path).gsub(/\.rb$/, '')
|
15
15
|
end
|
16
16
|
|
17
|
+
def self.require_path path
|
18
|
+
File.join(File.dirname(path), File.basename(path).gsub(/\.rb$/, ''))
|
19
|
+
end
|
20
|
+
|
17
21
|
Error = Class.new(StandardError)
|
22
|
+
ArgumentError = Class.new(Error)
|
18
23
|
DirectednessMismatchError = Class.new(Error)
|
19
24
|
AmbiguousArgumentsError = Class.new(Error)
|
25
|
+
NotImplementedError = Class.new(Error)
|
26
|
+
SortError = Class.new(Error)
|
20
27
|
|
21
|
-
autoload :Geometry,
|
22
|
-
autoload :Transforms,
|
23
|
-
|
28
|
+
autoload :Geometry, 'clusta/geometry'
|
29
|
+
autoload :Transforms, 'clusta/transforms'
|
30
|
+
autoload :Runner, 'clusta/runner'
|
31
|
+
autoload :Schema, 'clusta/schema'
|
32
|
+
autoload :Serialization, 'clusta/serialization'
|
24
33
|
end
|
data/lib/clusta/geometry.rb
CHANGED
@@ -2,24 +2,69 @@ module Clusta
|
|
2
2
|
|
3
3
|
module Geometry
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
5
|
+
def self.names
|
6
|
+
@names ||= {}
|
7
|
+
end
|
8
8
|
|
9
|
-
def self.
|
10
|
-
|
11
|
-
|
9
|
+
def self.register_element klass, name=nil
|
10
|
+
if name
|
11
|
+
Wukong::RESOURCE_CLASS_MAP[name] = klass
|
12
|
+
else
|
13
|
+
klass.all_stream_names.each do |name|
|
14
|
+
Wukong::RESOURCE_CLASS_MAP[name] = klass
|
15
|
+
end
|
16
|
+
end
|
12
17
|
end
|
13
18
|
|
19
|
+
def self.from_name name
|
20
|
+
begin
|
21
|
+
const_get(Clusta.classify(name))
|
22
|
+
rescue NameError => e
|
23
|
+
raise Error.new("No such transform: '#{name}'")
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
14
27
|
Dir[File.join(File.dirname(__FILE__), "geometry/*.rb")].each do |path|
|
15
28
|
require_name = Clusta.require_name(path)
|
16
|
-
|
29
|
+
autoload Clusta.classify(require_name), "clusta/geometry/#{require_name}"
|
30
|
+
names[require_name] ||= {} unless require_name == 'all'
|
17
31
|
end
|
18
32
|
|
19
33
|
Dir[File.join(File.dirname(__FILE__), "geometry/directed/*.rb")].each do |path|
|
20
34
|
require_name = Clusta.require_name(path)
|
21
|
-
|
35
|
+
autoload ("Directed" + Clusta.classify(require_name)), "clusta/geometry/directed/#{require_name}"
|
36
|
+
names[require_name] ||= {}
|
37
|
+
names[require_name][:directed] = true
|
22
38
|
end
|
23
39
|
|
40
|
+
def self.listing
|
41
|
+
[].tap do |out|
|
42
|
+
out << "Known geometries:"
|
43
|
+
out << ''
|
44
|
+
names.keys.sort.each do |element_name|
|
45
|
+
element = from_name(element_name)
|
46
|
+
if names[element_name][:directed]
|
47
|
+
directed_element = from_name("directed_#{element_name}")
|
48
|
+
else
|
49
|
+
directed_element = nil
|
50
|
+
end
|
51
|
+
|
52
|
+
out << " #{element}"
|
53
|
+
stream_names = element.all_stream_names.sort
|
54
|
+
stream_names.concat(directed_element.all_stream_names.sort) if directed_element
|
55
|
+
out << " streams as: #{stream_names.uniq.join(', ')}"
|
56
|
+
out << ''
|
57
|
+
end
|
58
|
+
end.join("\n")
|
59
|
+
end
|
60
|
+
|
61
|
+
def self.load_from path
|
62
|
+
class_eval(File.read(path), path)
|
63
|
+
require_name = Clusta.require_name(path)
|
64
|
+
names[require_name] ||= {}
|
65
|
+
names[require_name][:directed] = true if require_name =~ /^directed_/
|
66
|
+
end
|
67
|
+
|
68
|
+
|
24
69
|
end
|
25
70
|
end
|
@@ -4,8 +4,8 @@ module Clusta
|
|
4
4
|
|
5
5
|
class Assortativity < Element
|
6
6
|
|
7
|
-
|
8
|
-
|
7
|
+
key :source_degree_value, :type => :int
|
8
|
+
key :target_degree_value, :type => :int
|
9
9
|
field :count, :type => :int
|
10
10
|
|
11
11
|
def directed?
|
@@ -1,9 +1,9 @@
|
|
1
1
|
module Clusta
|
2
2
|
module Geometry
|
3
3
|
|
4
|
-
class
|
5
|
-
|
6
|
-
|
4
|
+
class DegreePair < Element
|
5
|
+
key :source_label
|
6
|
+
key :target_label
|
7
7
|
field :source_degree_value, :type => :int
|
8
8
|
field :target_degree_value, :type => :int
|
9
9
|
|
@@ -1,9 +1,10 @@
|
|
1
1
|
module Clusta
|
2
2
|
module Geometry
|
3
3
|
|
4
|
-
class
|
5
|
-
|
6
|
-
|
4
|
+
class DirectedDegreePair < Element
|
5
|
+
|
6
|
+
key :source_label
|
7
|
+
key :target_label
|
7
8
|
field :source_in_degree_value, :type => :int
|
8
9
|
field :source_out_degree_value, :type => :int
|
9
10
|
field :target_in_degree_value, :type => :int
|
@@ -2,6 +2,8 @@ module Clusta
|
|
2
2
|
module Geometry
|
3
3
|
|
4
4
|
class DirectedEdge < Edge
|
5
|
+
|
6
|
+
abbreviate 'DE'
|
5
7
|
|
6
8
|
def directed?
|
7
9
|
true
|
@@ -15,8 +17,8 @@ module Clusta
|
|
15
17
|
DirectedDegree.new(target_label, 1, 0)
|
16
18
|
end
|
17
19
|
|
18
|
-
def
|
19
|
-
|
20
|
+
def neighbor
|
21
|
+
DirectedNeighbor.new(target_label, weight)
|
20
22
|
end
|
21
23
|
|
22
24
|
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Clusta
|
2
|
+
module Geometry
|
3
|
+
|
4
|
+
class DirectedNeighborhood < Neighborhood
|
5
|
+
|
6
|
+
def directed?
|
7
|
+
true
|
8
|
+
end
|
9
|
+
|
10
|
+
def degree_pairs
|
11
|
+
neighbors.map do |neighbor|
|
12
|
+
# This vertex's in-degree is not known to us; we just have
|
13
|
+
# its out-degree based on the size of this neighborhood.
|
14
|
+
#
|
15
|
+
# We know nothing about each neighbor's degree except that
|
16
|
+
# its in-degree is at least 1 b/c it's in this vertex's
|
17
|
+
# neighborhood.
|
18
|
+
DirectedDegreePair.new(label, neighbor.label, 0, size, 1, 0)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
def reversed_degree_pairs
|
23
|
+
neighbors.map do |neighbor|
|
24
|
+
DirectedDegreePair.new(neighbor.label, label, 1, 0, 0, size)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
31
|
+
end
|
data/lib/clusta/geometry/edge.rb
CHANGED
@@ -3,8 +3,10 @@ module Clusta
|
|
3
3
|
|
4
4
|
class Edge < Element
|
5
5
|
|
6
|
-
|
7
|
-
|
6
|
+
abbreviate 'E'
|
7
|
+
|
8
|
+
key :source_label
|
9
|
+
key :target_label
|
8
10
|
field :weight, :optional => true
|
9
11
|
|
10
12
|
def weighted?
|
@@ -48,8 +50,8 @@ module Clusta
|
|
48
50
|
self.class.new(target_label, source_label, weight)
|
49
51
|
end
|
50
52
|
|
51
|
-
def
|
52
|
-
|
53
|
+
def neighbor
|
54
|
+
Neighbor.new(target_label, weight)
|
53
55
|
end
|
54
56
|
|
55
57
|
end
|
@@ -3,129 +3,22 @@ module Clusta
|
|
3
3
|
|
4
4
|
class Element
|
5
5
|
|
6
|
-
|
6
|
+
include Clusta::Schema
|
7
|
+
include Clusta::Serialization
|
7
8
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
subclass.instance_variable_set("@fields", @fields.dup)
|
13
|
-
super
|
14
|
-
end
|
15
|
-
|
16
|
-
def self.field_names
|
17
|
-
@fields.map { |field| field[:name].to_s }
|
18
|
-
end
|
19
|
-
|
20
|
-
def self.has_optional_field?
|
21
|
-
@fields.any? { |field| field[:optional] }
|
22
|
-
end
|
23
|
-
|
24
|
-
def self.optional_field
|
25
|
-
@fields.detect { |field| field[:optional] }
|
26
|
-
end
|
27
|
-
|
28
|
-
def self.from_string string
|
29
|
-
return string unless string.is_a?(String)
|
30
|
-
args = string.split(';')
|
31
|
-
klass_name = args.shift
|
32
|
-
raise ArgumentError.new("Elements instantiated from a string must match the format 'klass;[field1;[field2;]...]'") unless klass_name
|
33
|
-
Wukong.class_from_resource(klass_name).new(*args)
|
34
|
-
end
|
35
|
-
|
36
|
-
def self.field name, options={}
|
37
|
-
raise AmbiguousArgumentsError.new("Cannot define a second optional field #{name} because field #{optional_field[:name]} is already optional.") if has_optional_field?
|
38
|
-
attr_reader name
|
39
|
-
case options[:type]
|
40
|
-
when :int
|
41
|
-
define_method "#{name}=" do |val|
|
42
|
-
instance_variable_set("@#{name}", val.to_i)
|
43
|
-
end
|
44
|
-
when :float
|
45
|
-
define_method "#{name}=" do |val|
|
46
|
-
instance_variable_set("@#{name}", val.to_f)
|
47
|
-
end
|
48
|
-
when :geometry
|
49
|
-
define_method "#{name}=" do |val|
|
50
|
-
instance_variable_set("@#{name}", self.class.from_string(val))
|
51
|
-
end
|
52
|
-
else
|
53
|
-
define_method "#{name}=" do |val|
|
54
|
-
instance_variable_set("@#{name}", val)
|
55
|
-
end
|
56
|
-
end
|
57
|
-
@fields << options.merge(:name => name)
|
58
|
-
end
|
59
|
-
|
60
|
-
def fields
|
61
|
-
self.class.fields
|
62
|
-
end
|
63
|
-
|
64
|
-
def self.input_fields name
|
65
|
-
alias_method name, :input_fields
|
9
|
+
if defined?(Settings) && Settings[:serialize] == 'json'
|
10
|
+
include Clusta::Serialization::JSON
|
11
|
+
else
|
12
|
+
include Clusta::Serialization::TSV
|
66
13
|
end
|
67
14
|
|
68
|
-
def self.
|
69
|
-
|
70
|
-
|
71
|
-
else
|
72
|
-
to_s.split("::").last
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
def stream_name
|
77
|
-
self.class.stream_name
|
78
|
-
end
|
79
|
-
|
80
|
-
def initialize *args
|
81
|
-
self.class.fields.each_with_index do |field, index|
|
82
|
-
suffix = case index.to_s
|
83
|
-
when /1$/ then 'st'
|
84
|
-
when /2$/ then 'nd'
|
85
|
-
when /3$/ then 'rd'
|
86
|
-
else 'th'
|
87
|
-
end
|
88
|
-
case
|
89
|
-
when field[:optional]
|
90
|
-
self.send("#{field[:name]}=", args[index]) if args[index]
|
91
|
-
when args[index].nil?
|
92
|
-
raise ArgumentError.new("A #{self.class} requires a non-nil value for #{field[:name]} as its #{index}#{suffix} argument.")
|
93
|
-
else
|
94
|
-
self.send("#{field[:name]}=", args[index])
|
95
|
-
end
|
96
|
-
end
|
97
|
-
self.set_input_fields(*(args[self.class.fields.size..-1] || []))
|
98
|
-
end
|
99
|
-
|
100
|
-
def set_input_fields *input_fields
|
101
|
-
self.input_fields = input_fields.map do |field|
|
102
|
-
if field =~ /^[A-Z].*;/
|
103
|
-
self.class.from_string(field)
|
104
|
-
else
|
105
|
-
field
|
106
|
-
end
|
107
|
-
end
|
108
|
-
end
|
109
|
-
|
110
|
-
def output_fields
|
111
|
-
input_fields.map(&:to_s)
|
112
|
-
end
|
113
|
-
|
114
|
-
def to_flat
|
115
|
-
[stream_name].tap do |record|
|
116
|
-
fields.each do |field|
|
117
|
-
value = send(field[:name])
|
118
|
-
record << value.to_s unless value.nil? && field[:optional]
|
119
|
-
end
|
120
|
-
end.concat(output_fields)
|
121
|
-
end
|
122
|
-
|
123
|
-
def to_s
|
124
|
-
to_flat.join(';')
|
15
|
+
def self.inherited subclass
|
16
|
+
Clusta::Geometry.register_element(subclass)
|
17
|
+
super
|
125
18
|
end
|
126
19
|
|
127
20
|
end
|
128
|
-
|
21
|
+
|
129
22
|
end
|
130
23
|
end
|
131
24
|
|