clusta 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +66 -0
- data/VERSION +1 -1
- data/bin/clusta +1 -28
- data/lib/clusta.rb +12 -3
- data/lib/clusta/geometry.rb +53 -8
- data/lib/clusta/geometry/all.rb +3 -0
- data/lib/clusta/geometry/assortativity.rb +2 -2
- data/lib/clusta/geometry/degree.rb +3 -1
- data/lib/clusta/geometry/{edge_degree_pair.rb → degree_pair.rb} +3 -3
- data/lib/clusta/geometry/directed/degree.rb +3 -1
- data/lib/clusta/geometry/directed/{edge_degree_pair.rb → degree_pair.rb} +4 -3
- data/lib/clusta/geometry/directed/edge.rb +4 -2
- data/lib/clusta/geometry/directed/{arrow.rb → neighbor.rb} +1 -1
- data/lib/clusta/geometry/directed/neighborhood.rb +31 -0
- data/lib/clusta/geometry/edge.rb +6 -4
- data/lib/clusta/geometry/element.rb +10 -117
- data/lib/clusta/geometry/{arrow.rb → neighbor.rb} +3 -3
- data/lib/clusta/geometry/neighborhood.rb +41 -0
- data/lib/clusta/geometry/vertex.rb +4 -1
- data/lib/clusta/runner.rb +101 -4
- data/lib/clusta/schema.rb +100 -0
- data/lib/clusta/serialization.rb +63 -0
- data/lib/clusta/serialization/json.rb +86 -0
- data/lib/clusta/serialization/tsv.rb +81 -0
- data/lib/clusta/transforms.rb +59 -26
- data/lib/clusta/transforms/{edge_degree_pairs_to_assortativities.rb → degree_pairs_to_assortativities.rb} +7 -3
- data/lib/clusta/transforms/edges_to_degrees.rb +5 -0
- data/lib/clusta/transforms/{edges_to_vertex_arrows.rb → edges_to_neighborhoods.rb} +11 -6
- data/lib/clusta/transforms/import.rb +6 -0
- data/lib/clusta/transforms/neighborhoods_to_degree_pairs.rb +70 -0
- data/lib/clusta/transforms/pm3d.rb +46 -0
- data/lib/clusta/transforms/prune_edges.rb +34 -0
- data/spec/clusta/schema_spec.rb +36 -0
- data/spec/clusta/serialization/json_spec.rb +133 -0
- data/spec/clusta/serialization/tsv_spec.rb +133 -0
- data/spec/clusta/serialization_spec.rb +27 -0
- data/spec/clusta/transforms/degree_pairs_to_assortativities_spec.rb +13 -0
- data/spec/clusta/transforms/{edges_to_vertex_arrows_spec.rb → edges_to_neighborhoods_spec.rb} +5 -5
- data/spec/clusta/transforms/import_spec.rb +9 -0
- data/spec/clusta/transforms/neighborhoods_to_degree_pairs_spec.rb +21 -0
- data/spec/clusta/transforms/prune_edges_spec.rb +22 -0
- data/spec/data/assortativities/directed.tsv +4 -0
- data/spec/data/assortativities/undirected.tsv +7 -0
- data/spec/data/degree_pairs/directed.tsv +10 -0
- data/spec/data/degree_pairs/undirected.tsv +18 -0
- data/spec/data/external/vertices.tsv +9 -0
- data/spec/data/imports/vertices.labeled.tsv +9 -0
- data/spec/data/neighborhoods/directed.unweighted.tsv +7 -0
- data/spec/data/neighborhoods/directed.weighted.tsv +7 -0
- data/spec/data/neighborhoods/undirected.unweighted.tsv +9 -0
- data/spec/data/neighborhoods/undirected.weighted.tsv +9 -0
- data/spec/data/pruned_edges/directed.unweighted.tsv +1 -0
- data/spec/data/pruned_edges/directed.weighted.tsv +3 -0
- data/spec/data/pruned_edges/undirected.unweighted.tsv +1 -0
- data/spec/data/pruned_edges/undirected.weighted.tsv +3 -0
- data/spec/support/transforms_spec_helper.rb +5 -1
- metadata +47 -23
- data/lib/clusta/geometry/directed/vertex_arrows.rb +0 -25
- data/lib/clusta/geometry/vertex_arrows.rb +0 -45
- data/lib/clusta/transforms/vertex_arrows_to_edge_degree_pairs.rb +0 -63
- data/spec/clusta/geometry/element_spec.rb +0 -191
- data/spec/data/vertex_arrows/directed.unweighted.tsv +0 -7
- data/spec/data/vertex_arrows/directed.weighted.tsv +0 -7
- data/spec/data/vertex_arrows/undirected.unweighted.tsv +0 -9
- data/spec/data/vertex_arrows/undirected.weighted.tsv +0 -9
@@ -0,0 +1,41 @@
|
|
1
|
+
module Clusta
  module Geometry

    # A vertex together with the vertices adjacent to it, for undirected
    # graphs (#directed? is always false here).
    #
    # The adjacent vertices are reachable through #neighbors, which is
    # declared with the +extra_inputs+ class macro.
    # NOTE(review): +label+, the macro and DegreePair are presumably
    # provided by Vertex / the rest of Clusta::Geometry -- they are not
    # defined in this file.
    class Neighborhood < Vertex

      extra_inputs :neighbors

      # Finds the first neighbor whose label equals +label+.
      #
      # Returns that neighbor (truthy) or nil when no neighbor matches.
      def joins? label
        neighbors.find { |candidate| candidate.label == label }
      end

      # A plain Neighborhood never describes a directed graph.
      def directed?
        false
      end

      # Number of neighbors.
      def size
        neighbors.size
      end

      # Builds one DegreePair per neighbor, oriented from this vertex to
      # the neighbor.
      #
      # This vertex's degree is the size of the neighborhood.  Nothing is
      # known about a neighbor's degree except that it is at least 1
      # (it is adjacent to this vertex), so 1 is used as a placeholder.
      def degree_pairs
        neighbors.map { |neighbor| DegreePair.new(label, neighbor.label, size, 1) }
      end

      # Same pairs as #degree_pairs but oriented from each neighbor back
      # to this vertex, with the degrees swapped accordingly.
      def reversed_degree_pairs
        neighbors.map { |neighbor| DegreePair.new(neighbor.label, label, 1, size) }
      end

    end

  end
end
|
data/lib/clusta/runner.rb
CHANGED
@@ -1,15 +1,112 @@
|
|
1
|
+
require 'configliere'
|
2
|
+
|
3
|
+
# Command-line options understood by the clusta runner.
Settings.use :commandline

Settings.define :transform,
                :description => "The name of the transformation to run.",
                :required    => false
Settings.define :list_transforms,
                :description => "List known transformations.",
                :required    => false,
                :type        => :boolean,
                :default     => false
# Runner#run! branches on Settings[:list_geometry]; define it alongside
# :list_transforms so it is documented and typed the same way.
Settings.define :list_geometry,
                :description => "List known geometry.",
                :required    => false,
                :type        => :boolean,
                :default     => false
Settings.define :class_names,
                :description => "The output format for class names, one of: 'long', 'medium', or 'short'.",
                :required    => true,
                :default     => 'medium'
Settings.define :serialize,
                :description => "The serialization format for data, one of: 'json' or 'tsv'.",
                :required    => true,
                :default     => 'tsv'
Settings.define :transforms_path,
                :description => "A colon-separated list of directories to require transform definitions.",
                :required    => false,
                :default     => ''
Settings.define :geometry_path,
                :description => "A colon-separated list of directories to require geometry definitions.",
                :required    => false,
                :default     => ''
|
11
|
+
|
1
12
|
module Clusta

  # Command-line driver: inspects the resolved Settings and either lists
  # the known transforms/geometry, runs a named transform, runs an ad-hoc
  # map/reduce command, or prints help.
  class Runner

    # Matches a `--run=<something>` command-line argument.
    RUN_ARG_REGEXP = /--run=./

    # name - the program name.
    # argv - the argument list that is copied into ::ARGV before a
    #        script is run.
    def initialize name, argv
      @name = name
      @argv = argv
    end

    # Resolves Settings and dispatches on them.
    #
    # Any Clusta::Error raised along the way is reported on $stderr and
    # terminates the process with exit status 1.
    def run!
      Settings.resolve!
      case
      when Settings[:list_transforms]
        load_transforms!
        list_transforms!
      when Settings[:list_geometry]
        load_geometry!
        list_geometry!
      when Settings[:transform]
        load_transforms!
        load_geometry!
        run_transform!
      when Settings[:map_command] || Settings[:reduce_command]
        run_map_reduce!
      else
        print_help!
      end
    rescue Clusta::Error => e
      $stderr.puts "ERROR: #{e.message}"
      exit(1)
    end

    # Loads every Ruby file found in the directories of
    # Settings[:transforms_path] as transform definitions.
    def load_transforms!
      rb_files_within(:transforms_path) do |path|
        Clusta::Transforms.load_from(path)
      end
    end

    # Loads every Ruby file found in the directories of
    # Settings[:geometry_path] as geometry definitions.
    def load_geometry!
      rb_files_within(:geometry_path) do |path|
        Clusta::Geometry.load_from(path)
      end
    end

    # Yields the path of each `*.rb` file directly inside each directory
    # of the colon-separated list stored in Settings[key].
    #
    # Entries that are not directories produce a warning on $stderr and
    # are skipped.  Returns nil when the setting is nil or empty.
    def rb_files_within key, &block
      return if Settings[key].nil? || Settings[key].empty?
      Settings[key].split(':').each do |dir|
        # An empty entry (leading or doubled colon) would expand to the
        # current working directory and silently load whatever .rb files
        # happen to live there.
        next if dir.empty?
        expanded = File.expand_path(dir)
        unless File.directory?(expanded)
          $stderr.puts("WARNING: #{expanded} is not a directory")
          next
        end
        # Sort so the load order does not depend on the filesystem.
        Dir[File.join(expanded, '*.rb')].sort.each do |path|
          yield path
        end
      end
    end

    # Prints the listing of known transforms.
    def list_transforms!
      puts Clusta::Transforms.listing
    end

    # Prints the listing of known geometry.
    def list_geometry!
      puts Clusta::Geometry.listing
    end

    # Looks up the transform named by Settings[:transform] and runs the
    # script built for it.
    def run_transform!
      transform = Clusta::Transforms.from_name(Settings[:transform])
      install_argv!
      Clusta::Transforms.script_for(transform).run
    end

    # Runs a bare Wukong script driven purely by the command line.
    def run_map_reduce!
      install_argv!
      bare_wukong_script.run
    end

    # Runs a bare Wukong script without touching ::ARGV.
    def print_help!
      bare_wukong_script.run
    end

    private

    # Replaces ::ARGV with the arguments given to this runner, adding
    # `--run=local` unless some `--run=` argument is already present.
    def install_argv!
      ::ARGV.replace(@argv)
      ::ARGV.push('--run=local') unless ::ARGV.any? { |arg| arg =~ self.class::RUN_ARG_REGEXP }
    end

    # Builds a Wukong::Script with no mapper or reducer, converting a
    # RuntimeError raised during construction into a Clusta Error so
    # that #run! reports it cleanly.
    def bare_wukong_script
      Wukong::Script.new(nil, nil)
    rescue RuntimeError => e
      raise Error.new(e.message)
    end

  end

end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
module Clusta

  # Mixin that lets a class declare an ordered list of fields (key
  # fields first, then non-key fields) through the +key+ and +field+
  # class macros.  Each declared field gets a reader and a writer, and
  # its declaration is recorded in the class-level +fields+ array.
  #
  # Instances additionally carry a list of "extra inputs".
  module Schema

    attr_writer :extra_inputs

    # Extra inputs attached to this instance (defaults to an empty array).
    def extra_inputs
      @extra_inputs ||= []
    end

    # The extra inputs, each converted with #to_s.
    def extra_outputs
      extra_inputs.map { |input| input.to_s }
    end

    # All field declarations of this instance's class.
    def fields
      self.class.fields
    end

    # The key field declarations of this instance's class.
    def keys
      self.class.keys
    end

    # The non-key field declarations of this instance's class.
    def non_key_fields
      self.class.non_key_fields
    end

    # Hook: gives the including class the ClassMethods macros, a
    # class-level +fields+ reader and an empty field list.
    def self.included klass
      klass.extend(ClassMethods)
      class << klass
        attr_reader :fields
      end
      klass.instance_variable_set(:@fields, [])
    end

    module ClassMethods

      # Declares +name+ as an instance-level alias for #extra_inputs.
      def extra_inputs name
        alias_method(name, :extra_inputs)
      end

      # Hook: a subclass starts with its own copy of the parent's field list.
      def inherited(subclass)
        subclass.instance_variable_set(:@fields, @fields.dup)
        super
      end

      # Names of all declared fields, as strings, in declaration order.
      def field_names
        @fields.map { |declaration| declaration[:name].to_s }
      end

      # True when some declared field is optional.
      def has_optional_field?
        @fields.any? { |declaration| declaration[:optional] }
      end

      # True when some declared field is not a key.
      def has_non_key_field?
        !@fields.all? { |declaration| declaration[:key] }
      end

      # The first optional field declaration, or nil.
      def optional_field
        @fields.find { |declaration| declaration[:optional] }
      end

      # Declarations of the key fields.
      def keys
        @fields.select { |declaration| declaration[:key] }
      end

      # Declarations of the non-key fields.
      def non_key_fields
        @fields.reject { |declaration| declaration[:key] }
      end

      # Declares a field called +name+.
      #
      # options - a Hash; the entries read here are :key, :optional and
      #           :type (:int and :float make the writer coerce with
      #           #to_i / #to_f; any other type stores the value as is).
      #
      # Raises AmbiguousArgumentsError when an optional field has already
      # been declared, and SortError when the key/non-key ordering rules
      # are violated or a key is given type :geometry.
      # Returns the updated list of field declarations.
      def field name, options={}
        if has_optional_field?
          raise AmbiguousArgumentsError.new("Cannot define a second optional field #{name} because field #{optional_field[:name]} is already optional.")
        end

        is_key = options[:key]
        if @fields.empty? && !is_key
          raise SortError.new("The first field defined must be a key that can be sorted on.")
        end
        if is_key && has_non_key_field?
          raise SortError.new("Cannot define a key field #{name} because some non-key fields have already been defined.")
        end
        if is_key && options[:type] == :geometry
          raise SortError.new("Key fields (#{name}) cannot have type :geometry")
        end

        attr_reader name

        coerce = case options[:type]
                 when :int   then lambda { |val| val.to_i }
                 when :float then lambda { |val| val.to_f }
                 else             lambda { |val| val }
                 end
        ivar = "@#{name}"
        define_method("#{name}=") do |val|
          instance_variable_set(ivar, coerce.call(val))
        end

        @fields << options.merge(:name => name)
      end

      # Declares a key field; shorthand for +field+ with :key => true.
      def key name, options={}
        field name, options.merge(:key => true)
      end

    end

  end
end
|
100
|
+
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module Clusta

  # Defines methods that allow a class to (de)serialize itself in a
  # way compatible with Wukong.
  #
  # Including classes get a class-level stream name (and optional
  # abbreviation) plus an initializer that forwards all of its arguments
  # to #process_args, which format-specific mixins are expected to
  # override.
  module Serialization

    autoload :TSV,  'clusta/serialization/tsv'
    autoload :JSON, 'clusta/serialization/json'

    # Hook: adds the ClassMethods to the including class.
    def self.included klass
      klass.extend(ClassMethods)
    end

    # The stream name of this instance's class.
    def stream_name
      self.class.stream_name
    end

    # Hands every constructor argument to #process_args.
    def initialize *args
      process_args(*args)
    end

    # Default no-op; accepts and ignores any arguments.
    def process_args *args
    end

    module ClassMethods

      # The abbreviation set with #abbreviate, or nil.
      attr_reader :abbreviation

      # Registers this class with Geometry under +string+ and uses
      # +string+ as its stream name from now on.
      def set_stream_name string
        Geometry.register_element self, string
        @stream_name = string
      end

      # Registers this class with Geometry under +string+ and remembers
      # +string+ as its abbreviation.
      def abbreviate string
        Geometry.register_element self, string
        @abbreviation = string
      end

      # Every name this class may appear under: the stream name, the
      # abbreviation (when set), the full class name and, for named
      # classes, the last component of the class name.
      def all_stream_names
        names = [stream_name]
        names << abbreviation if abbreviation
        names << to_s
        names << to_s.split('::').last if respond_to?(:name) && name
        names
      end

      # The name used for this class in serialized records.
      #
      # An explicitly set name wins.  Otherwise the name is derived once
      # (and cached) from Settings[:class_names] when Settings is defined:
      # 'short' uses the abbreviation if there is one, 'long' uses the
      # full class name, and anything else uses the last component of the
      # class name.
      def stream_name
        @stream_name ||=
          if defined?(Settings) && Settings[:class_names].to_s == 'short' && abbreviation
            abbreviation
          elsif defined?(Settings) && Settings[:class_names].to_s == 'long'
            to_s
          else
            to_s.split("::").last
          end
      end
    end

  end
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
module Clusta
  module Serialization

    # JSON flavour of record (de)serialization.
    #
    # A flat record is laid out as: the stream name, one entry per key
    # field, a single JSON object holding the non-key fields (omitted
    # when there are none to write), then any extra outputs.
    #
    # The including class is expected to provide +fields+, +keys+,
    # +non_key_fields+, +stream_name+, +extra_outputs+ and
    # +extra_inputs=+ (e.g. via Clusta::Schema and Clusta::Serialization).
    module JSON

      # Hash of every field name to its value; values responding to
      # #to_hash are converted with it.
      def to_hash
        fields.inject({}) do |json, field|
          value = send(field[:name])
          value = value.to_hash if value.respond_to?(:to_hash)
          json[field[:name]] = value
          json
        end
      end

      # The flat record (an Array) described in the module comment.
      def to_flat
        record = [stream_name]
        keys.each { |key| record << send(key[:name]) }
        data = non_key_field_data
        record << data.to_json unless data.empty?
        record.concat(extra_outputs)
      end

      # Hash of the non-key fields; an optional field whose value is nil
      # is left out.
      def non_key_field_data
        {}.tap do |data|
          non_key_fields.each do |field|
            value = send(field[:name])
            value = value.to_hash if value.respond_to?(:to_hash)
            data[field[:name]] = value unless value.nil? && field[:optional]
          end
        end
      end

      # Populates this instance from positional arguments laid out like
      # #to_flat without the leading stream name: key values, then the
      # non-key data (a JSON string or a Hash), then extra inputs.
      #
      # A Hash may use String or Symbol keys, so the output of
      # #non_key_field_data can be fed straight back in.
      #
      # Raises ArgumentError when a required non-key field is missing.
      def process_args *args
        key_fields = self.class.keys
        key_fields.each_with_index do |key, index|
          send("#{key[:name]}=", args[index])
        end
        json_index = key_fields.size

        raw  = args[json_index]
        data = if !raw
                 {}
               elsif raw.is_a?(Hash)
                 # Normalize into a new Hash (the caller's is untouched)
                 # so lookups by field name below work for Symbol keys too.
                 raw.inject({}) { |normalized, (k, v)| normalized[k.to_s] = v; normalized }
               else
                 ::JSON.parse(raw)
               end

        non_key_fields.each do |field|
          name = field[:name].to_s
          if data.has_key?(name)
            send("#{name}=", data[name])
          elsif !field[:optional]
            raise ArgumentError.new("A #{self.class} requires a non-nil value for #{name}.")
          end
        end

        self.extra_inputs = (args[(json_index + 1)..-1] || [])
      end

      # Hook: adds the ClassMethods to the including class.
      def self.included klass
        klass.extend(ClassMethods)
      end

      module ClassMethods

        # Placeholder with no implementation yet; always returns nil.
        def from_json_component data

        end

      end

    end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
module Clusta
  module Serialization

    # TSV flavour of record (de)serialization.
    #
    # A flat record is laid out as: the stream name, one entry per field
    # (in declaration order), then any extra outputs.  A nested element
    # is embedded in a single column as its own flat record joined with
    # semicolons (see #to_tsv_component).
    #
    # The including class is expected to provide +fields+ (instance and
    # class level) and +stream_name+.
    module TSV

      # The flat record (an Array) described in the module comment.  An
      # optional field whose value is nil is left out.
      def to_flat
        record = [stream_name]
        fields.each do |field|
          value = send(field[:name])
          # Skip before converting so an unset optional :geometry field
          # does not call #to_tsv_component on nil.
          next if value.nil? && field[:optional]
          value = value.to_tsv_component if field[:type] == :geometry
          record << value.to_s
        end
        record.concat(extra_outputs)
      end

      # English ordinal suffix ('st', 'nd', 'rd' or 'th') for +index+.
      def suffix index
        case index.to_s
        when /1[123]$/ then 'th' # 11th, 12th, 13th, 111th, ...
        when /1$/      then 'st'
        when /2$/      then 'nd'
        when /3$/      then 'rd'
        else                'th'
        end
      end

      # Populates this instance from positional arguments: one per field
      # in declaration order, followed by extra inputs.
      #
      # Required fields of type :geometry are built from their component
      # string; an optional field is assigned only when a value is given.
      #
      # Raises ArgumentError when a required field's argument is nil.
      def process_args *args
        self.class.fields.each_with_index do |field, index|
          value  = args[index]
          setter = "#{field[:name]}="
          if field[:optional]
            send(setter, value) if value
          elsif value.nil?
            position = index + 1 # report "1st", not "0th"
            raise ArgumentError.new("A #{self.class} requires a non-nil value for #{field[:name]} as its #{position}#{suffix(position)} argument.")
          elsif field[:type] == :geometry
            send(setter, self.class.from_tsv_component_string(value))
          else
            send(setter, value)
          end
        end
        self.extra_inputs = (args[self.class.fields.size..-1] || [])
      end

      # Stores the extra inputs.  Strings that look like a component
      # string (capitalized class name followed by a semicolon) are
      # turned into elements; everything else is kept as given.
      def extra_inputs= inputs
        @extra_inputs = inputs.map do |input|
          # Only Strings can be component strings; Object#=~ is gone in
          # Ruby 3.2+, so do not match against other objects.
          if input.is_a?(String) && input =~ /^[A-Z].*;/
            self.class.from_tsv_component_string(input)
          else
            input
          end
        end
      end

      # The extra inputs, with elements converted back to component
      # strings.  Empty when no extra inputs have been assigned.
      def extra_outputs
        (@extra_inputs || []).map do |input|
          if input.respond_to?(:to_tsv_component)
            input.to_tsv_component
          else
            input
          end
        end
      end

      # Hook: adds the ClassMethods to the including class.
      def self.included klass
        klass.extend(ClassMethods)
      end

      # This record as a single semicolon-joined column value.
      def to_tsv_component
        to_flat.join(";")
      end

      module ClassMethods

        # Builds an element from a 'klass;field1;field2;...' string by
        # resolving the class name through Wukong.class_from_resource.
        # Non-String arguments are returned unchanged.
        #
        # Raises ArgumentError when the string has no class name.
        def from_tsv_component_string string
          return string unless string.is_a?(String)
          args       = string.split(';')
          klass_name = args.shift
          raise ArgumentError.new("Elements instantiated from a TSV component string must match the format 'klass;[field1;[field2;]...]'") unless klass_name
          Wukong.class_from_resource(klass_name).new(*args)
        end
      end
    end
  end
end
|