clusta 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +66 -0
- data/VERSION +1 -1
- data/bin/clusta +1 -28
- data/lib/clusta.rb +12 -3
- data/lib/clusta/geometry.rb +53 -8
- data/lib/clusta/geometry/all.rb +3 -0
- data/lib/clusta/geometry/assortativity.rb +2 -2
- data/lib/clusta/geometry/degree.rb +3 -1
- data/lib/clusta/geometry/{edge_degree_pair.rb → degree_pair.rb} +3 -3
- data/lib/clusta/geometry/directed/degree.rb +3 -1
- data/lib/clusta/geometry/directed/{edge_degree_pair.rb → degree_pair.rb} +4 -3
- data/lib/clusta/geometry/directed/edge.rb +4 -2
- data/lib/clusta/geometry/directed/{arrow.rb → neighbor.rb} +1 -1
- data/lib/clusta/geometry/directed/neighborhood.rb +31 -0
- data/lib/clusta/geometry/edge.rb +6 -4
- data/lib/clusta/geometry/element.rb +10 -117
- data/lib/clusta/geometry/{arrow.rb → neighbor.rb} +3 -3
- data/lib/clusta/geometry/neighborhood.rb +41 -0
- data/lib/clusta/geometry/vertex.rb +4 -1
- data/lib/clusta/runner.rb +101 -4
- data/lib/clusta/schema.rb +100 -0
- data/lib/clusta/serialization.rb +63 -0
- data/lib/clusta/serialization/json.rb +86 -0
- data/lib/clusta/serialization/tsv.rb +81 -0
- data/lib/clusta/transforms.rb +59 -26
- data/lib/clusta/transforms/{edge_degree_pairs_to_assortativities.rb → degree_pairs_to_assortativities.rb} +7 -3
- data/lib/clusta/transforms/edges_to_degrees.rb +5 -0
- data/lib/clusta/transforms/{edges_to_vertex_arrows.rb → edges_to_neighborhoods.rb} +11 -6
- data/lib/clusta/transforms/import.rb +6 -0
- data/lib/clusta/transforms/neighborhoods_to_degree_pairs.rb +70 -0
- data/lib/clusta/transforms/pm3d.rb +46 -0
- data/lib/clusta/transforms/prune_edges.rb +34 -0
- data/spec/clusta/schema_spec.rb +36 -0
- data/spec/clusta/serialization/json_spec.rb +133 -0
- data/spec/clusta/serialization/tsv_spec.rb +133 -0
- data/spec/clusta/serialization_spec.rb +27 -0
- data/spec/clusta/transforms/degree_pairs_to_assortativities_spec.rb +13 -0
- data/spec/clusta/transforms/{edges_to_vertex_arrows_spec.rb → edges_to_neighborhoods_spec.rb} +5 -5
- data/spec/clusta/transforms/import_spec.rb +9 -0
- data/spec/clusta/transforms/neighborhoods_to_degree_pairs_spec.rb +21 -0
- data/spec/clusta/transforms/prune_edges_spec.rb +22 -0
- data/spec/data/assortativities/directed.tsv +4 -0
- data/spec/data/assortativities/undirected.tsv +7 -0
- data/spec/data/degree_pairs/directed.tsv +10 -0
- data/spec/data/degree_pairs/undirected.tsv +18 -0
- data/spec/data/external/vertices.tsv +9 -0
- data/spec/data/imports/vertices.labeled.tsv +9 -0
- data/spec/data/neighborhoods/directed.unweighted.tsv +7 -0
- data/spec/data/neighborhoods/directed.weighted.tsv +7 -0
- data/spec/data/neighborhoods/undirected.unweighted.tsv +9 -0
- data/spec/data/neighborhoods/undirected.weighted.tsv +9 -0
- data/spec/data/pruned_edges/directed.unweighted.tsv +1 -0
- data/spec/data/pruned_edges/directed.weighted.tsv +3 -0
- data/spec/data/pruned_edges/undirected.unweighted.tsv +1 -0
- data/spec/data/pruned_edges/undirected.weighted.tsv +3 -0
- data/spec/support/transforms_spec_helper.rb +5 -1
- metadata +47 -23
- data/lib/clusta/geometry/directed/vertex_arrows.rb +0 -25
- data/lib/clusta/geometry/vertex_arrows.rb +0 -45
- data/lib/clusta/transforms/vertex_arrows_to_edge_degree_pairs.rb +0 -63
- data/spec/clusta/geometry/element_spec.rb +0 -191
- data/spec/data/vertex_arrows/directed.unweighted.tsv +0 -7
- data/spec/data/vertex_arrows/directed.weighted.tsv +0 -7
- data/spec/data/vertex_arrows/undirected.unweighted.tsv +0 -9
- data/spec/data/vertex_arrows/undirected.weighted.tsv +0 -9
@@ -0,0 +1,41 @@
|
|
1
|
+
module Clusta
  module Geometry

    # A vertex together with the collection of neighbors attached to
    # it.  The neighbors are declared through the +extra_inputs+ class
    # macro under the name +neighbors+.
    class Neighborhood < Vertex

      extra_inputs :neighbors

      # Searches this neighborhood for a neighbor with the given label.
      #
      # Returns the first neighbor whose label equals +label+, or nil
      # when no neighbor matches (so the result is usable as a truth
      # value).
      def joins? label
        neighbors.find { |candidate| candidate.label == label }
      end

      # A plain neighborhood is never directed.
      def directed?
        false
      end

      # The number of neighbors in this neighborhood.
      def size
        neighbors.size
      end

      # Builds one DegreePair per neighbor, oriented from this vertex
      # to the neighbor.
      #
      # This vertex's degree is just the size of this neighborhood.
      # Nothing is known about a neighbor's degree other than that it
      # must be at least 1, because it appears in this vertex's
      # neighborhood -- hence the constant 1.
      def degree_pairs
        neighbors.map { |other| DegreePair.new(label, other.label, size, 1) }
      end

      # Same pairs as #degree_pairs but oriented from each neighbor
      # back to this vertex, so labels and degrees swap places.
      def reversed_degree_pairs
        neighbors.map { |other| DegreePair.new(other.label, label, 1, size) }
      end

    end

  end
end
|
data/lib/clusta/runner.rb
CHANGED
@@ -1,15 +1,112 @@
|
|
1
|
+
require 'configliere'
|
2
|
+
|
3
|
+
# Command-line settings read by Clusta::Runner.
Settings.use :commandline

Settings.define :transform,       :description => "The name of the transformation to run.",                                                :required => false
Settings.define :list_transforms, :description => "List known transformations.",                                                           :required => false, :type => :boolean, :default => false
# Runner#run! branches on Settings[:list_geometry]; declare it so the
# flag is typed and defaulted the same way as :list_transforms.
Settings.define :list_geometry,   :description => "List known geometry elements.",                                                         :required => false, :type => :boolean, :default => false
Settings.define :class_names,     :description => "The output format for class names, one of: 'long', 'medium', or 'short'.",              :required => true,  :default => 'medium'
Settings.define :serialize,       :description => "The serialization format for data, one of: 'json' or 'tsv'.",                           :required => true,  :default => 'tsv'
Settings.define :transforms_path, :description => "A colon-separated list of directories to require transform definitions.",               :required => false, :default => ''
Settings.define :geometry_path,   :description => "A colon-separated list of directories to require geometry definitions.",                :required => false, :default => ''
|
11
|
+
|
1
12
|
module Clusta

  # Command-line driver: resolves Settings and dispatches to listing,
  # running a named transform, running a map/reduce command, or the
  # fallback help path.
  class Runner

    # Matches a command-line argument that already selects a run mode.
    RUN_ARG_REGEXP = /--run=./

    # name - stored as given (not otherwise used in this class).
    # argv - the argument list later copied into the global ARGV
    #        before a script is run.
    def initialize name, argv
      @name = name
      @argv = argv
    end

    # Resolves Settings and performs the first action whose setting is
    # truthy.  A Clusta::Error is reported on stderr and terminates the
    # process with status 1.
    def run!
      Settings.resolve!
      case
      when Settings[:list_transforms]
        load_transforms!
        list_transforms!
      when Settings[:list_geometry]
        load_geometry!
        list_geometry!
      when Settings[:transform]
        load_transforms!
        load_geometry!
        run_transform!
      when Settings[:map_command] || Settings[:reduce_command]
        run_map_reduce!
      else
        print_help!
      end
    rescue Clusta::Error => e
      $stderr.puts "ERROR: #{e.message}"
      exit(1)
    end

    # Loads every *.rb file found in the directories of
    # Settings[:transforms_path] through Clusta::Transforms.load_from.
    def load_transforms!
      rb_files_within(:transforms_path) do |path|
        Clusta::Transforms.load_from(path)
      end
    end

    # Loads every *.rb file found in the directories of
    # Settings[:geometry_path] through Clusta::Geometry.load_from.
    def load_geometry!
      rb_files_within(:geometry_path) do |path|
        Clusta::Geometry.load_from(path)
      end
    end

    # Yields the path of each *.rb file directly inside each directory
    # named in the colon-separated setting +key+.
    #
    # Returns nil without yielding when the setting is nil or empty.
    # Entries that are not directories produce a warning on stderr and
    # are skipped.
    def rb_files_within key, &block
      return if Settings[key].nil? || Settings[key].empty?
      Settings[key].split(':').each do |dir|
        expanded = File.expand_path(dir)
        unless File.directory?(expanded)
          $stderr.puts("WARNING: #{expanded} is not a directory")
          next
        end
        Dir[File.join(expanded, '*.rb')].each do |path|
          yield path
        end
      end
    end

    # Prints the listing provided by Clusta::Transforms.
    def list_transforms!
      puts Clusta::Transforms.listing
    end

    # Prints the listing provided by Clusta::Geometry.
    def list_geometry!
      puts Clusta::Geometry.listing
    end

    # Looks up the transform named by Settings[:transform], installs
    # the runner's arguments into ARGV and runs the transform's script.
    def run_transform!
      transform = Clusta::Transforms.from_name(Settings[:transform])
      install_argv!
      Clusta::Transforms.script_for(transform).run
    end

    # Installs the runner's arguments into ARGV and runs a
    # Wukong::Script constructed with (nil, nil).
    def run_map_reduce!
      install_argv!
      bare_wukong_script.run
    end

    # Runs a Wukong::Script constructed with (nil, nil) without
    # touching ARGV.
    # NOTE(review): presumably Wukong prints its usage in this case --
    # confirm against Wukong's behaviour.
    def print_help!
      bare_wukong_script.run
    end

    private

    # Replaces the global ARGV with the arguments given to this runner
    # and appends '--run=local' unless a --run= argument is present.
    def install_argv!
      ::ARGV.replace(@argv)
      ::ARGV.push('--run=local') unless ::ARGV.any? { |arg| arg =~ self.class::RUN_ARG_REGEXP }
    end

    # Builds Wukong::Script.new(nil, nil), translating a RuntimeError
    # raised during construction into a Clusta Error so that #run!
    # reports it uniformly.
    def bare_wukong_script
      Wukong::Script.new(nil, nil)
    rescue RuntimeError => e
      raise Error.new(e.message)
    end

  end
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
module Clusta

  # Mixin giving a class an ordered, declarative list of fields.
  #
  # Including classes gain the +key+ and +field+ class macros; each
  # declared field gets a reader and a (possibly type-coercing) writer,
  # and is recorded as a Hash of its options plus :name in the class's
  # +fields+ list.  Subclasses start from a copy of their parent's list.
  module Schema

    attr_writer :extra_inputs

    # Extra values attached to this instance; defaults to a new Array.
    def extra_inputs
      @extra_inputs ||= []
    end

    # The extra inputs rendered as strings.
    def extra_outputs
      extra_inputs.map { |input| input.to_s }
    end

    # All field definitions declared on this instance's class.
    def fields
      self.class.fields
    end

    # The field definitions flagged as keys on this instance's class.
    def keys
      self.class.keys
    end

    # The field definitions not flagged as keys on this instance's class.
    def non_key_fields
      self.class.non_key_fields
    end

    # Hook run on +include+: installs the class macros, a class-level
    # +fields+ reader, and an empty field list.
    def self.included klass
      klass.instance_variable_set('@fields', [])
      class << klass
        attr_reader :fields
      end
      klass.extend(ClassMethods)
    end

    module ClassMethods

      # Class macro: exposes the instance's extra inputs under +name+.
      def extra_inputs name
        alias_method name, :extra_inputs
      end

      # Gives each subclass its own copy of the field list so fields
      # added to the subclass do not leak into the parent.
      def inherited(subclass)
        copied = @fields.dup
        subclass.instance_variable_set("@fields", copied)
        super
      end

      # Names of all declared fields, as strings, in declaration order.
      def field_names
        @fields.map { |definition| definition[:name].to_s }
      end

      # True when some declared field is optional.
      def has_optional_field?
        @fields.any? { |definition| definition[:optional] }
      end

      # True when some declared field is not a key.
      def has_non_key_field?
        @fields.any? { |definition| !definition[:key] }
      end

      # The first optional field definition, or nil.
      def optional_field
        @fields.find { |definition| definition[:optional] }
      end

      # Field definitions flagged as keys.
      def keys
        @fields.select { |definition| definition[:key] }
      end

      # Field definitions not flagged as keys.
      def non_key_fields
        @fields.reject { |definition| definition[:key] }
      end

      # Declares a field called +name+.
      #
      # options - Hash; the entries read here are :key, :optional and
      #           :type (:int and :float make the writer coerce with
      #           to_i / to_f; any other type stores the value as is).
      #
      # Raises AmbiguousArgumentsError when an optional field has
      # already been declared, and SortError when the first field is
      # not a key, when a key follows a non-key field, or when a key
      # has type :geometry.
      #
      # Returns the updated field list.
      def field name, options={}
        if has_optional_field?
          raise AmbiguousArgumentsError.new("Cannot define a second optional field #{name} because field #{optional_field[:name]} is already optional.")
        end

        is_key = options[:key]
        if @fields.empty? && !is_key
          raise SortError.new("The first field defined must be a key that can be sorted on.")
        end
        if is_key && has_non_key_field?
          raise SortError.new("Cannot define a key field #{name} because some non-key fields have already been defined.")
        end
        if is_key && options[:type] == :geometry
          raise SortError.new("Key fields (#{name}) cannot have type :geometry")
        end

        attr_reader name

        ivar   = "@#{name}"
        coerce = case options[:type]
                 when :int   then lambda { |val| val.to_i }
                 when :float then lambda { |val| val.to_f }
                 else             lambda { |val| val }
                 end
        define_method("#{name}=") do |val|
          instance_variable_set(ivar, coerce.call(val))
        end

        @fields << options.merge(:name => name)
      end

      # Declares a key field: same as +field+ with :key forced to true.
      def key name, options={}
        field name, options.merge(:key => true)
      end

    end

  end
end
|
100
|
+
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module Clusta

  # Defines methods that allow a class to (de)serialize itself in a
  # way compatible with Wukong.
  module Serialization

    autoload :TSV,  'clusta/serialization/tsv'
    autoload :JSON, 'clusta/serialization/json'

    # Hook run on +include+: adds the class-level naming helpers.
    def self.included klass
      klass.extend(ClassMethods)
    end

    # The stream name of this instance's class.
    def stream_name
      self.class.stream_name
    end

    # Hands all constructor arguments to #process_args.
    def initialize *args
      process_args(*args)
    end

    # Default argument handling: does nothing.  The format-specific
    # modules in this file's namespace define their own version.
    def process_args *args
    end

    module ClassMethods

      # Registers this class with Geometry under +string+ and fixes
      # +string+ as its stream name.
      def set_stream_name string
        Geometry.register_element self, string
        @stream_name = string
      end

      # Registers this class with Geometry under +string+ and records
      # +string+ as its abbreviation.
      def abbreviate string
        Geometry.register_element self, string
        @abbreviation = string
      end

      # The abbreviation recorded by #abbreviate, or nil.
      def abbreviation
        @abbreviation
      end

      # Every name this class can go by: its stream name, its
      # abbreviation (when set), its full name and, when the class
      # has a name, the last segment of that full name.
      def all_stream_names
        names = [stream_name]
        names << abbreviation if abbreviation
        names << to_s
        names << to_s.split('::').last if respond_to?(:name) && name
        names
      end

      # The name this class is written under in a stream.
      #
      # An explicitly set or previously computed name wins.  Otherwise
      # the name is chosen from Settings[:class_names] (when Settings
      # is defined): 'short' uses the abbreviation if there is one,
      # 'long' uses the full class name, and anything else uses the
      # last segment of the class name.  The result is memoized.
      def stream_name
        return @stream_name if @stream_name

        style = Settings[:class_names].to_s if defined?(Settings)
        @stream_name =
          if style == 'short' && abbreviation
            abbreviation
          elsif style == 'long'
            to_s
          else
            to_s.split("::").last
          end
      end
    end

  end
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
require 'json'
|
2
|
+
|
3
|
+
module Clusta
  module Serialization

    # JSON flavour of record (de)serialization.
    #
    # A flat record is: the stream name, one positional value per key
    # field, a single JSON object holding the non-key fields (omitted
    # when it would be empty), then any extra outputs.
    #
    # The including class is expected to provide +fields+, +keys+,
    # +non_key_fields+, +stream_name+, +extra_outputs+ and
    # +extra_inputs=+, plus a writer for every field.
    module JSON

      # Hook run on +include+: adds the class-level helpers.
      def self.included klass
        klass.extend(ClassMethods)
      end

      # All fields as a Hash keyed by field name (as declared, i.e.
      # not stringified).  Values responding to +to_hash+ are converted
      # with it.
      def to_hash
        {}.tap do |json|
          fields.each do |field|
            value = send(field[:name])
            value = value.to_hash if value.respond_to?(:to_hash)
            json[field[:name]] = value
          end
        end
      end

      # The flat record described in the module documentation.
      def to_flat
        [stream_name].tap do |record|
          keys.each { |key| record << send(key[:name]) }
          data = non_key_field_data
          record << data.to_json unless data.empty?
          record.concat(extra_outputs)
        end
      end

      # Non-key fields as a Hash keyed by field name.  An optional
      # field whose value is nil is left out.
      def non_key_field_data
        {}.tap do |data|
          non_key_fields.each do |field|
            value = send(field[:name])
            value = value.to_hash if value.respond_to?(:to_hash)
            data[field[:name]] = value unless value.nil? && field[:optional]
          end
        end
      end

      # Populates this object from positional arguments.
      #
      # args - one value per key field, then the non-key data (a Hash,
      #        or a String of JSON; nil/absent means no data), then any
      #        extra inputs.
      #
      # A Hash may use String or Symbol keys, so the result of
      # #non_key_field_data can be fed straight back in; when both
      # forms of a name are present the String key wins.
      #
      # NOTE(review): the argument after the keys is always treated as
      # the data slot, while #to_flat omits that slot when there is no
      # non-key data -- confirm callers never rely on a round trip for
      # key-only records that carry extra inputs.
      #
      # Raises ArgumentError when a non-optional, non-key field is
      # missing from the data.
      def process_args *args
        key_fields = self.class.keys
        key_fields.each_with_index do |key, index|
          send("#{key[:name]}=", args[index])
        end
        json_index = key_fields.size

        raw  = args[json_index]
        data =
          if !raw
            {}
          elsif raw.is_a?(Hash)
            raw.inject({}) do |normalized, (name, value)|
              if name.is_a?(Symbol)
                # Field lookups below use String names; never let a
                # Symbol key shadow an explicit String key.
                normalized[name.to_s] = value unless raw.has_key?(name.to_s)
              else
                normalized[name] = value
              end
              normalized
            end
          else
            ::JSON.parse(raw)
          end

        non_key_fields.each do |field|
          name = field[:name].to_s
          if data.has_key?(name)
            send("#{name}=", data[name])
          elsif !field[:optional]
            raise ArgumentError.new("A #{self.class} requires a non-nil value for #{name}.")
          end
        end

        self.extra_inputs = (args[(json_index + 1)..-1] || [])
      end

      module ClassMethods

        # Placeholder with an empty body; returns nil for any +data+.
        def from_json_component data
        end

      end

    end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
module Clusta
  module Serialization

    # TSV flavour of record (de)serialization.
    #
    # A flat record is: the stream name, one positional value per
    # declared field, then any extra outputs.  A record nested inside
    # another record is written as its flat form joined with ';'.
    #
    # The including class is expected to provide +fields+ (instance
    # and class level), +stream_name+ and a writer for every field.
    module TSV

      # Hook run on +include+: adds the class-level helpers.
      def self.included klass
        klass.extend(ClassMethods)
      end

      # The flat record described in the module documentation.  An
      # optional field whose value is nil is left out; a :geometry
      # field is written in its ';'-joined component form.
      def to_flat
        [stream_name].tap do |record|
          fields.each do |field|
            value = send(field[:name])
            # Skip before converting so a nil optional geometry field
            # does not blow up on nil.to_tsv_component.
            next if value.nil? && field[:optional]
            value = value.to_tsv_component if field[:type] == :geometry
            record << value.to_s
          end
          record.concat(extra_outputs)
        end
      end

      # English ordinal suffix for +index+ ('st', 'nd', 'rd' or 'th').
      # Numbers ending in 11, 12 or 13 take 'th'.
      def suffix index
        case index.to_s
        when /1[123]$/ then 'th'
        when /1$/      then 'st'
        when /2$/      then 'nd'
        when /3$/      then 'rd'
        else                'th'
        end
      end

      # Populates this object from positional arguments: one per
      # declared field, in order, followed by any extra inputs.
      #
      # An optional field is only assigned when its argument is
      # truthy.  A :geometry field is built from its component string.
      #
      # Raises ArgumentError, naming the 1-based argument position,
      # when a non-optional field's argument is nil.
      def process_args *args
        declared = self.class.fields
        declared.each_with_index do |field, index|
          setter = "#{field[:name]}="
          value  = args[index]
          if field[:optional]
            send(setter, value) if value
          elsif value.nil?
            position = index + 1
            raise ArgumentError.new("A #{self.class} requires a non-nil value for #{field[:name]} as its #{position}#{suffix(position)} argument.")
          elsif field[:type] == :geometry
            send(setter, self.class.from_tsv_component_string(value))
          else
            send(setter, value)
          end
        end
        self.extra_inputs = (args[declared.size..-1] || [])
      end

      # Stores the extra inputs.  A String that starts a line with a
      # capital letter and contains a ';' is treated as a component
      # string and instantiated; anything else -- including
      # non-String objects -- is stored untouched.
      def extra_inputs= inputs
        @extra_inputs = inputs.map do |input|
          # Only Strings are pattern-tested: Object#=~ no longer
          # exists on recent Rubies.
          if input.is_a?(String) && input =~ /^[A-Z].*;/
            self.class.from_tsv_component_string(input)
          else
            input
          end
        end
      end

      # The extra inputs in output form: objects that respond to
      # +to_tsv_component+ are written as component strings, the rest
      # are passed through.  Empty when no extra inputs were ever set.
      def extra_outputs
        (@extra_inputs || []).map do |input|
          input.respond_to?(:to_tsv_component) ? input.to_tsv_component : input
        end
      end

      # This record in nested form: its flat record joined with ';'.
      def to_tsv_component
        to_flat.join(";")
      end

      module ClassMethods

        # Instantiates an element from a 'klass;field1;field2;...'
        # component string.  The class is resolved from the first
        # segment with Wukong.class_from_resource and receives the
        # remaining segments as constructor arguments.
        #
        # Anything that is not a String is returned unchanged.
        #
        # Raises ArgumentError when the string has no first segment.
        def from_tsv_component_string string
          return string unless string.is_a?(String)
          klass_name, *args = string.split(';')
          raise ArgumentError.new("Elements instantiated from a TSV component string must match the format 'klass;[field1;[field2;]...]'") unless klass_name
          Wukong.class_from_resource(klass_name).new(*args)
        end
      end
    end
  end
end
|