clusta 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. data/README.rdoc +66 -0
  2. data/VERSION +1 -1
  3. data/bin/clusta +1 -28
  4. data/lib/clusta.rb +12 -3
  5. data/lib/clusta/geometry.rb +53 -8
  6. data/lib/clusta/geometry/all.rb +3 -0
  7. data/lib/clusta/geometry/assortativity.rb +2 -2
  8. data/lib/clusta/geometry/degree.rb +3 -1
  9. data/lib/clusta/geometry/{edge_degree_pair.rb → degree_pair.rb} +3 -3
  10. data/lib/clusta/geometry/directed/degree.rb +3 -1
  11. data/lib/clusta/geometry/directed/{edge_degree_pair.rb → degree_pair.rb} +4 -3
  12. data/lib/clusta/geometry/directed/edge.rb +4 -2
  13. data/lib/clusta/geometry/directed/{arrow.rb → neighbor.rb} +1 -1
  14. data/lib/clusta/geometry/directed/neighborhood.rb +31 -0
  15. data/lib/clusta/geometry/edge.rb +6 -4
  16. data/lib/clusta/geometry/element.rb +10 -117
  17. data/lib/clusta/geometry/{arrow.rb → neighbor.rb} +3 -3
  18. data/lib/clusta/geometry/neighborhood.rb +41 -0
  19. data/lib/clusta/geometry/vertex.rb +4 -1
  20. data/lib/clusta/runner.rb +101 -4
  21. data/lib/clusta/schema.rb +100 -0
  22. data/lib/clusta/serialization.rb +63 -0
  23. data/lib/clusta/serialization/json.rb +86 -0
  24. data/lib/clusta/serialization/tsv.rb +81 -0
  25. data/lib/clusta/transforms.rb +59 -26
  26. data/lib/clusta/transforms/{edge_degree_pairs_to_assortativities.rb → degree_pairs_to_assortativities.rb} +7 -3
  27. data/lib/clusta/transforms/edges_to_degrees.rb +5 -0
  28. data/lib/clusta/transforms/{edges_to_vertex_arrows.rb → edges_to_neighborhoods.rb} +11 -6
  29. data/lib/clusta/transforms/import.rb +6 -0
  30. data/lib/clusta/transforms/neighborhoods_to_degree_pairs.rb +70 -0
  31. data/lib/clusta/transforms/pm3d.rb +46 -0
  32. data/lib/clusta/transforms/prune_edges.rb +34 -0
  33. data/spec/clusta/schema_spec.rb +36 -0
  34. data/spec/clusta/serialization/json_spec.rb +133 -0
  35. data/spec/clusta/serialization/tsv_spec.rb +133 -0
  36. data/spec/clusta/serialization_spec.rb +27 -0
  37. data/spec/clusta/transforms/degree_pairs_to_assortativities_spec.rb +13 -0
  38. data/spec/clusta/transforms/{edges_to_vertex_arrows_spec.rb → edges_to_neighborhoods_spec.rb} +5 -5
  39. data/spec/clusta/transforms/import_spec.rb +9 -0
  40. data/spec/clusta/transforms/neighborhoods_to_degree_pairs_spec.rb +21 -0
  41. data/spec/clusta/transforms/prune_edges_spec.rb +22 -0
  42. data/spec/data/assortativities/directed.tsv +4 -0
  43. data/spec/data/assortativities/undirected.tsv +7 -0
  44. data/spec/data/degree_pairs/directed.tsv +10 -0
  45. data/spec/data/degree_pairs/undirected.tsv +18 -0
  46. data/spec/data/external/vertices.tsv +9 -0
  47. data/spec/data/imports/vertices.labeled.tsv +9 -0
  48. data/spec/data/neighborhoods/directed.unweighted.tsv +7 -0
  49. data/spec/data/neighborhoods/directed.weighted.tsv +7 -0
  50. data/spec/data/neighborhoods/undirected.unweighted.tsv +9 -0
  51. data/spec/data/neighborhoods/undirected.weighted.tsv +9 -0
  52. data/spec/data/pruned_edges/directed.unweighted.tsv +1 -0
  53. data/spec/data/pruned_edges/directed.weighted.tsv +3 -0
  54. data/spec/data/pruned_edges/undirected.unweighted.tsv +1 -0
  55. data/spec/data/pruned_edges/undirected.weighted.tsv +3 -0
  56. data/spec/support/transforms_spec_helper.rb +5 -1
  57. metadata +47 -23
  58. data/lib/clusta/geometry/directed/vertex_arrows.rb +0 -25
  59. data/lib/clusta/geometry/vertex_arrows.rb +0 -45
  60. data/lib/clusta/transforms/vertex_arrows_to_edge_degree_pairs.rb +0 -63
  61. data/spec/clusta/geometry/element_spec.rb +0 -191
  62. data/spec/data/vertex_arrows/directed.unweighted.tsv +0 -7
  63. data/spec/data/vertex_arrows/directed.weighted.tsv +0 -7
  64. data/spec/data/vertex_arrows/undirected.unweighted.tsv +0 -9
  65. data/spec/data/vertex_arrows/undirected.weighted.tsv +0 -9
@@ -1,8 +1,9 @@
1
1
  module Clusta
2
2
  module Geometry
3
3
 
4
- class Arrow < Element
5
- field :target_label
4
+ class Neighbor < Element
5
+
6
+ key :label
6
7
  field :weight, :optional => true
7
8
 
8
9
  def directed?
@@ -13,4 +14,3 @@ module Clusta
13
14
 
14
15
  end
15
16
  end
16
-
@@ -0,0 +1,41 @@
1
+ module Clusta
2
+ module Geometry
3
+
4
+ class Neighborhood < Vertex
5
+
6
+ extra_inputs :neighbors
7
+
8
+ def joins? label
9
+ neighbors.detect { |neighbor| neighbor.label == label }
10
+ end
11
+
12
+ def directed?
13
+ false
14
+ end
15
+
16
+ def size
17
+ neighbors.size
18
+ end
19
+
20
+ def degree_pairs
21
+ neighbors.map do |neighbor|
22
+ # This vertex's degree is just the size of this
23
+ # neighborhood.
24
+ #
25
+ # We don't know anything about each neighbor's degree other
26
+ # than it must be at least 1 b/c it's in this vertex's
27
+ # neighborhood.
28
+ DegreePair.new(label, neighbor.label, size, 1)
29
+ end
30
+ end
31
+
32
+ def reversed_degree_pairs
33
+ neighbors.map do |neighbor|
34
+ DegreePair.new(neighbor.label, label, 1, size)
35
+ end
36
+ end
37
+
38
+ end
39
+
40
+ end
41
+ end
@@ -2,7 +2,10 @@ module Clusta
2
2
  module Geometry
3
3
 
4
4
  class Vertex < Element
5
- field :label
5
+
6
+ abbreviate 'V'
7
+
8
+ key :label
6
9
  end
7
10
 
8
11
  end
@@ -1,15 +1,112 @@
1
+ require 'configliere'
2
+
3
+ Settings.use :commandline
4
+
5
+ Settings.define :transform, :description => "The name of the tranformation to run.", :required => false
6
+ Settings.define :list_transforms, :description => "List known transformations.", :required => false, :type => :boolean, :default => false
7
+ Settings.define :class_names, :description => "The output format for class names, one of: 'long', 'medium', or 'short'.", :required => true, :default => 'medium'
8
+ Settings.define :serialize, :description => "The serialization format for data, one of: 'json' or 'tsv'.", :required => true, :default => 'tsv'
9
+ Settings.define :transforms_path, :description => "A colon-separated list of directories to require transform definitions.", :required => false, :default => ''
10
+ Settings.define :geometry_path, :description => "A colon-separated list of directories to require geometry definitions.", :required => false, :default => ''
11
+
1
12
  module Clusta
2
13
 
3
14
  class Runner
4
15
 
5
- def initialize args
6
- self.args = args
16
+ RUN_ARG_REGEXP = /--run=./
17
+
18
+
19
+ def initialize name, argv
20
+ @name = name
21
+ @argv = argv
7
22
  end
8
23
 
9
24
  def run!
10
- Settings.resolve!
25
+ begin
26
+ Settings.resolve!
27
+ case
28
+ when Settings[:list_transforms]
29
+ load_transforms!
30
+ list_transforms!
31
+ when Settings[:list_geometry]
32
+ load_geometry!
33
+ list_geometry!
34
+ when Settings[:transform]
35
+ load_transforms!
36
+ load_geometry!
37
+ run_transform!
38
+ when Settings[:map_command] || Settings[:reduce_command]
39
+ run_map_reduce!
40
+ else
41
+ print_help!
42
+ end
43
+ rescue Clusta::Error => e
44
+ $stderr.puts "ERROR: #{e.message}"
45
+ exit(1)
46
+ end
47
+ end
48
+
49
+ def load_transforms!
50
+ rb_files_within(:transforms_path) do |path|
51
+ Clusta::Transforms.load_from(path)
52
+ end
53
+ end
54
+
55
+ def load_geometry!
56
+ rb_files_within(:geometry_path) do |path|
57
+ Clusta::Geometry.load_from(path)
58
+ end
59
+ end
60
+
61
+ def rb_files_within key, &block
62
+ return if Settings[key].nil? || Settings[key].empty?
63
+ Settings[key].split(':').each do |dir|
64
+ expanded = File.expand_path(dir)
65
+ unless File.directory?(expanded)
66
+ $stderr.puts("WARNING: #{expanded} is not a directory")
67
+ next
68
+ end
69
+ Dir[File.join(expanded, '*.rb')].each do |path|
70
+ yield path
71
+ end
72
+ end
73
+ end
74
+
75
+ def list_transforms!
76
+ puts Clusta::Transforms.listing
77
+ end
78
+
79
+ def list_geometry!
80
+ puts Clusta::Geometry.listing
81
+ end
82
+
83
+ def run_transform!
84
+ transform = Clusta::Transforms.from_name(Settings[:transform])
85
+ ::ARGV.replace(@argv)
86
+ ::ARGV.push('--run=local') unless ARGV.any? { |arg| arg =~ self.class::RUN_ARG_REGEXP }
87
+ script = Clusta::Transforms.script_for(transform)
88
+ script.run
89
+ end
90
+
91
+ def run_map_reduce!
92
+ ::ARGV.replace(@argv)
93
+ ::ARGV.push('--run=local') unless ARGV.any? { |arg| arg =~ self.class::RUN_ARG_REGEXP }
94
+ begin
95
+ s = Wukong::Script.new(nil, nil)
96
+ rescue RuntimeError => e
97
+ raise Error.new(e.message)
98
+ end
99
+ s.run
100
+ end
101
+
102
+ def print_help!
103
+ begin
104
+ s = Wukong::Script.new(nil, nil)
105
+ rescue RuntimeError => e
106
+ raise Error.new(e.message)
107
+ end
108
+ s.run
11
109
  end
12
110
 
13
111
  end
14
-
15
112
  end
@@ -0,0 +1,100 @@
1
+ module Clusta
2
+ module Schema
3
+
4
+ def extra_inputs
5
+ @extra_inputs ||= []
6
+ end
7
+ attr_writer :extra_inputs
8
+
9
+ def extra_outputs
10
+ extra_inputs.map(&:to_s)
11
+ end
12
+
13
+ def fields
14
+ self.class.fields
15
+ end
16
+
17
+ def keys
18
+ self.class.keys
19
+ end
20
+
21
+ def non_key_fields
22
+ self.class.non_key_fields
23
+ end
24
+
25
+ def self.included klass
26
+ klass.extend(ClassMethods)
27
+ class << klass ; attr_reader :fields ; end
28
+ klass.instance_variable_set('@fields', [])
29
+ end
30
+
31
+ module ClassMethods
32
+
33
+ def extra_inputs name
34
+ alias_method name, :extra_inputs
35
+ end
36
+
37
+ def inherited(subclass)
38
+ subclass.instance_variable_set("@fields", @fields.dup)
39
+ super
40
+ end
41
+
42
+ def field_names
43
+ @fields.map { |field| field[:name].to_s }
44
+ end
45
+
46
+ def has_optional_field?
47
+ @fields.any? { |field| field[:optional] }
48
+ end
49
+
50
+ def has_non_key_field?
51
+ @fields.any? { |field| ! field[:key] }
52
+ end
53
+
54
+ def optional_field
55
+ @fields.detect { |field| field[:optional] }
56
+ end
57
+
58
+ def keys
59
+ @fields.find_all { |field| field[:key] }
60
+ end
61
+
62
+ def non_key_fields
63
+ @fields.find_all { |field| ! field[:key] }
64
+ end
65
+
66
+ def field name, options={}
67
+ raise AmbiguousArgumentsError.new("Cannot define a second optional field #{name} because field #{optional_field[:name]} is already optional.") if has_optional_field?
68
+
69
+ raise SortError.new("The first field defined must be a key that can be sorted on.") if @fields.empty? && (!options[:key])
70
+ raise SortError.new("Cannot define a key field #{name} because some non-key fields have already been defined.") if options[:key] && has_non_key_field?
71
+ raise SortError.new("Key fields (#{name}) cannot have type :geometry") if options[:key] && options[:type] == :geometry
72
+
73
+ attr_reader name
74
+
75
+ case options[:type]
76
+ when :int
77
+ define_method "#{name}=" do |val|
78
+ instance_variable_set("@#{name}", val.to_i)
79
+ end
80
+ when :float
81
+ define_method "#{name}=" do |val|
82
+ instance_variable_set("@#{name}", val.to_f)
83
+ end
84
+ else
85
+ define_method "#{name}=" do |val|
86
+ instance_variable_set("@#{name}", val)
87
+ end
88
+ end
89
+ @fields << options.merge(:name => name)
90
+ end
91
+
92
+ def key name, options={}
93
+ field name, options.merge(:key => true)
94
+ end
95
+
96
+ end
97
+
98
+ end
99
+ end
100
+
@@ -0,0 +1,63 @@
1
+ module Clusta
2
+
3
+ # Defines methods that allow a class to (de)serialize itself in a
4
+ # way compatible with Wukong.
5
+ module Serialization
6
+
7
+ autoload :TSV, 'clusta/serialization/tsv'
8
+ autoload :JSON, 'clusta/serialization/json'
9
+
10
+ def self.included klass
11
+ klass.extend(ClassMethods)
12
+ end
13
+
14
+ def stream_name
15
+ self.class.stream_name
16
+ end
17
+
18
+ def initialize *args
19
+ process_args(*args)
20
+ end
21
+
22
+ def process_args *args
23
+ end
24
+
25
+ module ClassMethods
26
+
27
+ def set_stream_name string
28
+ Geometry.register_element self, string
29
+ @stream_name = string
30
+ end
31
+
32
+ def abbreviate string
33
+ Geometry.register_element self, string
34
+ @abbreviation = string
35
+ end
36
+
37
+ def abbreviation
38
+ @abbreviation
39
+ end
40
+
41
+ def all_stream_names
42
+ [stream_name].tap do |names|
43
+ names << abbreviation if abbreviation
44
+ names << to_s
45
+ names << to_s.split('::').last if respond_to?(:name) && name
46
+ end
47
+ end
48
+
49
+ def stream_name
50
+ return @stream_name if @stream_name
51
+ case
52
+ when defined?(Settings) && Settings[:class_names].to_s == 'short' && abbreviation
53
+ @stream_name = abbreviation
54
+ when defined?(Settings) && Settings[:class_names].to_s == 'long'
55
+ @stream_name = to_s
56
+ else
57
+ @stream_name = to_s.split("::").last
58
+ end
59
+ end
60
+ end
61
+
62
+ end
63
+ end
@@ -0,0 +1,86 @@
1
+ require 'json'
2
+
3
+ module Clusta
4
+ module Serialization
5
+ module JSON
6
+
7
+ def to_hash
8
+ {}.tap do |json|
9
+ fields.each do |field|
10
+ value = send(field[:name])
11
+ value = value.to_hash if value.respond_to?(:to_hash)
12
+ json[field[:name]] = value
13
+ end
14
+ end
15
+ end
16
+
17
+ def to_flat
18
+ [stream_name].tap do |record|
19
+ keys.each do |key|
20
+ record << self.send(key[:name])
21
+ end
22
+ data = non_key_field_data
23
+ record << data.to_json unless data.empty?
24
+ record.concat(extra_outputs)
25
+ end
26
+ end
27
+
28
+ def non_key_field_data
29
+ {}.tap do |data|
30
+ non_key_fields.each do |field|
31
+ value = send(field[:name])
32
+ value = value.to_hash if value.respond_to?(:to_hash)
33
+ data[field[:name]] = value unless value.nil? && field[:optional]
34
+ end
35
+ end
36
+ end
37
+
38
+ def process_args *args
39
+ json_index = 0
40
+ self.class.keys.each_with_index do |key, index|
41
+ self.send("#{key[:name]}=", args[index])
42
+ json_index = index + 1
43
+ end
44
+
45
+ if args[json_index]
46
+ if args[json_index].is_a?(Hash)
47
+ data = args[json_index]
48
+ else
49
+ data = ::JSON.parse(args[json_index])
50
+ end
51
+ else
52
+ data = {}
53
+ end
54
+
55
+ non_key_fields.each do |field|
56
+ name = field[:name].to_s
57
+ case
58
+ when field[:optional]
59
+ self.send("#{name}=", data[name]) if data.has_key?(name)
60
+ when (!data.has_key?(name))
61
+ raise ArgumentError.new("A #{self.class} requires a non-nil value for #{name}.")
62
+ when field[:type] == :geometry
63
+ self.send("#{name}=", data[name])
64
+ else
65
+ self.send("#{name}=", data[name])
66
+ end
67
+ end
68
+
69
+ self.extra_inputs = (args[(json_index + 1)..-1] || [])
70
+ end
71
+
72
+ def self.included klass
73
+ klass.extend(ClassMethods)
74
+ end
75
+
76
+ module ClassMethods
77
+
78
+ def from_json_component data
79
+
80
+ end
81
+
82
+ end
83
+
84
+ end
85
+ end
86
+ end
@@ -0,0 +1,81 @@
1
+ module Clusta
2
+ module Serialization
3
+ module TSV
4
+
5
+ def to_flat
6
+ [stream_name].tap do |record|
7
+ fields.each do |field|
8
+ value = send(field[:name])
9
+ value = value.to_tsv_component if field[:type] == :geometry
10
+ record << value.to_s unless value.nil? && field[:optional]
11
+ end
12
+ record.concat(extra_outputs)
13
+ end
14
+ end
15
+
16
+ def suffix index
17
+ case index.to_s
18
+ when /1$/ then 'st'
19
+ when /2$/ then 'nd'
20
+ when /3$/ then 'rd'
21
+ else 'th'
22
+ end
23
+ end
24
+
25
+ def process_args *args
26
+ self.class.fields.each_with_index do |field, index|
27
+ case
28
+ when field[:optional]
29
+ self.send("#{field[:name]}=", args[index]) if args[index]
30
+ when args[index].nil?
31
+ raise ArgumentError.new("A #{self.class} requires a non-nil value for #{field[:name]} as its #{index}#{suffix(index)} argument.")
32
+ when field[:type] == :geometry
33
+ self.send("#{field[:name]}=", self.class.from_tsv_component_string(args[index]))
34
+ else
35
+ self.send("#{field[:name]}=", args[index])
36
+ end
37
+ end
38
+ self.extra_inputs = (args[self.class.fields.size..-1] || [])
39
+ end
40
+
41
+ def extra_inputs= inputs
42
+ @extra_inputs = inputs.map do |input|
43
+ if input =~ /^[A-Z].*;/
44
+ self.class.from_tsv_component_string(input)
45
+ else
46
+ input
47
+ end
48
+ end
49
+ end
50
+
51
+ def extra_outputs
52
+ @extra_inputs.map do |input|
53
+ if input.respond_to?(:to_tsv_component)
54
+ input.to_tsv_component
55
+ else
56
+ input
57
+ end
58
+ end
59
+ end
60
+
61
+ def self.included klass
62
+ klass.extend(ClassMethods)
63
+ end
64
+
65
+ def to_tsv_component
66
+ to_flat.join(";")
67
+ end
68
+
69
+ module ClassMethods
70
+
71
+ def from_tsv_component_string string
72
+ return string unless string.is_a?(String)
73
+ args = string.split(';')
74
+ klass_name = args.shift
75
+ raise ArgumentError.new("Elements instantiated from a TSV component string must match the format 'klass;[field1;[field2;]...]'") unless klass_name
76
+ Wukong.class_from_resource(klass_name).new(*args)
77
+ end
78
+ end
79
+ end
80
+ end
81
+ end