clusta 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. data/README.rdoc +66 -0
  2. data/VERSION +1 -1
  3. data/bin/clusta +1 -28
  4. data/lib/clusta.rb +12 -3
  5. data/lib/clusta/geometry.rb +53 -8
  6. data/lib/clusta/geometry/all.rb +3 -0
  7. data/lib/clusta/geometry/assortativity.rb +2 -2
  8. data/lib/clusta/geometry/degree.rb +3 -1
  9. data/lib/clusta/geometry/{edge_degree_pair.rb → degree_pair.rb} +3 -3
  10. data/lib/clusta/geometry/directed/degree.rb +3 -1
  11. data/lib/clusta/geometry/directed/{edge_degree_pair.rb → degree_pair.rb} +4 -3
  12. data/lib/clusta/geometry/directed/edge.rb +4 -2
  13. data/lib/clusta/geometry/directed/{arrow.rb → neighbor.rb} +1 -1
  14. data/lib/clusta/geometry/directed/neighborhood.rb +31 -0
  15. data/lib/clusta/geometry/edge.rb +6 -4
  16. data/lib/clusta/geometry/element.rb +10 -117
  17. data/lib/clusta/geometry/{arrow.rb → neighbor.rb} +3 -3
  18. data/lib/clusta/geometry/neighborhood.rb +41 -0
  19. data/lib/clusta/geometry/vertex.rb +4 -1
  20. data/lib/clusta/runner.rb +101 -4
  21. data/lib/clusta/schema.rb +100 -0
  22. data/lib/clusta/serialization.rb +63 -0
  23. data/lib/clusta/serialization/json.rb +86 -0
  24. data/lib/clusta/serialization/tsv.rb +81 -0
  25. data/lib/clusta/transforms.rb +59 -26
  26. data/lib/clusta/transforms/{edge_degree_pairs_to_assortativities.rb → degree_pairs_to_assortativities.rb} +7 -3
  27. data/lib/clusta/transforms/edges_to_degrees.rb +5 -0
  28. data/lib/clusta/transforms/{edges_to_vertex_arrows.rb → edges_to_neighborhoods.rb} +11 -6
  29. data/lib/clusta/transforms/import.rb +6 -0
  30. data/lib/clusta/transforms/neighborhoods_to_degree_pairs.rb +70 -0
  31. data/lib/clusta/transforms/pm3d.rb +46 -0
  32. data/lib/clusta/transforms/prune_edges.rb +34 -0
  33. data/spec/clusta/schema_spec.rb +36 -0
  34. data/spec/clusta/serialization/json_spec.rb +133 -0
  35. data/spec/clusta/serialization/tsv_spec.rb +133 -0
  36. data/spec/clusta/serialization_spec.rb +27 -0
  37. data/spec/clusta/transforms/degree_pairs_to_assortativities_spec.rb +13 -0
  38. data/spec/clusta/transforms/{edges_to_vertex_arrows_spec.rb → edges_to_neighborhoods_spec.rb} +5 -5
  39. data/spec/clusta/transforms/import_spec.rb +9 -0
  40. data/spec/clusta/transforms/neighborhoods_to_degree_pairs_spec.rb +21 -0
  41. data/spec/clusta/transforms/prune_edges_spec.rb +22 -0
  42. data/spec/data/assortativities/directed.tsv +4 -0
  43. data/spec/data/assortativities/undirected.tsv +7 -0
  44. data/spec/data/degree_pairs/directed.tsv +10 -0
  45. data/spec/data/degree_pairs/undirected.tsv +18 -0
  46. data/spec/data/external/vertices.tsv +9 -0
  47. data/spec/data/imports/vertices.labeled.tsv +9 -0
  48. data/spec/data/neighborhoods/directed.unweighted.tsv +7 -0
  49. data/spec/data/neighborhoods/directed.weighted.tsv +7 -0
  50. data/spec/data/neighborhoods/undirected.unweighted.tsv +9 -0
  51. data/spec/data/neighborhoods/undirected.weighted.tsv +9 -0
  52. data/spec/data/pruned_edges/directed.unweighted.tsv +1 -0
  53. data/spec/data/pruned_edges/directed.weighted.tsv +3 -0
  54. data/spec/data/pruned_edges/undirected.unweighted.tsv +1 -0
  55. data/spec/data/pruned_edges/undirected.weighted.tsv +3 -0
  56. data/spec/support/transforms_spec_helper.rb +5 -1
  57. metadata +47 -23
  58. data/lib/clusta/geometry/directed/vertex_arrows.rb +0 -25
  59. data/lib/clusta/geometry/vertex_arrows.rb +0 -45
  60. data/lib/clusta/transforms/vertex_arrows_to_edge_degree_pairs.rb +0 -63
  61. data/spec/clusta/geometry/element_spec.rb +0 -191
  62. data/spec/data/vertex_arrows/directed.unweighted.tsv +0 -7
  63. data/spec/data/vertex_arrows/directed.weighted.tsv +0 -7
  64. data/spec/data/vertex_arrows/undirected.unweighted.tsv +0 -9
  65. data/spec/data/vertex_arrows/undirected.weighted.tsv +0 -9
@@ -1,8 +1,9 @@
1
1
  module Clusta
2
2
  module Geometry
3
3
 
4
- class Arrow < Element
5
- field :target_label
4
+ class Neighbor < Element
5
+
6
+ key :label
6
7
  field :weight, :optional => true
7
8
 
8
9
  def directed?
@@ -13,4 +14,3 @@ module Clusta
13
14
 
14
15
  end
15
16
  end
16
-
@@ -0,0 +1,41 @@
1
+ module Clusta
2
+ module Geometry
3
+
4
+ class Neighborhood < Vertex
5
+
6
+ extra_inputs :neighbors
7
+
8
+ def joins? label
9
+ neighbors.detect { |neighbor| neighbor.label == label }
10
+ end
11
+
12
+ def directed?
13
+ false
14
+ end
15
+
16
+ def size
17
+ neighbors.size
18
+ end
19
+
20
+ def degree_pairs
21
+ neighbors.map do |neighbor|
22
+ # This vertex's degree is just the size of this
23
+ # neighborhood.
24
+ #
25
+ # We don't know anything about each neighbor's degree other
26
+ # than it must be at least 1 b/c it's in this vertex's
27
+ # neighborhood.
28
+ DegreePair.new(label, neighbor.label, size, 1)
29
+ end
30
+ end
31
+
32
+ def reversed_degree_pairs
33
+ neighbors.map do |neighbor|
34
+ DegreePair.new(neighbor.label, label, 1, size)
35
+ end
36
+ end
37
+
38
+ end
39
+
40
+ end
41
+ end
@@ -2,7 +2,10 @@ module Clusta
2
2
  module Geometry
3
3
 
4
4
  class Vertex < Element
5
- field :label
5
+
6
+ abbreviate 'V'
7
+
8
+ key :label
6
9
  end
7
10
 
8
11
  end
@@ -1,15 +1,112 @@
1
+ require 'configliere'
2
+
3
+ Settings.use :commandline
4
+
5
+ Settings.define :transform, :description => "The name of the tranformation to run.", :required => false
6
+ Settings.define :list_transforms, :description => "List known transformations.", :required => false, :type => :boolean, :default => false
7
+ Settings.define :class_names, :description => "The output format for class names, one of: 'long', 'medium', or 'short'.", :required => true, :default => 'medium'
8
+ Settings.define :serialize, :description => "The serialization format for data, one of: 'json' or 'tsv'.", :required => true, :default => 'tsv'
9
+ Settings.define :transforms_path, :description => "A colon-separated list of directories to require transform definitions.", :required => false, :default => ''
10
+ Settings.define :geometry_path, :description => "A colon-separated list of directories to require geometry definitions.", :required => false, :default => ''
11
+
1
12
  module Clusta
2
13
 
3
14
  class Runner
4
15
 
5
- def initialize args
6
- self.args = args
16
+ RUN_ARG_REGEXP = /--run=./
17
+
18
+
19
+ def initialize name, argv
20
+ @name = name
21
+ @argv = argv
7
22
  end
8
23
 
9
24
  def run!
10
- Settings.resolve!
25
+ begin
26
+ Settings.resolve!
27
+ case
28
+ when Settings[:list_transforms]
29
+ load_transforms!
30
+ list_transforms!
31
+ when Settings[:list_geometry]
32
+ load_geometry!
33
+ list_geometry!
34
+ when Settings[:transform]
35
+ load_transforms!
36
+ load_geometry!
37
+ run_transform!
38
+ when Settings[:map_command] || Settings[:reduce_command]
39
+ run_map_reduce!
40
+ else
41
+ print_help!
42
+ end
43
+ rescue Clusta::Error => e
44
+ $stderr.puts "ERROR: #{e.message}"
45
+ exit(1)
46
+ end
47
+ end
48
+
49
+ def load_transforms!
50
+ rb_files_within(:transforms_path) do |path|
51
+ Clusta::Transforms.load_from(path)
52
+ end
53
+ end
54
+
55
+ def load_geometry!
56
+ rb_files_within(:geometry_path) do |path|
57
+ Clusta::Geometry.load_from(path)
58
+ end
59
+ end
60
+
61
+ def rb_files_within key, &block
62
+ return if Settings[key].nil? || Settings[key].empty?
63
+ Settings[key].split(':').each do |dir|
64
+ expanded = File.expand_path(dir)
65
+ unless File.directory?(expanded)
66
+ $stderr.puts("WARNING: #{expanded} is not a directory")
67
+ next
68
+ end
69
+ Dir[File.join(expanded, '*.rb')].each do |path|
70
+ yield path
71
+ end
72
+ end
73
+ end
74
+
75
+ def list_transforms!
76
+ puts Clusta::Transforms.listing
77
+ end
78
+
79
+ def list_geometry!
80
+ puts Clusta::Geometry.listing
81
+ end
82
+
83
+ def run_transform!
84
+ transform = Clusta::Transforms.from_name(Settings[:transform])
85
+ ::ARGV.replace(@argv)
86
+ ::ARGV.push('--run=local') unless ARGV.any? { |arg| arg =~ self.class::RUN_ARG_REGEXP }
87
+ script = Clusta::Transforms.script_for(transform)
88
+ script.run
89
+ end
90
+
91
+ def run_map_reduce!
92
+ ::ARGV.replace(@argv)
93
+ ::ARGV.push('--run=local') unless ARGV.any? { |arg| arg =~ self.class::RUN_ARG_REGEXP }
94
+ begin
95
+ s = Wukong::Script.new(nil, nil)
96
+ rescue RuntimeError => e
97
+ raise Error.new(e.message)
98
+ end
99
+ s.run
100
+ end
101
+
102
+ def print_help!
103
+ begin
104
+ s = Wukong::Script.new(nil, nil)
105
+ rescue RuntimeError => e
106
+ raise Error.new(e.message)
107
+ end
108
+ s.run
11
109
  end
12
110
 
13
111
  end
14
-
15
112
  end
@@ -0,0 +1,100 @@
1
+ module Clusta
2
+ module Schema
3
+
4
+ def extra_inputs
5
+ @extra_inputs ||= []
6
+ end
7
+ attr_writer :extra_inputs
8
+
9
+ def extra_outputs
10
+ extra_inputs.map(&:to_s)
11
+ end
12
+
13
+ def fields
14
+ self.class.fields
15
+ end
16
+
17
+ def keys
18
+ self.class.keys
19
+ end
20
+
21
+ def non_key_fields
22
+ self.class.non_key_fields
23
+ end
24
+
25
+ def self.included klass
26
+ klass.extend(ClassMethods)
27
+ class << klass ; attr_reader :fields ; end
28
+ klass.instance_variable_set('@fields', [])
29
+ end
30
+
31
+ module ClassMethods
32
+
33
+ def extra_inputs name
34
+ alias_method name, :extra_inputs
35
+ end
36
+
37
+ def inherited(subclass)
38
+ subclass.instance_variable_set("@fields", @fields.dup)
39
+ super
40
+ end
41
+
42
+ def field_names
43
+ @fields.map { |field| field[:name].to_s }
44
+ end
45
+
46
+ def has_optional_field?
47
+ @fields.any? { |field| field[:optional] }
48
+ end
49
+
50
+ def has_non_key_field?
51
+ @fields.any? { |field| ! field[:key] }
52
+ end
53
+
54
+ def optional_field
55
+ @fields.detect { |field| field[:optional] }
56
+ end
57
+
58
+ def keys
59
+ @fields.find_all { |field| field[:key] }
60
+ end
61
+
62
+ def non_key_fields
63
+ @fields.find_all { |field| ! field[:key] }
64
+ end
65
+
66
+ def field name, options={}
67
+ raise AmbiguousArgumentsError.new("Cannot define a second optional field #{name} because field #{optional_field[:name]} is already optional.") if has_optional_field?
68
+
69
+ raise SortError.new("The first field defined must be a key that can be sorted on.") if @fields.empty? && (!options[:key])
70
+ raise SortError.new("Cannot define a key field #{name} because some non-key fields have already been defined.") if options[:key] && has_non_key_field?
71
+ raise SortError.new("Key fields (#{name}) cannot have type :geometry") if options[:key] && options[:type] == :geometry
72
+
73
+ attr_reader name
74
+
75
+ case options[:type]
76
+ when :int
77
+ define_method "#{name}=" do |val|
78
+ instance_variable_set("@#{name}", val.to_i)
79
+ end
80
+ when :float
81
+ define_method "#{name}=" do |val|
82
+ instance_variable_set("@#{name}", val.to_f)
83
+ end
84
+ else
85
+ define_method "#{name}=" do |val|
86
+ instance_variable_set("@#{name}", val)
87
+ end
88
+ end
89
+ @fields << options.merge(:name => name)
90
+ end
91
+
92
+ def key name, options={}
93
+ field name, options.merge(:key => true)
94
+ end
95
+
96
+ end
97
+
98
+ end
99
+ end
100
+
@@ -0,0 +1,63 @@
1
+ module Clusta
2
+
3
+ # Defines methods that allow a class to (de)serialize itself in a
4
+ # way compatible with Wukong.
5
+ module Serialization
6
+
7
+ autoload :TSV, 'clusta/serialization/tsv'
8
+ autoload :JSON, 'clusta/serialization/json'
9
+
10
+ def self.included klass
11
+ klass.extend(ClassMethods)
12
+ end
13
+
14
+ def stream_name
15
+ self.class.stream_name
16
+ end
17
+
18
+ def initialize *args
19
+ process_args(*args)
20
+ end
21
+
22
+ def process_args *args
23
+ end
24
+
25
+ module ClassMethods
26
+
27
+ def set_stream_name string
28
+ Geometry.register_element self, string
29
+ @stream_name = string
30
+ end
31
+
32
+ def abbreviate string
33
+ Geometry.register_element self, string
34
+ @abbreviation = string
35
+ end
36
+
37
+ def abbreviation
38
+ @abbreviation
39
+ end
40
+
41
+ def all_stream_names
42
+ [stream_name].tap do |names|
43
+ names << abbreviation if abbreviation
44
+ names << to_s
45
+ names << to_s.split('::').last if respond_to?(:name) && name
46
+ end
47
+ end
48
+
49
+ def stream_name
50
+ return @stream_name if @stream_name
51
+ case
52
+ when defined?(Settings) && Settings[:class_names].to_s == 'short' && abbreviation
53
+ @stream_name = abbreviation
54
+ when defined?(Settings) && Settings[:class_names].to_s == 'long'
55
+ @stream_name = to_s
56
+ else
57
+ @stream_name = to_s.split("::").last
58
+ end
59
+ end
60
+ end
61
+
62
+ end
63
+ end
@@ -0,0 +1,86 @@
1
+ require 'json'
2
+
3
+ module Clusta
4
+ module Serialization
5
+ module JSON
6
+
7
+ def to_hash
8
+ {}.tap do |json|
9
+ fields.each do |field|
10
+ value = send(field[:name])
11
+ value = value.to_hash if value.respond_to?(:to_hash)
12
+ json[field[:name]] = value
13
+ end
14
+ end
15
+ end
16
+
17
+ def to_flat
18
+ [stream_name].tap do |record|
19
+ keys.each do |key|
20
+ record << self.send(key[:name])
21
+ end
22
+ data = non_key_field_data
23
+ record << data.to_json unless data.empty?
24
+ record.concat(extra_outputs)
25
+ end
26
+ end
27
+
28
+ def non_key_field_data
29
+ {}.tap do |data|
30
+ non_key_fields.each do |field|
31
+ value = send(field[:name])
32
+ value = value.to_hash if value.respond_to?(:to_hash)
33
+ data[field[:name]] = value unless value.nil? && field[:optional]
34
+ end
35
+ end
36
+ end
37
+
38
+ def process_args *args
39
+ json_index = 0
40
+ self.class.keys.each_with_index do |key, index|
41
+ self.send("#{key[:name]}=", args[index])
42
+ json_index = index + 1
43
+ end
44
+
45
+ if args[json_index]
46
+ if args[json_index].is_a?(Hash)
47
+ data = args[json_index]
48
+ else
49
+ data = ::JSON.parse(args[json_index])
50
+ end
51
+ else
52
+ data = {}
53
+ end
54
+
55
+ non_key_fields.each do |field|
56
+ name = field[:name].to_s
57
+ case
58
+ when field[:optional]
59
+ self.send("#{name}=", data[name]) if data.has_key?(name)
60
+ when (!data.has_key?(name))
61
+ raise ArgumentError.new("A #{self.class} requires a non-nil value for #{name}.")
62
+ when field[:type] == :geometry
63
+ self.send("#{name}=", data[name])
64
+ else
65
+ self.send("#{name}=", data[name])
66
+ end
67
+ end
68
+
69
+ self.extra_inputs = (args[(json_index + 1)..-1] || [])
70
+ end
71
+
72
+ def self.included klass
73
+ klass.extend(ClassMethods)
74
+ end
75
+
76
+ module ClassMethods
77
+
78
+ def from_json_component data
79
+
80
+ end
81
+
82
+ end
83
+
84
+ end
85
+ end
86
+ end
@@ -0,0 +1,81 @@
1
+ module Clusta
2
+ module Serialization
3
+ module TSV
4
+
5
+ def to_flat
6
+ [stream_name].tap do |record|
7
+ fields.each do |field|
8
+ value = send(field[:name])
9
+ value = value.to_tsv_component if field[:type] == :geometry
10
+ record << value.to_s unless value.nil? && field[:optional]
11
+ end
12
+ record.concat(extra_outputs)
13
+ end
14
+ end
15
+
16
+ def suffix index
17
+ case index.to_s
18
+ when /1$/ then 'st'
19
+ when /2$/ then 'nd'
20
+ when /3$/ then 'rd'
21
+ else 'th'
22
+ end
23
+ end
24
+
25
+ def process_args *args
26
+ self.class.fields.each_with_index do |field, index|
27
+ case
28
+ when field[:optional]
29
+ self.send("#{field[:name]}=", args[index]) if args[index]
30
+ when args[index].nil?
31
+ raise ArgumentError.new("A #{self.class} requires a non-nil value for #{field[:name]} as its #{index}#{suffix(index)} argument.")
32
+ when field[:type] == :geometry
33
+ self.send("#{field[:name]}=", self.class.from_tsv_component_string(args[index]))
34
+ else
35
+ self.send("#{field[:name]}=", args[index])
36
+ end
37
+ end
38
+ self.extra_inputs = (args[self.class.fields.size..-1] || [])
39
+ end
40
+
41
+ def extra_inputs= inputs
42
+ @extra_inputs = inputs.map do |input|
43
+ if input =~ /^[A-Z].*;/
44
+ self.class.from_tsv_component_string(input)
45
+ else
46
+ input
47
+ end
48
+ end
49
+ end
50
+
51
+ def extra_outputs
52
+ @extra_inputs.map do |input|
53
+ if input.respond_to?(:to_tsv_component)
54
+ input.to_tsv_component
55
+ else
56
+ input
57
+ end
58
+ end
59
+ end
60
+
61
+ def self.included klass
62
+ klass.extend(ClassMethods)
63
+ end
64
+
65
+ def to_tsv_component
66
+ to_flat.join(";")
67
+ end
68
+
69
+ module ClassMethods
70
+
71
+ def from_tsv_component_string string
72
+ return string unless string.is_a?(String)
73
+ args = string.split(';')
74
+ klass_name = args.shift
75
+ raise ArgumentError.new("Elements instantiated from a TSV component string must match the format 'klass;[field1;[field2;]...]'") unless klass_name
76
+ Wukong.class_from_resource(klass_name).new(*args)
77
+ end
78
+ end
79
+ end
80
+ end
81
+ end