clusta 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. data/README.rdoc +66 -0
  2. data/VERSION +1 -1
  3. data/bin/clusta +1 -28
  4. data/lib/clusta.rb +12 -3
  5. data/lib/clusta/geometry.rb +53 -8
  6. data/lib/clusta/geometry/all.rb +3 -0
  7. data/lib/clusta/geometry/assortativity.rb +2 -2
  8. data/lib/clusta/geometry/degree.rb +3 -1
  9. data/lib/clusta/geometry/{edge_degree_pair.rb → degree_pair.rb} +3 -3
  10. data/lib/clusta/geometry/directed/degree.rb +3 -1
  11. data/lib/clusta/geometry/directed/{edge_degree_pair.rb → degree_pair.rb} +4 -3
  12. data/lib/clusta/geometry/directed/edge.rb +4 -2
  13. data/lib/clusta/geometry/directed/{arrow.rb → neighbor.rb} +1 -1
  14. data/lib/clusta/geometry/directed/neighborhood.rb +31 -0
  15. data/lib/clusta/geometry/edge.rb +6 -4
  16. data/lib/clusta/geometry/element.rb +10 -117
  17. data/lib/clusta/geometry/{arrow.rb → neighbor.rb} +3 -3
  18. data/lib/clusta/geometry/neighborhood.rb +41 -0
  19. data/lib/clusta/geometry/vertex.rb +4 -1
  20. data/lib/clusta/runner.rb +101 -4
  21. data/lib/clusta/schema.rb +100 -0
  22. data/lib/clusta/serialization.rb +63 -0
  23. data/lib/clusta/serialization/json.rb +86 -0
  24. data/lib/clusta/serialization/tsv.rb +81 -0
  25. data/lib/clusta/transforms.rb +59 -26
  26. data/lib/clusta/transforms/{edge_degree_pairs_to_assortativities.rb → degree_pairs_to_assortativities.rb} +7 -3
  27. data/lib/clusta/transforms/edges_to_degrees.rb +5 -0
  28. data/lib/clusta/transforms/{edges_to_vertex_arrows.rb → edges_to_neighborhoods.rb} +11 -6
  29. data/lib/clusta/transforms/import.rb +6 -0
  30. data/lib/clusta/transforms/neighborhoods_to_degree_pairs.rb +70 -0
  31. data/lib/clusta/transforms/pm3d.rb +46 -0
  32. data/lib/clusta/transforms/prune_edges.rb +34 -0
  33. data/spec/clusta/schema_spec.rb +36 -0
  34. data/spec/clusta/serialization/json_spec.rb +133 -0
  35. data/spec/clusta/serialization/tsv_spec.rb +133 -0
  36. data/spec/clusta/serialization_spec.rb +27 -0
  37. data/spec/clusta/transforms/degree_pairs_to_assortativities_spec.rb +13 -0
  38. data/spec/clusta/transforms/{edges_to_vertex_arrows_spec.rb → edges_to_neighborhoods_spec.rb} +5 -5
  39. data/spec/clusta/transforms/import_spec.rb +9 -0
  40. data/spec/clusta/transforms/neighborhoods_to_degree_pairs_spec.rb +21 -0
  41. data/spec/clusta/transforms/prune_edges_spec.rb +22 -0
  42. data/spec/data/assortativities/directed.tsv +4 -0
  43. data/spec/data/assortativities/undirected.tsv +7 -0
  44. data/spec/data/degree_pairs/directed.tsv +10 -0
  45. data/spec/data/degree_pairs/undirected.tsv +18 -0
  46. data/spec/data/external/vertices.tsv +9 -0
  47. data/spec/data/imports/vertices.labeled.tsv +9 -0
  48. data/spec/data/neighborhoods/directed.unweighted.tsv +7 -0
  49. data/spec/data/neighborhoods/directed.weighted.tsv +7 -0
  50. data/spec/data/neighborhoods/undirected.unweighted.tsv +9 -0
  51. data/spec/data/neighborhoods/undirected.weighted.tsv +9 -0
  52. data/spec/data/pruned_edges/directed.unweighted.tsv +1 -0
  53. data/spec/data/pruned_edges/directed.weighted.tsv +3 -0
  54. data/spec/data/pruned_edges/undirected.unweighted.tsv +1 -0
  55. data/spec/data/pruned_edges/undirected.weighted.tsv +3 -0
  56. data/spec/support/transforms_spec_helper.rb +5 -1
  57. metadata +47 -23
  58. data/lib/clusta/geometry/directed/vertex_arrows.rb +0 -25
  59. data/lib/clusta/geometry/vertex_arrows.rb +0 -45
  60. data/lib/clusta/transforms/vertex_arrows_to_edge_degree_pairs.rb +0 -63
  61. data/spec/clusta/geometry/element_spec.rb +0 -191
  62. data/spec/data/vertex_arrows/directed.unweighted.tsv +0 -7
  63. data/spec/data/vertex_arrows/directed.weighted.tsv +0 -7
  64. data/spec/data/vertex_arrows/undirected.unweighted.tsv +0 -9
  65. data/spec/data/vertex_arrows/undirected.weighted.tsv +0 -9
@@ -2,25 +2,12 @@ module Clusta
2
2
 
3
3
  module Transforms
4
4
 
5
- def self.register_streamable klass, aliases=nil
6
- (aliases || [klass.to_s]).each do |klass_alias|
7
- Wukong::RESOURCE_CLASS_MAP[klass_alias] = klass
8
- end
9
- end
10
-
11
- def self.register_transform name, path
12
- autoload name, path
13
- end
14
-
15
- Dir[File.join(File.dirname(__FILE__), "transforms/*.rb")].each do |path|
16
- require_name = Clusta.require_name(path)
17
- register_transform Clusta.classify(require_name), "clusta/transforms/#{require_name}"
5
+ def self.names
6
+ @names ||= []
18
7
  end
19
8
 
20
- ARG_REGEXP = /--transform=[\w\d_]+/
21
-
22
- def self.from_arg arg
23
- from_name(arg.split('=').last)
9
+ def self.register_transform name
10
+ names << name
24
11
  end
25
12
 
26
13
  def self.from_name name
@@ -34,17 +21,63 @@ module Clusta
34
21
  def self.script_for transform
35
22
  mapper = transform::Mapper if defined?(transform::Mapper)
36
23
  reducer = transform::Reducer if defined?(transform::Reducer)
37
- Wukong::Script.new(mapper, reducer)
24
+ options = (transform.respond_to?(:options) ? transform.options : {})
25
+ script = defined?(transform::Script) ? transform::Script : default_script
26
+ script.new(mapper, reducer, options)
27
+ end
28
+
29
+ def self.default_script
30
+ Class.new(Wukong::Script).tap do |c|
31
+ c.class_eval do
32
+ def local_mode_sort_commandline
33
+ "sort -n -k2"
34
+ end
35
+ end
36
+ end
38
37
  end
39
-
40
- end
41
38
 
42
- Geometry::ELEMENTS.each do |element_name|
43
- Transforms.register_streamable Geometry.const_get(element_name), [
44
- "Clusta::Geometry::#{element_name}",
45
- "Geometry::#{element_name}",
46
- element_name.to_s
47
- ]
39
+ def self.has_mapper?(transform)
40
+ defined?(transform::Mapper)
41
+ end
42
+
43
+ def self.has_reducer?(transform)
44
+ defined?(transform::Reducer)
45
+ end
46
+
47
+ Dir[File.join(File.dirname(__FILE__), "transforms/*.rb")].each do |path|
48
+ require_name = Clusta.require_name(path)
49
+ autoload Clusta.classify(require_name), "clusta/transforms/#{require_name}"
50
+ register_transform require_name
51
+ end
52
+
53
+ def self.listing
54
+ [].tap do |out|
55
+ out << "Known transforms:"
56
+ out << ''
57
+ names.sort.each do |transform_name|
58
+ transform = from_name(transform_name)
59
+ name_suffix = case
60
+ when has_mapper?(transform) && has_reducer?(transform) then ''
61
+ when (! has_mapper?(transform)) && has_reducer?(transform) then ' (reduce-only)'
62
+ when has_mapper?(transform) && (! has_reducer?(transform)) then ' (map-only)'
63
+ when (! has_mapper?(transform)) && (! has_reducer?(transform)) then ' (nothing)'
64
+ end
65
+
66
+ out << " #{transform_name}#{name_suffix}"
67
+ if transform.respond_to?(:help)
68
+ out << ''
69
+ out << " #{transform.help}"
70
+ end
71
+ out << ''
72
+ end
73
+ end.join("\n")
74
+ end
75
+
76
+ def self.load_from path
77
+ class_eval(File.read(path), path)
78
+ register_transform(Clusta.require_name(path))
79
+ end
80
+
48
81
  end
49
82
 
50
83
  end
@@ -1,13 +1,17 @@
1
+ require 'clusta/geometry/degree_pair'
2
+ require 'clusta/geometry/directed/degree_pair'
3
+ require 'clusta/geometry/assortativity'
4
+
1
5
  module Clusta
2
6
 
3
7
  module Transforms
4
8
 
5
- module EdgeDegreePairsToAssortativities
9
+ module DegreePairsToAssortativities
6
10
 
7
11
  class Mapper < Wukong::Streamer::StructStreamer
8
12
 
9
- def process edge_degree_pair, *record
10
- emit edge_degree_pair.assortativity
13
+ def process degree_pair, *record
14
+ emit degree_pair.assortativity
11
15
  end
12
16
 
13
17
  end
@@ -1,3 +1,8 @@
1
+ require 'clusta/geometry/edge'
2
+ require 'clusta/geometry/directed/edge'
3
+ require 'clusta/geometry/degree'
4
+ require 'clusta/geometry/directed/degree'
5
+
1
6
  module Clusta
2
7
 
3
8
  module Transforms
@@ -1,8 +1,13 @@
1
+ require 'clusta/geometry/edge'
2
+ require 'clusta/geometry/directed/edge'
3
+ require 'clusta/geometry/neighborhood'
4
+ require 'clusta/geometry/directed/neighborhood'
5
+
1
6
  module Clusta
2
7
 
3
8
  module Transforms
4
9
 
5
- module EdgesToVertexArrows
10
+ module EdgesToNeighborhoods
6
11
 
7
12
  class Mapper < Wukong::Streamer::StructStreamer
8
13
 
@@ -15,7 +20,7 @@ module Clusta
15
20
 
16
21
  class Reducer < Wukong::Streamer::AccumulatingReducer
17
22
 
18
- attr_accessor :arrows, :directed
23
+ attr_accessor :neighbors, :directed
19
24
 
20
25
  include Wukong::Streamer::StructRecordizer
21
26
 
@@ -26,19 +31,19 @@ module Clusta
26
31
  end
27
32
 
28
33
  def start! new_edge, *record
29
- self.arrows = []
34
+ self.neighbors = []
30
35
  self.directed = new_edge.directed?
31
36
  end
32
37
 
33
38
  def accumulate new_edge, *record
34
- self.arrows << new_edge.arrow
39
+ self.neighbors << new_edge.neighbor
35
40
  end
36
41
 
37
42
  def finalize &block
38
43
  if directed
39
- emit Geometry::DirectedVertexArrows.new(vertex_label, *arrows)
44
+ emit Geometry::DirectedNeighborhood.new(vertex_label, *neighbors)
40
45
  else
41
- emit Geometry::VertexArrows.new(vertex_label, *arrows)
46
+ emit Geometry::Neighborhood.new(vertex_label, *neighbors)
42
47
  end
43
48
  end
44
49
 
@@ -1,9 +1,15 @@
1
+ Settings.define :as, :required => true, :description => "Name of the Clusta class to import data as."
2
+
1
3
  module Clusta
2
4
 
3
5
  module Transforms
4
6
 
5
7
  module Import
6
8
 
9
+ def self.help
10
+ "Import data into the format expected by Clusta."
11
+ end
12
+
7
13
  class Mapper < Wukong::Streamer::Base
8
14
 
9
15
  def process *record
@@ -0,0 +1,70 @@
1
+ require 'clusta/geometry/neighborhood'
2
+ require 'clusta/geometry/directed/neighborhood'
3
+ require 'clusta/geometry/neighbor'
4
+ require 'clusta/geometry/directed/neighbor'
5
+ require 'clusta/geometry/degree_pair'
6
+ require 'clusta/geometry/directed/degree_pair'
7
+
8
+ module Clusta
9
+
10
+ module Transforms
11
+
12
+ module NeighborhoodsToDegreePairs
13
+
14
+ class Mapper < Wukong::Streamer::StructStreamer
15
+
16
+ def process neighborhood, *record
17
+ neighborhood.reversed_degree_pairs.each { |degree_pair| emit(degree_pair) }
18
+ end
19
+
20
+ end
21
+
22
+ class Reducer < Wukong::Streamer::AccumulatingReducer
23
+
24
+ attr_accessor :degree_pairs, :source_degree, :source_in_degree, :source_out_degree
25
+
26
+ include Wukong::Streamer::StructRecordizer
27
+
28
+ def get_key new_degree_pair, *record
29
+ new_degree_pair.source_label
30
+ end
31
+
32
+ def start! new_degree_pair, *record
33
+ self.degree_pairs = []
34
+ if new_degree_pair.directed?
35
+ self.source_in_degree = 0
36
+ self.source_out_degree = 0
37
+ else
38
+ self.source_degree = 0
39
+ end
40
+ end
41
+
42
+ def accumulate new_degree_pair, *record
43
+ self.degree_pairs << new_degree_pair
44
+ if new_degree_pair.directed?
45
+ self.source_in_degree += new_degree_pair.source_in_degree_value
46
+ self.source_out_degree += new_degree_pair.source_out_degree_value
47
+ else
48
+ self.source_degree += new_degree_pair.source_degree_value
49
+ end
50
+ end
51
+
52
+ def finalize &block
53
+ degree_pairs.each do |degree_pair|
54
+ if degree_pair.directed?
55
+ degree_pair.source_in_degree_value = source_in_degree
56
+ degree_pair.source_out_degree_value = source_out_degree
57
+ else
58
+ degree_pair.source_degree_value = source_degree
59
+ end
60
+ emit degree_pair
61
+ end
62
+ end
63
+
64
+ end
65
+
66
+ end
67
+ end
68
+
69
+ end
70
+
@@ -0,0 +1,46 @@
1
+ module Clusta
2
+
3
+ module Transforms
4
+
5
+ module Pm3d
6
+
7
+ class Mapper < Wukong::Streamer::Base
8
+
9
+ def process *record
10
+ if record.first && record.first =~ /[^\d]/
11
+ emit record[1..-1]
12
+ else
13
+ emit record
14
+ end
15
+ end
16
+ end
17
+
18
+ class Reducer < Wukong::Streamer::AccumulatingReducer
19
+
20
+ attr_accessor :records
21
+
22
+ def start! *record
23
+ self.records = []
24
+ end
25
+
26
+ def accumulate *record
27
+ self.records << record
28
+ end
29
+
30
+ def finalize &block
31
+ records.each { |record| emit(record) }
32
+ emit []
33
+ end
34
+
35
+ end
36
+
37
+ class Script < Wukong::Script
38
+ def local_mode_sort_commandline
39
+ "sort -n -k1 -k2"
40
+ end
41
+ end
42
+
43
+ end
44
+ end
45
+
46
+ end
@@ -0,0 +1,34 @@
1
+ require 'clusta/geometry/edge'
2
+ require 'clusta/geometry/directed/edge'
3
+
4
+ Settings.define :min_weight, :type => Float, :description => "Prune edges with weight less than this weight."
5
+ Settings.define :max_weight, :type => Float, :description => "Prune edges with weight more than this weight."
6
+
7
+ module Clusta
8
+
9
+ module Transforms
10
+
11
+ module PruneEdges
12
+
13
+ class Mapper < Wukong::Streamer::StructStreamer
14
+
15
+ def before_stream
16
+ raise ArgumentError.new("Must specify either a min_weight or a max_weight") if Settings[:min_weight].nil? && Settings[:max_weight].nil?
17
+ end
18
+
19
+ def within_weight_range? edge
20
+ return false if Settings[:min_weight] && Settings[:min_weight] > edge.weight.to_f
21
+ return false if Settings[:max_weight] && Settings[:max_weight] < edge.weight.to_f
22
+ true
23
+ end
24
+
25
+ def process edge, *record
26
+ emit(edge) if edge.weighted? && within_weight_range?(edge)
27
+ end
28
+
29
+ end
30
+
31
+ end
32
+ end
33
+
34
+ end
@@ -0,0 +1,36 @@
1
+ require 'spec_helper'
2
+
3
+ describe Clusta::Schema do
4
+
5
+ before do
6
+ @root = Class.new
7
+ @root.send(:include, Clusta::Schema)
8
+ end
9
+
10
+ it "should not define any fields of its own" do
11
+ @root.fields.should == []
12
+ end
13
+
14
+ it "should allow a subclass to set its own fields without polluting the parent" do
15
+ subclass = Class.new(@root)
16
+ subclass.key :foo
17
+ @root.field_names.should_not include('foo')
18
+ subclass.field_names.should include('foo')
19
+ end
20
+
21
+ it "should allow a subclass of a subclass to set its own fields without polluting the parent" do
22
+ subclass1 = Class.new(@root)
23
+ subclass1.key :foo
24
+ subclass2 = Class.new(subclass1)
25
+ subclass2.key :bar
26
+
27
+ @root.field_names.should_not include('foo')
28
+ @root.field_names.should_not include('bar')
29
+
30
+ subclass1.field_names.should include('foo')
31
+ subclass1.field_names.should_not include('bar')
32
+
33
+ subclass2.field_names.should include('foo')
34
+ subclass2.field_names.should include('bar')
35
+ end
36
+ end
@@ -0,0 +1,133 @@
1
+ require 'spec_helper'
2
+
3
+ describe Clusta::Serialization::JSON do
4
+
5
+ def json_serializable_wrapper_class
6
+ Class.new.tap do |c|
7
+ c.send(:include, Clusta::Schema)
8
+ c.send(:include, Clusta::Serialization)
9
+ c.send(:include, Clusta::Serialization::JSON)
10
+ end
11
+ end
12
+
13
+ before do
14
+ @root = json_serializable_wrapper_class
15
+ @root.key :foo
16
+
17
+ @child = json_serializable_wrapper_class
18
+ @child.key :baz
19
+
20
+ @child.set_stream_name 'Child'
21
+ end
22
+
23
+ describe "processing inputs" do
24
+
25
+ it "should assign a single key field correctly" do
26
+ instance = @root.new("foovalue")
27
+ instance.foo.should == "foovalue"
28
+ end
29
+
30
+ it "should assign a multiple key fields correctly" do
31
+ @root.key :bar
32
+ instance = @root.new("foovalue", "barvalue")
33
+ instance.foo.should == "foovalue"
34
+ instance.bar.should == "barvalue"
35
+ end
36
+
37
+ it "should assign and parse key and non-key fields when given as Ruby hashes" do
38
+ @root.field :bar
39
+ @root.field :baz
40
+ instance = @root.new("foovalue", {"bar" => "barvalue", "baz" => "bazvalue"})
41
+ instance.foo.should == "foovalue"
42
+ instance.bar.should == "barvalue"
43
+ instance.baz.should == "bazvalue"
44
+ end
45
+
46
+ it "should assign and parse key and non-key fields when given as a JSON string" do
47
+ @root.field :bar
48
+ @root.field :baz
49
+ instance = @root.new("foovalue", '{"bar":"barvalue", "baz":"bazvalue"}')
50
+ instance.foo.should == "foovalue"
51
+ instance.bar.should == "barvalue"
52
+ instance.baz.should == "bazvalue"
53
+ end
54
+
55
+ it "should assign a value to an optional field only if it's present" do
56
+ @root.field :bar
57
+ @root.field :baz, :optional => true
58
+ instance = @root.new("foovalue", {"bar" => "barvalue"})
59
+ instance.foo.should == "foovalue"
60
+ instance.bar.should == "barvalue"
61
+ instance.baz.should be_nil
62
+
63
+ instance = @root.new("foovalue", {"bar" => "barvalue", "baz" => "bazvalue"})
64
+ instance.foo.should == "foovalue"
65
+ instance.bar.should == "barvalue"
66
+ instance.baz.should == "bazvalue"
67
+ end
68
+
69
+ it "should stash extra arguments it receives at initialization" do
70
+ @root.field :bar
71
+ instance = @root.new("foovalue", {"bar" => "barvalue"}, "extra1", "extra2")
72
+ instance.foo.should == "foovalue"
73
+ instance.bar.should == "barvalue"
74
+ instance.extra_inputs.size.should == 2
75
+ instance.extra_inputs[0].should == "extra1"
76
+ instance.extra_inputs[1].should == "extra2"
77
+ end
78
+
79
+ end
80
+
81
+ describe "serializing" do
82
+
83
+ it "returns an array appropriate for Wukong with a single key field" do
84
+ output = @root.new("foovalue").to_flat
85
+ output[0].should == @root.stream_name
86
+ output[1].should == "foovalue"
87
+ end
88
+
89
+ it "returns an array appropriate for Wukong with a multiple key fields" do
90
+ @root.key :bar
91
+ output = @root.new("foovalue", "barvalue").to_flat
92
+ output[0].should == @root.stream_name
93
+ output[1].should == "foovalue"
94
+ output[2].should == "barvalue"
95
+ end
96
+
97
+ it "returns an array appropriate for Wukong with keys and non key fields" do
98
+ @root.field :bar
99
+ output = @root.new("foovalue", { 'bar' => 'barvalue'}).to_flat
100
+ output[0].should == @root.stream_name
101
+ output[1].should == "foovalue"
102
+ output[2].should match(/"bar" *: *"barvalue"/)
103
+ end
104
+
105
+ it "returns an array with an optional field at the end only if it has a value" do
106
+ @root.field :bar, :optional => true
107
+ output1 = @root.new("foovalue").to_flat
108
+ output2 = @root.new("foovalue", {"bar" => "barvalue"}).to_flat
109
+
110
+ output1.size.should == 2
111
+ output1[0].should == @root.stream_name
112
+ output1[1].should == 'foovalue'
113
+
114
+ output2.size.should == 3
115
+ output2[0].should == @root.stream_name
116
+ output2[1].should == 'foovalue'
117
+ output2[2].should match(/"bar" *: *"barvalue"/)
118
+ end
119
+
120
+ it "returns an array including any extra inputs it received as outputs" do
121
+ @root.field :bar
122
+ output = @root.new("foovalue", {"bar" => "barvalue"}, "extra1", "extra2").to_flat
123
+ output.size.should == 5
124
+ output[0].should == @root.stream_name
125
+ output[1].should == "foovalue"
126
+ output[2].should match(/"bar" *: *"barvalue"/)
127
+ output[3].should == 'extra1'
128
+ output[4].should == 'extra2'
129
+ end
130
+
131
+ end
132
+
133
+ end