clusta 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. data/README.rdoc +66 -0
  2. data/VERSION +1 -1
  3. data/bin/clusta +1 -28
  4. data/lib/clusta.rb +12 -3
  5. data/lib/clusta/geometry.rb +53 -8
  6. data/lib/clusta/geometry/all.rb +3 -0
  7. data/lib/clusta/geometry/assortativity.rb +2 -2
  8. data/lib/clusta/geometry/degree.rb +3 -1
  9. data/lib/clusta/geometry/{edge_degree_pair.rb → degree_pair.rb} +3 -3
  10. data/lib/clusta/geometry/directed/degree.rb +3 -1
  11. data/lib/clusta/geometry/directed/{edge_degree_pair.rb → degree_pair.rb} +4 -3
  12. data/lib/clusta/geometry/directed/edge.rb +4 -2
  13. data/lib/clusta/geometry/directed/{arrow.rb → neighbor.rb} +1 -1
  14. data/lib/clusta/geometry/directed/neighborhood.rb +31 -0
  15. data/lib/clusta/geometry/edge.rb +6 -4
  16. data/lib/clusta/geometry/element.rb +10 -117
  17. data/lib/clusta/geometry/{arrow.rb → neighbor.rb} +3 -3
  18. data/lib/clusta/geometry/neighborhood.rb +41 -0
  19. data/lib/clusta/geometry/vertex.rb +4 -1
  20. data/lib/clusta/runner.rb +101 -4
  21. data/lib/clusta/schema.rb +100 -0
  22. data/lib/clusta/serialization.rb +63 -0
  23. data/lib/clusta/serialization/json.rb +86 -0
  24. data/lib/clusta/serialization/tsv.rb +81 -0
  25. data/lib/clusta/transforms.rb +59 -26
  26. data/lib/clusta/transforms/{edge_degree_pairs_to_assortativities.rb → degree_pairs_to_assortativities.rb} +7 -3
  27. data/lib/clusta/transforms/edges_to_degrees.rb +5 -0
  28. data/lib/clusta/transforms/{edges_to_vertex_arrows.rb → edges_to_neighborhoods.rb} +11 -6
  29. data/lib/clusta/transforms/import.rb +6 -0
  30. data/lib/clusta/transforms/neighborhoods_to_degree_pairs.rb +70 -0
  31. data/lib/clusta/transforms/pm3d.rb +46 -0
  32. data/lib/clusta/transforms/prune_edges.rb +34 -0
  33. data/spec/clusta/schema_spec.rb +36 -0
  34. data/spec/clusta/serialization/json_spec.rb +133 -0
  35. data/spec/clusta/serialization/tsv_spec.rb +133 -0
  36. data/spec/clusta/serialization_spec.rb +27 -0
  37. data/spec/clusta/transforms/degree_pairs_to_assortativities_spec.rb +13 -0
  38. data/spec/clusta/transforms/{edges_to_vertex_arrows_spec.rb → edges_to_neighborhoods_spec.rb} +5 -5
  39. data/spec/clusta/transforms/import_spec.rb +9 -0
  40. data/spec/clusta/transforms/neighborhoods_to_degree_pairs_spec.rb +21 -0
  41. data/spec/clusta/transforms/prune_edges_spec.rb +22 -0
  42. data/spec/data/assortativities/directed.tsv +4 -0
  43. data/spec/data/assortativities/undirected.tsv +7 -0
  44. data/spec/data/degree_pairs/directed.tsv +10 -0
  45. data/spec/data/degree_pairs/undirected.tsv +18 -0
  46. data/spec/data/external/vertices.tsv +9 -0
  47. data/spec/data/imports/vertices.labeled.tsv +9 -0
  48. data/spec/data/neighborhoods/directed.unweighted.tsv +7 -0
  49. data/spec/data/neighborhoods/directed.weighted.tsv +7 -0
  50. data/spec/data/neighborhoods/undirected.unweighted.tsv +9 -0
  51. data/spec/data/neighborhoods/undirected.weighted.tsv +9 -0
  52. data/spec/data/pruned_edges/directed.unweighted.tsv +1 -0
  53. data/spec/data/pruned_edges/directed.weighted.tsv +3 -0
  54. data/spec/data/pruned_edges/undirected.unweighted.tsv +1 -0
  55. data/spec/data/pruned_edges/undirected.weighted.tsv +3 -0
  56. data/spec/support/transforms_spec_helper.rb +5 -1
  57. metadata +47 -23
  58. data/lib/clusta/geometry/directed/vertex_arrows.rb +0 -25
  59. data/lib/clusta/geometry/vertex_arrows.rb +0 -45
  60. data/lib/clusta/transforms/vertex_arrows_to_edge_degree_pairs.rb +0 -63
  61. data/spec/clusta/geometry/element_spec.rb +0 -191
  62. data/spec/data/vertex_arrows/directed.unweighted.tsv +0 -7
  63. data/spec/data/vertex_arrows/directed.weighted.tsv +0 -7
  64. data/spec/data/vertex_arrows/undirected.unweighted.tsv +0 -9
  65. data/spec/data/vertex_arrows/undirected.weighted.tsv +0 -9
@@ -2,25 +2,12 @@ module Clusta
2
2
 
3
3
  module Transforms
4
4
 
5
- def self.register_streamable klass, aliases=nil
6
- (aliases || [klass.to_s]).each do |klass_alias|
7
- Wukong::RESOURCE_CLASS_MAP[klass_alias] = klass
8
- end
9
- end
10
-
11
- def self.register_transform name, path
12
- autoload name, path
13
- end
14
-
15
- Dir[File.join(File.dirname(__FILE__), "transforms/*.rb")].each do |path|
16
- require_name = Clusta.require_name(path)
17
- register_transform Clusta.classify(require_name), "clusta/transforms/#{require_name}"
5
+ def self.names
6
+ @names ||= []
18
7
  end
19
8
 
20
- ARG_REGEXP = /--transform=[\w\d_]+/
21
-
22
- def self.from_arg arg
23
- from_name(arg.split('=').last)
9
+ def self.register_transform name
10
+ names << name
24
11
  end
25
12
 
26
13
  def self.from_name name
@@ -34,17 +21,63 @@ module Clusta
34
21
  def self.script_for transform
35
22
  mapper = transform::Mapper if defined?(transform::Mapper)
36
23
  reducer = transform::Reducer if defined?(transform::Reducer)
37
- Wukong::Script.new(mapper, reducer)
24
+ options = (transform.respond_to?(:options) ? transform.options : {})
25
+ script = defined?(transform::Script) ? transform::Script : default_script
26
+ script.new(mapper, reducer, options)
27
+ end
28
+
29
+ def self.default_script
30
+ Class.new(Wukong::Script).tap do |c|
31
+ c.class_eval do
32
+ def local_mode_sort_commandline
33
+ "sort -n -k2"
34
+ end
35
+ end
36
+ end
38
37
  end
39
-
40
- end
41
38
 
42
- Geometry::ELEMENTS.each do |element_name|
43
- Transforms.register_streamable Geometry.const_get(element_name), [
44
- "Clusta::Geometry::#{element_name}",
45
- "Geometry::#{element_name}",
46
- element_name.to_s
47
- ]
39
+ def self.has_mapper?(transform)
40
+ defined?(transform::Mapper)
41
+ end
42
+
43
+ def self.has_reducer?(transform)
44
+ defined?(transform::Reducer)
45
+ end
46
+
47
+ Dir[File.join(File.dirname(__FILE__), "transforms/*.rb")].each do |path|
48
+ require_name = Clusta.require_name(path)
49
+ autoload Clusta.classify(require_name), "clusta/transforms/#{require_name}"
50
+ register_transform require_name
51
+ end
52
+
53
+ def self.listing
54
+ [].tap do |out|
55
+ out << "Known transforms:"
56
+ out << ''
57
+ names.sort.each do |transform_name|
58
+ transform = from_name(transform_name)
59
+ name_suffix = case
60
+ when has_mapper?(transform) && has_reducer?(transform) then ''
61
+ when (! has_mapper?(transform)) && has_reducer?(transform) then ' (reduce-only)'
62
+ when has_mapper?(transform) && (! has_reducer?(transform)) then ' (map-only)'
63
+ when (! has_mapper?(transform)) && (! has_reducer?(transform)) then ' (nothing)'
64
+ end
65
+
66
+ out << " #{transform_name}#{name_suffix}"
67
+ if transform.respond_to?(:help)
68
+ out << ''
69
+ out << " #{transform.help}"
70
+ end
71
+ out << ''
72
+ end
73
+ end.join("\n")
74
+ end
75
+
76
+ def self.load_from path
77
+ class_eval(File.read(path), path)
78
+ register_transform(Clusta.require_name(path))
79
+ end
80
+
48
81
  end
49
82
 
50
83
  end
@@ -1,13 +1,17 @@
1
+ require 'clusta/geometry/degree_pair'
2
+ require 'clusta/geometry/directed/degree_pair'
3
+ require 'clusta/geometry/assortativity'
4
+
1
5
  module Clusta
2
6
 
3
7
  module Transforms
4
8
 
5
- module EdgeDegreePairsToAssortativities
9
+ module DegreePairsToAssortativities
6
10
 
7
11
  class Mapper < Wukong::Streamer::StructStreamer
8
12
 
9
- def process edge_degree_pair, *record
10
- emit edge_degree_pair.assortativity
13
+ def process degree_pair, *record
14
+ emit degree_pair.assortativity
11
15
  end
12
16
 
13
17
  end
@@ -1,3 +1,8 @@
1
+ require 'clusta/geometry/edge'
2
+ require 'clusta/geometry/directed/edge'
3
+ require 'clusta/geometry/degree'
4
+ require 'clusta/geometry/directed/degree'
5
+
1
6
  module Clusta
2
7
 
3
8
  module Transforms
@@ -1,8 +1,13 @@
1
+ require 'clusta/geometry/edge'
2
+ require 'clusta/geometry/directed/edge'
3
+ require 'clusta/geometry/neighborhood'
4
+ require 'clusta/geometry/directed/neighborhood'
5
+
1
6
  module Clusta
2
7
 
3
8
  module Transforms
4
9
 
5
- module EdgesToVertexArrows
10
+ module EdgesToNeighborhoods
6
11
 
7
12
  class Mapper < Wukong::Streamer::StructStreamer
8
13
 
@@ -15,7 +20,7 @@ module Clusta
15
20
 
16
21
  class Reducer < Wukong::Streamer::AccumulatingReducer
17
22
 
18
- attr_accessor :arrows, :directed
23
+ attr_accessor :neighbors, :directed
19
24
 
20
25
  include Wukong::Streamer::StructRecordizer
21
26
 
@@ -26,19 +31,19 @@ module Clusta
26
31
  end
27
32
 
28
33
  def start! new_edge, *record
29
- self.arrows = []
34
+ self.neighbors = []
30
35
  self.directed = new_edge.directed?
31
36
  end
32
37
 
33
38
  def accumulate new_edge, *record
34
- self.arrows << new_edge.arrow
39
+ self.neighbors << new_edge.neighbor
35
40
  end
36
41
 
37
42
  def finalize &block
38
43
  if directed
39
- emit Geometry::DirectedVertexArrows.new(vertex_label, *arrows)
44
+ emit Geometry::DirectedNeighborhood.new(vertex_label, *neighbors)
40
45
  else
41
- emit Geometry::VertexArrows.new(vertex_label, *arrows)
46
+ emit Geometry::Neighborhood.new(vertex_label, *neighbors)
42
47
  end
43
48
  end
44
49
 
@@ -1,9 +1,15 @@
1
+ Settings.define :as, :required => true, :description => "Name of the Clusta class to import data as."
2
+
1
3
  module Clusta
2
4
 
3
5
  module Transforms
4
6
 
5
7
  module Import
6
8
 
9
+ def self.help
10
+ "Import data into the format expected by Clusta."
11
+ end
12
+
7
13
  class Mapper < Wukong::Streamer::Base
8
14
 
9
15
  def process *record
@@ -0,0 +1,70 @@
1
+ require 'clusta/geometry/neighborhood'
2
+ require 'clusta/geometry/directed/neighborhood'
3
+ require 'clusta/geometry/neighbor'
4
+ require 'clusta/geometry/directed/neighbor'
5
+ require 'clusta/geometry/degree_pair'
6
+ require 'clusta/geometry/directed/degree_pair'
7
+
8
+ module Clusta
9
+
10
+ module Transforms
11
+
12
+ module NeighborhoodsToDegreePairs
13
+
14
+ class Mapper < Wukong::Streamer::StructStreamer
15
+
16
+ def process neighborhood, *record
17
+ neighborhood.reversed_degree_pairs.each { |degree_pair| emit(degree_pair) }
18
+ end
19
+
20
+ end
21
+
22
+ class Reducer < Wukong::Streamer::AccumulatingReducer
23
+
24
+ attr_accessor :degree_pairs, :source_degree, :source_in_degree, :source_out_degree
25
+
26
+ include Wukong::Streamer::StructRecordizer
27
+
28
+ def get_key new_degree_pair, *record
29
+ new_degree_pair.source_label
30
+ end
31
+
32
+ def start! new_degree_pair, *record
33
+ self.degree_pairs = []
34
+ if new_degree_pair.directed?
35
+ self.source_in_degree = 0
36
+ self.source_out_degree = 0
37
+ else
38
+ self.source_degree = 0
39
+ end
40
+ end
41
+
42
+ def accumulate new_degree_pair, *record
43
+ self.degree_pairs << new_degree_pair
44
+ if new_degree_pair.directed?
45
+ self.source_in_degree += new_degree_pair.source_in_degree_value
46
+ self.source_out_degree += new_degree_pair.source_out_degree_value
47
+ else
48
+ self.source_degree += new_degree_pair.source_degree_value
49
+ end
50
+ end
51
+
52
+ def finalize &block
53
+ degree_pairs.each do |degree_pair|
54
+ if degree_pair.directed?
55
+ degree_pair.source_in_degree_value = source_in_degree
56
+ degree_pair.source_out_degree_value = source_out_degree
57
+ else
58
+ degree_pair.source_degree_value = source_degree
59
+ end
60
+ emit degree_pair
61
+ end
62
+ end
63
+
64
+ end
65
+
66
+ end
67
+ end
68
+
69
+ end
70
+
@@ -0,0 +1,46 @@
1
+ module Clusta
2
+
3
+ module Transforms
4
+
5
+ module Pm3d
6
+
7
+ class Mapper < Wukong::Streamer::Base
8
+
9
+ def process *record
10
+ if record.first && record.first =~ /[^\d]/
11
+ emit record[1..-1]
12
+ else
13
+ emit record
14
+ end
15
+ end
16
+ end
17
+
18
+ class Reducer < Wukong::Streamer::AccumulatingReducer
19
+
20
+ attr_accessor :records
21
+
22
+ def start! *record
23
+ self.records = []
24
+ end
25
+
26
+ def accumulate *record
27
+ self.records << record
28
+ end
29
+
30
+ def finalize &block
31
+ records.each { |record| emit(record) }
32
+ emit []
33
+ end
34
+
35
+ end
36
+
37
+ class Script < Wukong::Script
38
+ def local_mode_sort_commandline
39
+ "sort -n -k1 -k2"
40
+ end
41
+ end
42
+
43
+ end
44
+ end
45
+
46
+ end
@@ -0,0 +1,34 @@
1
+ require 'clusta/geometry/edge'
2
+ require 'clusta/geometry/directed/edge'
3
+
4
+ Settings.define :min_weight, :type => Float, :description => "Prune edges with weight less than this weight."
5
+ Settings.define :max_weight, :type => Float, :description => "Prune edges with weight more than this weight."
6
+
7
+ module Clusta
8
+
9
+ module Transforms
10
+
11
+ module PruneEdges
12
+
13
+ class Mapper < Wukong::Streamer::StructStreamer
14
+
15
+ def before_stream
16
+ raise ArgumentError.new("Must specify either a min_weight or a max_weight") if Settings[:min_weight].nil? && Settings[:max_weight].nil?
17
+ end
18
+
19
+ def within_weight_range? edge
20
+ return false if Settings[:min_weight] && Settings[:min_weight] > edge.weight.to_f
21
+ return false if Settings[:max_weight] && Settings[:max_weight] < edge.weight.to_f
22
+ true
23
+ end
24
+
25
+ def process edge, *record
26
+ emit(edge) if edge.weighted? && within_weight_range?(edge)
27
+ end
28
+
29
+ end
30
+
31
+ end
32
+ end
33
+
34
+ end
@@ -0,0 +1,36 @@
1
+ require 'spec_helper'
2
+
3
+ describe Clusta::Schema do
4
+
5
+ before do
6
+ @root = Class.new
7
+ @root.send(:include, Clusta::Schema)
8
+ end
9
+
10
+ it "should not define any fields of its own" do
11
+ @root.fields.should == []
12
+ end
13
+
14
+ it "should allow a subclass to set its own fields without polluting the parent" do
15
+ subclass = Class.new(@root)
16
+ subclass.key :foo
17
+ @root.field_names.should_not include('foo')
18
+ subclass.field_names.should include('foo')
19
+ end
20
+
21
+ it "should allow a subclass of a subclass to set its own fields without polluting the parent" do
22
+ subclass1 = Class.new(@root)
23
+ subclass1.key :foo
24
+ subclass2 = Class.new(subclass1)
25
+ subclass2.key :bar
26
+
27
+ @root.field_names.should_not include('foo')
28
+ @root.field_names.should_not include('bar')
29
+
30
+ subclass1.field_names.should include('foo')
31
+ subclass1.field_names.should_not include('bar')
32
+
33
+ subclass2.field_names.should include('foo')
34
+ subclass2.field_names.should include('bar')
35
+ end
36
+ end
@@ -0,0 +1,133 @@
1
+ require 'spec_helper'
2
+
3
+ describe Clusta::Serialization::JSON do
4
+
5
+ def json_serializable_wrapper_class
6
+ Class.new.tap do |c|
7
+ c.send(:include, Clusta::Schema)
8
+ c.send(:include, Clusta::Serialization)
9
+ c.send(:include, Clusta::Serialization::JSON)
10
+ end
11
+ end
12
+
13
+ before do
14
+ @root = json_serializable_wrapper_class
15
+ @root.key :foo
16
+
17
+ @child = json_serializable_wrapper_class
18
+ @child.key :baz
19
+
20
+ @child.set_stream_name 'Child'
21
+ end
22
+
23
+ describe "processing inputs" do
24
+
25
+ it "should assign a single key field correctly" do
26
+ instance = @root.new("foovalue")
27
+ instance.foo.should == "foovalue"
28
+ end
29
+
30
+ it "should assign a multiple key fields correctly" do
31
+ @root.key :bar
32
+ instance = @root.new("foovalue", "barvalue")
33
+ instance.foo.should == "foovalue"
34
+ instance.bar.should == "barvalue"
35
+ end
36
+
37
+ it "should assign and parse key and non-key fields when given as Ruby hashes" do
38
+ @root.field :bar
39
+ @root.field :baz
40
+ instance = @root.new("foovalue", {"bar" => "barvalue", "baz" => "bazvalue"})
41
+ instance.foo.should == "foovalue"
42
+ instance.bar.should == "barvalue"
43
+ instance.baz.should == "bazvalue"
44
+ end
45
+
46
+ it "should assign and parse key and non-key fields when given as a JSON string" do
47
+ @root.field :bar
48
+ @root.field :baz
49
+ instance = @root.new("foovalue", '{"bar":"barvalue", "baz":"bazvalue"}')
50
+ instance.foo.should == "foovalue"
51
+ instance.bar.should == "barvalue"
52
+ instance.baz.should == "bazvalue"
53
+ end
54
+
55
+ it "should assign a value to an optional field only if it's present" do
56
+ @root.field :bar
57
+ @root.field :baz, :optional => true
58
+ instance = @root.new("foovalue", {"bar" => "barvalue"})
59
+ instance.foo.should == "foovalue"
60
+ instance.bar.should == "barvalue"
61
+ instance.baz.should be_nil
62
+
63
+ instance = @root.new("foovalue", {"bar" => "barvalue", "baz" => "bazvalue"})
64
+ instance.foo.should == "foovalue"
65
+ instance.bar.should == "barvalue"
66
+ instance.baz.should == "bazvalue"
67
+ end
68
+
69
+ it "should stash extra arguments it receives at initialization" do
70
+ @root.field :bar
71
+ instance = @root.new("foovalue", {"bar" => "barvalue"}, "extra1", "extra2")
72
+ instance.foo.should == "foovalue"
73
+ instance.bar.should == "barvalue"
74
+ instance.extra_inputs.size.should == 2
75
+ instance.extra_inputs[0].should == "extra1"
76
+ instance.extra_inputs[1].should == "extra2"
77
+ end
78
+
79
+ end
80
+
81
+ describe "serializing" do
82
+
83
+ it "returns an array appropriate for Wukong with a single key field" do
84
+ output = @root.new("foovalue").to_flat
85
+ output[0].should == @root.stream_name
86
+ output[1].should == "foovalue"
87
+ end
88
+
89
+ it "returns an array appropriate for Wukong with a multiple key fields" do
90
+ @root.key :bar
91
+ output = @root.new("foovalue", "barvalue").to_flat
92
+ output[0].should == @root.stream_name
93
+ output[1].should == "foovalue"
94
+ output[2].should == "barvalue"
95
+ end
96
+
97
+ it "returns an array appropriate for Wukong with keys and non key fields" do
98
+ @root.field :bar
99
+ output = @root.new("foovalue", { 'bar' => 'barvalue'}).to_flat
100
+ output[0].should == @root.stream_name
101
+ output[1].should == "foovalue"
102
+ output[2].should match(/"bar" *: *"barvalue"/)
103
+ end
104
+
105
+ it "returns an array with an optional field at the end only if it has a value" do
106
+ @root.field :bar, :optional => true
107
+ output1 = @root.new("foovalue").to_flat
108
+ output2 = @root.new("foovalue", {"bar" => "barvalue"}).to_flat
109
+
110
+ output1.size.should == 2
111
+ output1[0].should == @root.stream_name
112
+ output1[1].should == 'foovalue'
113
+
114
+ output2.size.should == 3
115
+ output2[0].should == @root.stream_name
116
+ output2[1].should == 'foovalue'
117
+ output2[2].should match(/"bar" *: *"barvalue"/)
118
+ end
119
+
120
+ it "returns an array including any extra inputs it received as outputs" do
121
+ @root.field :bar
122
+ output = @root.new("foovalue", {"bar" => "barvalue"}, "extra1", "extra2").to_flat
123
+ output.size.should == 5
124
+ output[0].should == @root.stream_name
125
+ output[1].should == "foovalue"
126
+ output[2].should match(/"bar" *: *"barvalue"/)
127
+ output[3].should == 'extra1'
128
+ output[4].should == 'extra2'
129
+ end
130
+
131
+ end
132
+
133
+ end