clusta 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. data/README.rdoc +66 -0
  2. data/VERSION +1 -1
  3. data/bin/clusta +1 -28
  4. data/lib/clusta.rb +12 -3
  5. data/lib/clusta/geometry.rb +53 -8
  6. data/lib/clusta/geometry/all.rb +3 -0
  7. data/lib/clusta/geometry/assortativity.rb +2 -2
  8. data/lib/clusta/geometry/degree.rb +3 -1
  9. data/lib/clusta/geometry/{edge_degree_pair.rb → degree_pair.rb} +3 -3
  10. data/lib/clusta/geometry/directed/degree.rb +3 -1
  11. data/lib/clusta/geometry/directed/{edge_degree_pair.rb → degree_pair.rb} +4 -3
  12. data/lib/clusta/geometry/directed/edge.rb +4 -2
  13. data/lib/clusta/geometry/directed/{arrow.rb → neighbor.rb} +1 -1
  14. data/lib/clusta/geometry/directed/neighborhood.rb +31 -0
  15. data/lib/clusta/geometry/edge.rb +6 -4
  16. data/lib/clusta/geometry/element.rb +10 -117
  17. data/lib/clusta/geometry/{arrow.rb → neighbor.rb} +3 -3
  18. data/lib/clusta/geometry/neighborhood.rb +41 -0
  19. data/lib/clusta/geometry/vertex.rb +4 -1
  20. data/lib/clusta/runner.rb +101 -4
  21. data/lib/clusta/schema.rb +100 -0
  22. data/lib/clusta/serialization.rb +63 -0
  23. data/lib/clusta/serialization/json.rb +86 -0
  24. data/lib/clusta/serialization/tsv.rb +81 -0
  25. data/lib/clusta/transforms.rb +59 -26
  26. data/lib/clusta/transforms/{edge_degree_pairs_to_assortativities.rb → degree_pairs_to_assortativities.rb} +7 -3
  27. data/lib/clusta/transforms/edges_to_degrees.rb +5 -0
  28. data/lib/clusta/transforms/{edges_to_vertex_arrows.rb → edges_to_neighborhoods.rb} +11 -6
  29. data/lib/clusta/transforms/import.rb +6 -0
  30. data/lib/clusta/transforms/neighborhoods_to_degree_pairs.rb +70 -0
  31. data/lib/clusta/transforms/pm3d.rb +46 -0
  32. data/lib/clusta/transforms/prune_edges.rb +34 -0
  33. data/spec/clusta/schema_spec.rb +36 -0
  34. data/spec/clusta/serialization/json_spec.rb +133 -0
  35. data/spec/clusta/serialization/tsv_spec.rb +133 -0
  36. data/spec/clusta/serialization_spec.rb +27 -0
  37. data/spec/clusta/transforms/degree_pairs_to_assortativities_spec.rb +13 -0
  38. data/spec/clusta/transforms/{edges_to_vertex_arrows_spec.rb → edges_to_neighborhoods_spec.rb} +5 -5
  39. data/spec/clusta/transforms/import_spec.rb +9 -0
  40. data/spec/clusta/transforms/neighborhoods_to_degree_pairs_spec.rb +21 -0
  41. data/spec/clusta/transforms/prune_edges_spec.rb +22 -0
  42. data/spec/data/assortativities/directed.tsv +4 -0
  43. data/spec/data/assortativities/undirected.tsv +7 -0
  44. data/spec/data/degree_pairs/directed.tsv +10 -0
  45. data/spec/data/degree_pairs/undirected.tsv +18 -0
  46. data/spec/data/external/vertices.tsv +9 -0
  47. data/spec/data/imports/vertices.labeled.tsv +9 -0
  48. data/spec/data/neighborhoods/directed.unweighted.tsv +7 -0
  49. data/spec/data/neighborhoods/directed.weighted.tsv +7 -0
  50. data/spec/data/neighborhoods/undirected.unweighted.tsv +9 -0
  51. data/spec/data/neighborhoods/undirected.weighted.tsv +9 -0
  52. data/spec/data/pruned_edges/directed.unweighted.tsv +1 -0
  53. data/spec/data/pruned_edges/directed.weighted.tsv +3 -0
  54. data/spec/data/pruned_edges/undirected.unweighted.tsv +1 -0
  55. data/spec/data/pruned_edges/undirected.weighted.tsv +3 -0
  56. data/spec/support/transforms_spec_helper.rb +5 -1
  57. metadata +47 -23
  58. data/lib/clusta/geometry/directed/vertex_arrows.rb +0 -25
  59. data/lib/clusta/geometry/vertex_arrows.rb +0 -45
  60. data/lib/clusta/transforms/vertex_arrows_to_edge_degree_pairs.rb +0 -63
  61. data/spec/clusta/geometry/element_spec.rb +0 -191
  62. data/spec/data/vertex_arrows/directed.unweighted.tsv +0 -7
  63. data/spec/data/vertex_arrows/directed.weighted.tsv +0 -7
  64. data/spec/data/vertex_arrows/undirected.unweighted.tsv +0 -9
  65. data/spec/data/vertex_arrows/undirected.weighted.tsv +0 -9
@@ -0,0 +1,66 @@
1
+ = Clusta
2
+
3
+ Clusta is a Ruby gem for network analysis built on top of
4
+ Wukong[http://github.com/mrflip/wukong].
5
+
6
+ Wukong lets you write Ruby scripts that run on your laptop as well as
7
+ on a Hadoop cluster.
8
+
9
+ Clusta is:
10
+
11
+ - classes that make describing the geometry of networks easy
12
+ - network algorithms written with these classes to use Wukong
13
+ - a shim command-line program for running these algorithms
14
+
15
+ Start with a file containing edges:
16
+
17
+ Edge 1 2
18
+ Edge 2 3
19
+ Edge 1 4
20
+ Edge 4 5
21
+ Edge 5 6
22
+ Edge 5 7
23
+ Edge 6 8
24
+ Edge 7 8
25
+ Edge 8 9
26
+
27
+ Run this through a transformation named +edges_to_degrees+:
28
+
29
+ $ clusta --transform=edges_to_degrees /local/edges.tsv -
30
+ Degree 1 2
31
+ Degree 2 2
32
+ Degree 3 1
33
+ Degree 4 2
34
+ Degree 5 3
35
+ Degree 6 2
36
+ Degree 7 2
37
+ Degree 8 3
38
+ Degree 9 1
39
+
40
+ Chain transformations together:
41
+
42
+ $ clusta --transform=edges_to_neighborhoods /local/edges.tsv - | clusta --transform=neighborhoods_to_degree_pairs - - | clusta --transform=degree_pairs_to_assortativities - -
43
+ Assortativity 1 2 1
44
+ Assortativity 1 3 1
45
+ Assortativity 2 1 1
46
+ Assortativity 2 2 4
47
+ Assortativity 2 3 5
48
+ Assortativity 3 1 1
49
+ Assortativity 3 2 5
50
+
51
+ And then leverage Wukong when you're ready:
52
+
53
+ $ clusta --run=hadoop --transform=edges_to_neighborhoods /hdfs/edges.tsv /hdfs/neighborhoods.tsv
54
+ I, [2012-03-03T21:00:39.992750 #25835] INFO -- : Launching hadoop!
55
+ I, [2012-03-03T21:00:39.992979 #25835] INFO -- : Running
56
+
57
+ /usr/lib/hadoop/bin/hadoop \
58
+ jar /usr/lib/hadoop/contrib/streaming/hadoop-*streaming*.jar \
59
+ -D mapred.job.name='clusta---spec/data/edges/undirected.unweighted.tsv----' \
60
+ -mapper '/usr/bin/ruby1.9.1 clusta --map --log_interval=10000 --log_seconds=30 --transform=edges_to_degrees' \
61
+ -reducer '/usr/bin/ruby1.9.1 clusta --reduce --log_interval=10000 --log_seconds=30 --transform=edges_to_degrees' \
62
+ -input 'spec/data/edges/undirected.unweighted.tsv' \
63
+ -output '-' \
64
+ -file '/home/user/projects/networks/clusta/bin/clusta'
65
+ ...
66
+
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.1
1
+ 0.0.2
data/bin/clusta CHANGED
@@ -4,32 +4,5 @@ $: << File.expand_path('../lib', File.dirname(__FILE__)) unless $:.include?(File
4
4
 
5
5
  require 'clusta'
6
6
 
7
- def usage
8
- "usage: #{File.basename(__FILE__)} --transform=TRANSFORM_NAME [ARGS ...]"
9
- end
7
+ Clusta::Runner.new(File.basename(__FILE__), ARGV.dup).run! if $0 == __FILE__
10
8
 
11
- def extract_transform_arg
12
- transform_arg = ARGV.find_all { |arg| arg =~ Clusta::Transforms::ARG_REGEXP }.first
13
- if transform_arg.nil?
14
- $stderr.puts(usage)
15
- exit(1)
16
- end
17
- # ARGV.delete_if { |arg| arg =~ Clusta::Transforms::ARG_REGEXP }
18
- transform_arg
19
- end
20
-
21
- def add_default_run_arg
22
- ARGV.unshift('--run=local') unless ARGV.detect { |arg| arg =~ /--run/ }
23
- end
24
-
25
- if $0 == __FILE__
26
- begin
27
- add_default_run_arg
28
- transform = Clusta::Transforms.from_arg(extract_transform_arg)
29
- script = Clusta::Transforms.script_for(transform)
30
- script.run
31
- rescue Clusta::Error => e
32
- $stderr.puts e.message
33
- exit(1)
34
- end
35
- end
@@ -14,11 +14,20 @@ module Clusta
14
14
  File.basename(path).gsub(/\.rb$/, '')
15
15
  end
16
16
 
17
+ def self.require_path path
18
+ File.join(File.dirname(path), File.basename(path).gsub(/\.rb$/, ''))
19
+ end
20
+
17
21
  Error = Class.new(StandardError)
22
+ ArgumentError = Class.new(Error)
18
23
  DirectednessMismatchError = Class.new(Error)
19
24
  AmbiguousArgumentsError = Class.new(Error)
25
+ NotImplementedError = Class.new(Error)
26
+ SortError = Class.new(Error)
20
27
 
21
- autoload :Geometry, 'clusta/geometry'
22
- autoload :Transforms, 'clusta/transforms'
23
-
28
+ autoload :Geometry, 'clusta/geometry'
29
+ autoload :Transforms, 'clusta/transforms'
30
+ autoload :Runner, 'clusta/runner'
31
+ autoload :Schema, 'clusta/schema'
32
+ autoload :Serialization, 'clusta/serialization'
24
33
  end
@@ -2,24 +2,69 @@ module Clusta
2
2
 
3
3
  module Geometry
4
4
 
5
- autoload :Element, 'clusta/geometry/element'
6
-
7
- ELEMENTS = []
5
+ def self.names
6
+ @names ||= {}
7
+ end
8
8
 
9
- def self.register_geometry name, path, geometries=nil
10
- autoload name, path
11
- self::ELEMENTS << name
9
+ def self.register_element klass, name=nil
10
+ if name
11
+ Wukong::RESOURCE_CLASS_MAP[name] = klass
12
+ else
13
+ klass.all_stream_names.each do |name|
14
+ Wukong::RESOURCE_CLASS_MAP[name] = klass
15
+ end
16
+ end
12
17
  end
13
18
 
19
+ def self.from_name name
20
+ begin
21
+ const_get(Clusta.classify(name))
22
+ rescue NameError => e
23
+ raise Error.new("No such transform: '#{name}'")
24
+ end
25
+ end
26
+
14
27
  Dir[File.join(File.dirname(__FILE__), "geometry/*.rb")].each do |path|
15
28
  require_name = Clusta.require_name(path)
16
- register_geometry Clusta.classify(require_name), "clusta/geometry/#{require_name}"
29
+ autoload Clusta.classify(require_name), "clusta/geometry/#{require_name}"
30
+ names[require_name] ||= {} unless require_name == 'all'
17
31
  end
18
32
 
19
33
  Dir[File.join(File.dirname(__FILE__), "geometry/directed/*.rb")].each do |path|
20
34
  require_name = Clusta.require_name(path)
21
- register_geometry ("Directed" + Clusta.classify(require_name)), "clusta/geometry/directed/#{require_name}"
35
+ autoload ("Directed" + Clusta.classify(require_name)), "clusta/geometry/directed/#{require_name}"
36
+ names[require_name] ||= {}
37
+ names[require_name][:directed] = true
22
38
  end
23
39
 
40
+ def self.listing
41
+ [].tap do |out|
42
+ out << "Known geometries:"
43
+ out << ''
44
+ names.keys.sort.each do |element_name|
45
+ element = from_name(element_name)
46
+ if names[element_name][:directed]
47
+ directed_element = from_name("directed_#{element_name}")
48
+ else
49
+ directed_element = nil
50
+ end
51
+
52
+ out << " #{element}"
53
+ stream_names = element.all_stream_names.sort
54
+ stream_names.concat(directed_element.all_stream_names.sort) if directed_element
55
+ out << " streams as: #{stream_names.uniq.join(', ')}"
56
+ out << ''
57
+ end
58
+ end.join("\n")
59
+ end
60
+
61
+ def self.load_from path
62
+ class_eval(File.read(path), path)
63
+ require_name = Clusta.require_name(path)
64
+ names[require_name] ||= {}
65
+ names[require_name][:directed] = true if require_name =~ /^directed_/
66
+ end
67
+
68
+
24
69
  end
25
70
  end
@@ -0,0 +1,3 @@
1
+ Dir[File.join(File.dirname(__FILE__), '**/*.rb')].each do |path|
2
+ require Clusta.require_path path
3
+ end
@@ -4,8 +4,8 @@ module Clusta
4
4
 
5
5
  class Assortativity < Element
6
6
 
7
- field :source_degree_value, :type => :int
8
- field :target_degree_value, :type => :int
7
+ key :source_degree_value, :type => :int
8
+ key :target_degree_value, :type => :int
9
9
  field :count, :type => :int
10
10
 
11
11
  def directed?
@@ -3,7 +3,9 @@ module Clusta
3
3
 
4
4
  class Degree < Element
5
5
 
6
- field :vertex_label
6
+ abbreviate 'D'
7
+
8
+ key :vertex_label
7
9
  field :degree, :type => :int
8
10
 
9
11
  def directed?
@@ -1,9 +1,9 @@
1
1
  module Clusta
2
2
  module Geometry
3
3
 
4
- class EdgeDegreePair < Element
5
- field :source_label
6
- field :target_label
4
+ class DegreePair < Element
5
+ key :source_label
6
+ key :target_label
7
7
  field :source_degree_value, :type => :int
8
8
  field :target_degree_value, :type => :int
9
9
 
@@ -3,7 +3,9 @@ module Clusta
3
3
 
4
4
  class DirectedDegree < Element
5
5
 
6
- field :vertex_label
6
+ abbreviate 'DD'
7
+
8
+ key :vertex_label
7
9
  field :in_degree, :type => :int
8
10
  field :out_degree, :type => :int
9
11
 
@@ -1,9 +1,10 @@
1
1
  module Clusta
2
2
  module Geometry
3
3
 
4
- class DirectedEdgeDegreePair < Element
5
- field :source_label
6
- field :target_label
4
+ class DirectedDegreePair < Element
5
+
6
+ key :source_label
7
+ key :target_label
7
8
  field :source_in_degree_value, :type => :int
8
9
  field :source_out_degree_value, :type => :int
9
10
  field :target_in_degree_value, :type => :int
@@ -2,6 +2,8 @@ module Clusta
2
2
  module Geometry
3
3
 
4
4
  class DirectedEdge < Edge
5
+
6
+ abbreviate 'DE'
5
7
 
6
8
  def directed?
7
9
  true
@@ -15,8 +17,8 @@ module Clusta
15
17
  DirectedDegree.new(target_label, 1, 0)
16
18
  end
17
19
 
18
- def arrow
19
- DirectedArrow.new(target_label, weight)
20
+ def neighbor
21
+ DirectedNeighbor.new(target_label, weight)
20
22
  end
21
23
 
22
24
 
@@ -1,7 +1,7 @@
1
1
  module Clusta
2
2
  module Geometry
3
3
 
4
- class DirectedArrow < Arrow
4
+ class DirectedNeighbor < Neighbor
5
5
  def directed
6
6
  true
7
7
  end
@@ -0,0 +1,31 @@
1
+ module Clusta
2
+ module Geometry
3
+
4
+ class DirectedNeighborhood < Neighborhood
5
+
6
+ def directed?
7
+ true
8
+ end
9
+
10
+ def degree_pairs
11
+ neighbors.map do |neighbor|
12
+ # This vertex's in-degree is not known to us; we just have
13
+ # its out-degree based on the size of this neighborhood.
14
+ #
15
+ # We don't know anything about each neighbor's degree other than that
16
+ # its in-degree is at least 1 b/c it's in this vertex's
17
+ # neighborhood.
18
+ DirectedDegreePair.new(label, neighbor.label, 0, size, 1, 0)
19
+ end
20
+ end
21
+
22
+ def reversed_degree_pairs
23
+ neighbors.map do |neighbor|
24
+ DirectedDegreePair.new(neighbor.label, label, 1, 0, 0, size)
25
+ end
26
+ end
27
+
28
+ end
29
+
30
+ end
31
+ end
@@ -3,8 +3,10 @@ module Clusta
3
3
 
4
4
  class Edge < Element
5
5
 
6
- field :source_label
7
- field :target_label
6
+ abbreviate 'E'
7
+
8
+ key :source_label
9
+ key :target_label
8
10
  field :weight, :optional => true
9
11
 
10
12
  def weighted?
@@ -48,8 +50,8 @@ module Clusta
48
50
  self.class.new(target_label, source_label, weight)
49
51
  end
50
52
 
51
- def arrow
52
- Arrow.new(target_label, weight)
53
+ def neighbor
54
+ Neighbor.new(target_label, weight)
53
55
  end
54
56
 
55
57
  end
@@ -3,129 +3,22 @@ module Clusta
3
3
 
4
4
  class Element
5
5
 
6
- attr_accessor :input_fields
6
+ include Clusta::Schema
7
+ include Clusta::Serialization
7
8
 
8
- @fields = []
9
- class << self ; attr_reader :fields ; end
10
-
11
- def self.inherited(subclass)
12
- subclass.instance_variable_set("@fields", @fields.dup)
13
- super
14
- end
15
-
16
- def self.field_names
17
- @fields.map { |field| field[:name].to_s }
18
- end
19
-
20
- def self.has_optional_field?
21
- @fields.any? { |field| field[:optional] }
22
- end
23
-
24
- def self.optional_field
25
- @fields.detect { |field| field[:optional] }
26
- end
27
-
28
- def self.from_string string
29
- return string unless string.is_a?(String)
30
- args = string.split(';')
31
- klass_name = args.shift
32
- raise ArgumentError.new("Elements instantiated from a string must match the format 'klass;[field1;[field2;]...]'") unless klass_name
33
- Wukong.class_from_resource(klass_name).new(*args)
34
- end
35
-
36
- def self.field name, options={}
37
- raise AmbiguousArgumentsError.new("Cannot define a second optional field #{name} because field #{optional_field[:name]} is already optional.") if has_optional_field?
38
- attr_reader name
39
- case options[:type]
40
- when :int
41
- define_method "#{name}=" do |val|
42
- instance_variable_set("@#{name}", val.to_i)
43
- end
44
- when :float
45
- define_method "#{name}=" do |val|
46
- instance_variable_set("@#{name}", val.to_f)
47
- end
48
- when :geometry
49
- define_method "#{name}=" do |val|
50
- instance_variable_set("@#{name}", self.class.from_string(val))
51
- end
52
- else
53
- define_method "#{name}=" do |val|
54
- instance_variable_set("@#{name}", val)
55
- end
56
- end
57
- @fields << options.merge(:name => name)
58
- end
59
-
60
- def fields
61
- self.class.fields
62
- end
63
-
64
- def self.input_fields name
65
- alias_method name, :input_fields
9
+ if defined?(Settings) && Settings[:serialize] == 'json'
10
+ include Clusta::Serialization::JSON
11
+ else
12
+ include Clusta::Serialization::TSV
66
13
  end
67
14
 
68
- def self.stream_name
69
- if defined?(Settings) && Settings[:full_class_names]
70
- to_s
71
- else
72
- to_s.split("::").last
73
- end
74
- end
75
-
76
- def stream_name
77
- self.class.stream_name
78
- end
79
-
80
- def initialize *args
81
- self.class.fields.each_with_index do |field, index|
82
- suffix = case index.to_s
83
- when /1$/ then 'st'
84
- when /2$/ then 'nd'
85
- when /3$/ then 'rd'
86
- else 'th'
87
- end
88
- case
89
- when field[:optional]
90
- self.send("#{field[:name]}=", args[index]) if args[index]
91
- when args[index].nil?
92
- raise ArgumentError.new("A #{self.class} requires a non-nil value for #{field[:name]} as its #{index}#{suffix} argument.")
93
- else
94
- self.send("#{field[:name]}=", args[index])
95
- end
96
- end
97
- self.set_input_fields(*(args[self.class.fields.size..-1] || []))
98
- end
99
-
100
- def set_input_fields *input_fields
101
- self.input_fields = input_fields.map do |field|
102
- if field =~ /^[A-Z].*;/
103
- self.class.from_string(field)
104
- else
105
- field
106
- end
107
- end
108
- end
109
-
110
- def output_fields
111
- input_fields.map(&:to_s)
112
- end
113
-
114
- def to_flat
115
- [stream_name].tap do |record|
116
- fields.each do |field|
117
- value = send(field[:name])
118
- record << value.to_s unless value.nil? && field[:optional]
119
- end
120
- end.concat(output_fields)
121
- end
122
-
123
- def to_s
124
- to_flat.join(';')
15
+ def self.inherited subclass
16
+ Clusta::Geometry.register_element(subclass)
17
+ super
125
18
  end
126
19
 
127
20
  end
128
-
21
+
129
22
  end
130
23
  end
131
24