clusta 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. data/README.rdoc +66 -0
  2. data/VERSION +1 -1
  3. data/bin/clusta +1 -28
  4. data/lib/clusta.rb +12 -3
  5. data/lib/clusta/geometry.rb +53 -8
  6. data/lib/clusta/geometry/all.rb +3 -0
  7. data/lib/clusta/geometry/assortativity.rb +2 -2
  8. data/lib/clusta/geometry/degree.rb +3 -1
  9. data/lib/clusta/geometry/{edge_degree_pair.rb → degree_pair.rb} +3 -3
  10. data/lib/clusta/geometry/directed/degree.rb +3 -1
  11. data/lib/clusta/geometry/directed/{edge_degree_pair.rb → degree_pair.rb} +4 -3
  12. data/lib/clusta/geometry/directed/edge.rb +4 -2
  13. data/lib/clusta/geometry/directed/{arrow.rb → neighbor.rb} +1 -1
  14. data/lib/clusta/geometry/directed/neighborhood.rb +31 -0
  15. data/lib/clusta/geometry/edge.rb +6 -4
  16. data/lib/clusta/geometry/element.rb +10 -117
  17. data/lib/clusta/geometry/{arrow.rb → neighbor.rb} +3 -3
  18. data/lib/clusta/geometry/neighborhood.rb +41 -0
  19. data/lib/clusta/geometry/vertex.rb +4 -1
  20. data/lib/clusta/runner.rb +101 -4
  21. data/lib/clusta/schema.rb +100 -0
  22. data/lib/clusta/serialization.rb +63 -0
  23. data/lib/clusta/serialization/json.rb +86 -0
  24. data/lib/clusta/serialization/tsv.rb +81 -0
  25. data/lib/clusta/transforms.rb +59 -26
  26. data/lib/clusta/transforms/{edge_degree_pairs_to_assortativities.rb → degree_pairs_to_assortativities.rb} +7 -3
  27. data/lib/clusta/transforms/edges_to_degrees.rb +5 -0
  28. data/lib/clusta/transforms/{edges_to_vertex_arrows.rb → edges_to_neighborhoods.rb} +11 -6
  29. data/lib/clusta/transforms/import.rb +6 -0
  30. data/lib/clusta/transforms/neighborhoods_to_degree_pairs.rb +70 -0
  31. data/lib/clusta/transforms/pm3d.rb +46 -0
  32. data/lib/clusta/transforms/prune_edges.rb +34 -0
  33. data/spec/clusta/schema_spec.rb +36 -0
  34. data/spec/clusta/serialization/json_spec.rb +133 -0
  35. data/spec/clusta/serialization/tsv_spec.rb +133 -0
  36. data/spec/clusta/serialization_spec.rb +27 -0
  37. data/spec/clusta/transforms/degree_pairs_to_assortativities_spec.rb +13 -0
  38. data/spec/clusta/transforms/{edges_to_vertex_arrows_spec.rb → edges_to_neighborhoods_spec.rb} +5 -5
  39. data/spec/clusta/transforms/import_spec.rb +9 -0
  40. data/spec/clusta/transforms/neighborhoods_to_degree_pairs_spec.rb +21 -0
  41. data/spec/clusta/transforms/prune_edges_spec.rb +22 -0
  42. data/spec/data/assortativities/directed.tsv +4 -0
  43. data/spec/data/assortativities/undirected.tsv +7 -0
  44. data/spec/data/degree_pairs/directed.tsv +10 -0
  45. data/spec/data/degree_pairs/undirected.tsv +18 -0
  46. data/spec/data/external/vertices.tsv +9 -0
  47. data/spec/data/imports/vertices.labeled.tsv +9 -0
  48. data/spec/data/neighborhoods/directed.unweighted.tsv +7 -0
  49. data/spec/data/neighborhoods/directed.weighted.tsv +7 -0
  50. data/spec/data/neighborhoods/undirected.unweighted.tsv +9 -0
  51. data/spec/data/neighborhoods/undirected.weighted.tsv +9 -0
  52. data/spec/data/pruned_edges/directed.unweighted.tsv +1 -0
  53. data/spec/data/pruned_edges/directed.weighted.tsv +3 -0
  54. data/spec/data/pruned_edges/undirected.unweighted.tsv +1 -0
  55. data/spec/data/pruned_edges/undirected.weighted.tsv +3 -0
  56. data/spec/support/transforms_spec_helper.rb +5 -1
  57. metadata +47 -23
  58. data/lib/clusta/geometry/directed/vertex_arrows.rb +0 -25
  59. data/lib/clusta/geometry/vertex_arrows.rb +0 -45
  60. data/lib/clusta/transforms/vertex_arrows_to_edge_degree_pairs.rb +0 -63
  61. data/spec/clusta/geometry/element_spec.rb +0 -191
  62. data/spec/data/vertex_arrows/directed.unweighted.tsv +0 -7
  63. data/spec/data/vertex_arrows/directed.weighted.tsv +0 -7
  64. data/spec/data/vertex_arrows/undirected.unweighted.tsv +0 -9
  65. data/spec/data/vertex_arrows/undirected.weighted.tsv +0 -9
@@ -0,0 +1,66 @@
1
+ = Clusta
2
+
3
+ Clusta is a Ruby gem for network analysis built on top of
4
+ Wukong[http://github.com/mrflip/wukong].
5
+
6
+ Wukong lets you write Ruby scripts that run on your laptop as well as
7
+ on a Hadoop cluster.
8
+
9
+ Clusta is:
10
+
11
+ - classes that make describing the geometry of networks easy
12
+ - network algorithms written with these classes to use Wukong
13
+ - a shim command-line program for running these algorithms
14
+
15
+ Start with a file containing edges:
16
+
17
+ Edge 1 2
18
+ Edge 2 3
19
+ Edge 1 4
20
+ Edge 4 5
21
+ Edge 5 6
22
+ Edge 5 7
23
+ Edge 6 8
24
+ Edge 7 8
25
+ Edge 8 9
26
+
27
+ Run this through a transformation named +edges_to_degrees+:
28
+
29
+ $ clusta --transform=edges_to_degrees /local/edges.tsv -
30
+ Degree 1 2
31
+ Degree 2 2
32
+ Degree 3 1
33
+ Degree 4 2
34
+ Degree 5 3
35
+ Degree 6 2
36
+ Degree 7 2
37
+ Degree 8 3
38
+ Degree 9 1
39
+
40
+ Chain transformations together:
41
+
42
+ $ clusta --transform=edges_to_neighborhoods /local/edges.tsv - | clusta --transform=neighborhoods_to_degree_pairs - - | clusta --transform=degree_pairs_to_assortativities - -
43
+ Assortativity 1 2 1
44
+ Assortativity 1 3 1
45
+ Assortativity 2 1 1
46
+ Assortativity 2 2 4
47
+ Assortativity 2 3 5
48
+ Assortativity 3 1 1
49
+ Assortativity 3 2 5
50
+
51
+ And then leverage Wukong when you're ready:
52
+
53
+ $ clusta --run=hadoop --transform=edges_to_neighborhoods /hdfs/edges.tsv /hdfs/neighborhoods.tsv
54
+ I, [2012-03-03T21:00:39.992750 #25835] INFO -- : Launching hadoop!
55
+ I, [2012-03-03T21:00:39.992979 #25835] INFO -- : Running
56
+
57
+ /usr/lib/hadoop/bin/hadoop \
58
+ jar /usr/lib/hadoop/contrib/streaming/hadoop-*streaming*.jar \
59
+ -D mapred.job.name='clusta---spec/data/edges/undirected.unweighted.tsv----' \
60
+ -mapper '/usr/bin/ruby1.9.1 clusta --map --log_interval=10000 --log_seconds=30 --transform=edges_to_degrees' \
61
+ -reducer '/usr/bin/ruby1.9.1 clusta --reduce --log_interval=10000 --log_seconds=30 --transform=edges_to_degrees' \
62
+ -input 'spec/data/edges/undirected.unweighted.tsv' \
63
+ -output '-' \
64
+ -file '/home/user/projects/networks/clusta/bin/clusta'
65
+ ...
66
+
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.1
1
+ 0.0.2
data/bin/clusta CHANGED
@@ -4,32 +4,5 @@ $: << File.expand_path('../lib', File.dirname(__FILE__)) unless $:.include?(File
4
4
 
5
5
  require 'clusta'
6
6
 
7
- def usage
8
- "usage: #{File.basename(__FILE__)} --transform=TRANSFORM_NAME [ARGS ...]"
9
- end
7
+ Clusta::Runner.new(File.basename(__FILE__), ARGV.dup).run! if $0 == __FILE__
10
8
 
11
- def extract_transform_arg
12
- transform_arg = ARGV.find_all { |arg| arg =~ Clusta::Transforms::ARG_REGEXP }.first
13
- if transform_arg.nil?
14
- $stderr.puts(usage)
15
- exit(1)
16
- end
17
- # ARGV.delete_if { |arg| arg =~ Clusta::Transforms::ARG_REGEXP }
18
- transform_arg
19
- end
20
-
21
- def add_default_run_arg
22
- ARGV.unshift('--run=local') unless ARGV.detect { |arg| arg =~ /--run/ }
23
- end
24
-
25
- if $0 == __FILE__
26
- begin
27
- add_default_run_arg
28
- transform = Clusta::Transforms.from_arg(extract_transform_arg)
29
- script = Clusta::Transforms.script_for(transform)
30
- script.run
31
- rescue Clusta::Error => e
32
- $stderr.puts e.message
33
- exit(1)
34
- end
35
- end
@@ -14,11 +14,20 @@ module Clusta
14
14
  File.basename(path).gsub(/\.rb$/, '')
15
15
  end
16
16
 
17
+ def self.require_path path
18
+ File.join(File.dirname(path), File.basename(path).gsub(/\.rb$/, ''))
19
+ end
20
+
17
21
  Error = Class.new(StandardError)
22
+ ArgumentError = Class.new(Error)
18
23
  DirectednessMismatchError = Class.new(Error)
19
24
  AmbiguousArgumentsError = Class.new(Error)
25
+ NotImplementedError = Class.new(Error)
26
+ SortError = Class.new(Error)
20
27
 
21
- autoload :Geometry, 'clusta/geometry'
22
- autoload :Transforms, 'clusta/transforms'
23
-
28
+ autoload :Geometry, 'clusta/geometry'
29
+ autoload :Transforms, 'clusta/transforms'
30
+ autoload :Runner, 'clusta/runner'
31
+ autoload :Schema, 'clusta/schema'
32
+ autoload :Serialization, 'clusta/serialization'
24
33
  end
@@ -2,24 +2,69 @@ module Clusta
2
2
 
3
3
  module Geometry
4
4
 
5
- autoload :Element, 'clusta/geometry/element'
6
-
7
- ELEMENTS = []
5
+ def self.names
6
+ @names ||= {}
7
+ end
8
8
 
9
- def self.register_geometry name, path, geometries=nil
10
- autoload name, path
11
- self::ELEMENTS << name
9
+ def self.register_element klass, name=nil
10
+ if name
11
+ Wukong::RESOURCE_CLASS_MAP[name] = klass
12
+ else
13
+ klass.all_stream_names.each do |name|
14
+ Wukong::RESOURCE_CLASS_MAP[name] = klass
15
+ end
16
+ end
12
17
  end
13
18
 
19
+ def self.from_name name
20
+ begin
21
+ const_get(Clusta.classify(name))
22
+ rescue NameError => e
23
+ raise Error.new("No such transform: '#{name}'")
24
+ end
25
+ end
26
+
14
27
  Dir[File.join(File.dirname(__FILE__), "geometry/*.rb")].each do |path|
15
28
  require_name = Clusta.require_name(path)
16
- register_geometry Clusta.classify(require_name), "clusta/geometry/#{require_name}"
29
+ autoload Clusta.classify(require_name), "clusta/geometry/#{require_name}"
30
+ names[require_name] ||= {} unless require_name == 'all'
17
31
  end
18
32
 
19
33
  Dir[File.join(File.dirname(__FILE__), "geometry/directed/*.rb")].each do |path|
20
34
  require_name = Clusta.require_name(path)
21
- register_geometry ("Directed" + Clusta.classify(require_name)), "clusta/geometry/directed/#{require_name}"
35
+ autoload ("Directed" + Clusta.classify(require_name)), "clusta/geometry/directed/#{require_name}"
36
+ names[require_name] ||= {}
37
+ names[require_name][:directed] = true
22
38
  end
23
39
 
40
+ def self.listing
41
+ [].tap do |out|
42
+ out << "Known geometries:"
43
+ out << ''
44
+ names.keys.sort.each do |element_name|
45
+ element = from_name(element_name)
46
+ if names[element_name][:directed]
47
+ directed_element = from_name("directed_#{element_name}")
48
+ else
49
+ directed_element = nil
50
+ end
51
+
52
+ out << " #{element}"
53
+ stream_names = element.all_stream_names.sort
54
+ stream_names.concat(directed_element.all_stream_names.sort) if directed_element
55
+ out << " streams as: #{stream_names.uniq.join(', ')}"
56
+ out << ''
57
+ end
58
+ end.join("\n")
59
+ end
60
+
61
+ def self.load_from path
62
+ class_eval(File.read(path), path)
63
+ require_name = Clusta.require_name(path)
64
+ names[require_name] ||= {}
65
+ names[require_name][:directed] = true if require_name =~ /^directed_/
66
+ end
67
+
68
+
24
69
  end
25
70
  end
@@ -0,0 +1,3 @@
1
+ Dir[File.join(File.dirname(__FILE__), '**/*.rb')].each do |path|
2
+ require Clusta.require_path path
3
+ end
@@ -4,8 +4,8 @@ module Clusta
4
4
 
5
5
  class Assortativity < Element
6
6
 
7
- field :source_degree_value, :type => :int
8
- field :target_degree_value, :type => :int
7
+ key :source_degree_value, :type => :int
8
+ key :target_degree_value, :type => :int
9
9
  field :count, :type => :int
10
10
 
11
11
  def directed?
@@ -3,7 +3,9 @@ module Clusta
3
3
 
4
4
  class Degree < Element
5
5
 
6
- field :vertex_label
6
+ abbreviate 'D'
7
+
8
+ key :vertex_label
7
9
  field :degree, :type => :int
8
10
 
9
11
  def directed?
@@ -1,9 +1,9 @@
1
1
  module Clusta
2
2
  module Geometry
3
3
 
4
- class EdgeDegreePair < Element
5
- field :source_label
6
- field :target_label
4
+ class DegreePair < Element
5
+ key :source_label
6
+ key :target_label
7
7
  field :source_degree_value, :type => :int
8
8
  field :target_degree_value, :type => :int
9
9
 
@@ -3,7 +3,9 @@ module Clusta
3
3
 
4
4
  class DirectedDegree < Element
5
5
 
6
- field :vertex_label
6
+ abbreviate 'DD'
7
+
8
+ key :vertex_label
7
9
  field :in_degree, :type => :int
8
10
  field :out_degree, :type => :int
9
11
 
@@ -1,9 +1,10 @@
1
1
  module Clusta
2
2
  module Geometry
3
3
 
4
- class DirectedEdgeDegreePair < Element
5
- field :source_label
6
- field :target_label
4
+ class DirectedDegreePair < Element
5
+
6
+ key :source_label
7
+ key :target_label
7
8
  field :source_in_degree_value, :type => :int
8
9
  field :source_out_degree_value, :type => :int
9
10
  field :target_in_degree_value, :type => :int
@@ -2,6 +2,8 @@ module Clusta
2
2
  module Geometry
3
3
 
4
4
  class DirectedEdge < Edge
5
+
6
+ abbreviate 'DE'
5
7
 
6
8
  def directed?
7
9
  true
@@ -15,8 +17,8 @@ module Clusta
15
17
  DirectedDegree.new(target_label, 1, 0)
16
18
  end
17
19
 
18
- def arrow
19
- DirectedArrow.new(target_label, weight)
20
+ def neighbor
21
+ DirectedNeighbor.new(target_label, weight)
20
22
  end
21
23
 
22
24
 
@@ -1,7 +1,7 @@
1
1
  module Clusta
2
2
  module Geometry
3
3
 
4
- class DirectedArrow < Arrow
4
+ class DirectedNeighbor < Neighbor
5
5
  def directed
6
6
  true
7
7
  end
@@ -0,0 +1,31 @@
1
+ module Clusta
2
+ module Geometry
3
+
4
+ class DirectedNeighborhood < Neighborhood
5
+
6
+ def directed?
7
+ true
8
+ end
9
+
10
+ def degree_pairs
11
+ neighbors.map do |neighbor|
12
+ # This vertex's in-degree is not known to us; we just have
13
+ # its out-degree based on the size of this neighborhood.
14
+ #
15
+ # We don't know anything about each neighbor's degree other than that
16
+ # its in-degree is at least 1 b/c it's in this vertex's
17
+ # neighborhood.
18
+ DirectedDegreePair.new(label, neighbor.label, 0, size, 1, 0)
19
+ end
20
+ end
21
+
22
+ def reversed_degree_pairs
23
+ neighbors.map do |neighbor|
24
+ DirectedDegreePair.new(neighbor.label, label, 1, 0, 0, size)
25
+ end
26
+ end
27
+
28
+ end
29
+
30
+ end
31
+ end
@@ -3,8 +3,10 @@ module Clusta
3
3
 
4
4
  class Edge < Element
5
5
 
6
- field :source_label
7
- field :target_label
6
+ abbreviate 'E'
7
+
8
+ key :source_label
9
+ key :target_label
8
10
  field :weight, :optional => true
9
11
 
10
12
  def weighted?
@@ -48,8 +50,8 @@ module Clusta
48
50
  self.class.new(target_label, source_label, weight)
49
51
  end
50
52
 
51
- def arrow
52
- Arrow.new(target_label, weight)
53
+ def neighbor
54
+ Neighbor.new(target_label, weight)
53
55
  end
54
56
 
55
57
  end
@@ -3,129 +3,22 @@ module Clusta
3
3
 
4
4
  class Element
5
5
 
6
- attr_accessor :input_fields
6
+ include Clusta::Schema
7
+ include Clusta::Serialization
7
8
 
8
- @fields = []
9
- class << self ; attr_reader :fields ; end
10
-
11
- def self.inherited(subclass)
12
- subclass.instance_variable_set("@fields", @fields.dup)
13
- super
14
- end
15
-
16
- def self.field_names
17
- @fields.map { |field| field[:name].to_s }
18
- end
19
-
20
- def self.has_optional_field?
21
- @fields.any? { |field| field[:optional] }
22
- end
23
-
24
- def self.optional_field
25
- @fields.detect { |field| field[:optional] }
26
- end
27
-
28
- def self.from_string string
29
- return string unless string.is_a?(String)
30
- args = string.split(';')
31
- klass_name = args.shift
32
- raise ArgumentError.new("Elements instantiated from a string must match the format 'klass;[field1;[field2;]...]'") unless klass_name
33
- Wukong.class_from_resource(klass_name).new(*args)
34
- end
35
-
36
- def self.field name, options={}
37
- raise AmbiguousArgumentsError.new("Cannot define a second optional field #{name} because field #{optional_field[:name]} is already optional.") if has_optional_field?
38
- attr_reader name
39
- case options[:type]
40
- when :int
41
- define_method "#{name}=" do |val|
42
- instance_variable_set("@#{name}", val.to_i)
43
- end
44
- when :float
45
- define_method "#{name}=" do |val|
46
- instance_variable_set("@#{name}", val.to_f)
47
- end
48
- when :geometry
49
- define_method "#{name}=" do |val|
50
- instance_variable_set("@#{name}", self.class.from_string(val))
51
- end
52
- else
53
- define_method "#{name}=" do |val|
54
- instance_variable_set("@#{name}", val)
55
- end
56
- end
57
- @fields << options.merge(:name => name)
58
- end
59
-
60
- def fields
61
- self.class.fields
62
- end
63
-
64
- def self.input_fields name
65
- alias_method name, :input_fields
9
+ if defined?(Settings) && Settings[:serialize] == 'json'
10
+ include Clusta::Serialization::JSON
11
+ else
12
+ include Clusta::Serialization::TSV
66
13
  end
67
14
 
68
- def self.stream_name
69
- if defined?(Settings) && Settings[:full_class_names]
70
- to_s
71
- else
72
- to_s.split("::").last
73
- end
74
- end
75
-
76
- def stream_name
77
- self.class.stream_name
78
- end
79
-
80
- def initialize *args
81
- self.class.fields.each_with_index do |field, index|
82
- suffix = case index.to_s
83
- when /1$/ then 'st'
84
- when /2$/ then 'nd'
85
- when /3$/ then 'rd'
86
- else 'th'
87
- end
88
- case
89
- when field[:optional]
90
- self.send("#{field[:name]}=", args[index]) if args[index]
91
- when args[index].nil?
92
- raise ArgumentError.new("A #{self.class} requires a non-nil value for #{field[:name]} as its #{index}#{suffix} argument.")
93
- else
94
- self.send("#{field[:name]}=", args[index])
95
- end
96
- end
97
- self.set_input_fields(*(args[self.class.fields.size..-1] || []))
98
- end
99
-
100
- def set_input_fields *input_fields
101
- self.input_fields = input_fields.map do |field|
102
- if field =~ /^[A-Z].*;/
103
- self.class.from_string(field)
104
- else
105
- field
106
- end
107
- end
108
- end
109
-
110
- def output_fields
111
- input_fields.map(&:to_s)
112
- end
113
-
114
- def to_flat
115
- [stream_name].tap do |record|
116
- fields.each do |field|
117
- value = send(field[:name])
118
- record << value.to_s unless value.nil? && field[:optional]
119
- end
120
- end.concat(output_fields)
121
- end
122
-
123
- def to_s
124
- to_flat.join(';')
15
+ def self.inherited subclass
16
+ Clusta::Geometry.register_element(subclass)
17
+ super
125
18
  end
126
19
 
127
20
  end
128
-
21
+
129
22
  end
130
23
  end
131
24