piglet 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. data/.document +5 -0
  2. data/.gitignore +22 -0
  3. data/LICENSE +20 -0
  4. data/README.rdoc +293 -0
  5. data/Rakefile +50 -0
  6. data/bin/piglet +9 -0
  7. data/examples/analysis.rb +311 -0
  8. data/examples/scratch.rb +11 -0
  9. data/examples/spike1.rb +43 -0
  10. data/examples/spike2.rb +40 -0
  11. data/examples/test1.rb +3 -0
  12. data/examples/test2.rb +5 -0
  13. data/examples/test3.rb +4 -0
  14. data/lib/piglet/assignment.rb +13 -0
  15. data/lib/piglet/cogroup.rb +31 -0
  16. data/lib/piglet/cross.rb +22 -0
  17. data/lib/piglet/describe.rb +5 -0
  18. data/lib/piglet/distinct.rb +16 -0
  19. data/lib/piglet/dump.rb +5 -0
  20. data/lib/piglet/explain.rb +13 -0
  21. data/lib/piglet/field.rb +40 -0
  22. data/lib/piglet/field_expression_functions.rb +62 -0
  23. data/lib/piglet/field_function_expression.rb +19 -0
  24. data/lib/piglet/field_infix_expression.rb +17 -0
  25. data/lib/piglet/field_prefix_expression.rb +21 -0
  26. data/lib/piglet/field_rename.rb +11 -0
  27. data/lib/piglet/field_suffix_expression.rb +17 -0
  28. data/lib/piglet/filter.rb +13 -0
  29. data/lib/piglet/foreach.rb +19 -0
  30. data/lib/piglet/group.rb +21 -0
  31. data/lib/piglet/illustrate.rb +5 -0
  32. data/lib/piglet/interpreter.rb +108 -0
  33. data/lib/piglet/join.rb +20 -0
  34. data/lib/piglet/limit.rb +13 -0
  35. data/lib/piglet/load.rb +31 -0
  36. data/lib/piglet/load_and_store.rb +16 -0
  37. data/lib/piglet/order.rb +29 -0
  38. data/lib/piglet/relation.rb +177 -0
  39. data/lib/piglet/sample.rb +13 -0
  40. data/lib/piglet/split.rb +41 -0
  41. data/lib/piglet/store.rb +17 -0
  42. data/lib/piglet/storing.rb +13 -0
  43. data/lib/piglet/stream.rb +5 -0
  44. data/lib/piglet/union.rb +19 -0
  45. data/lib/piglet.rb +45 -0
  46. data/spec/piglet/field_spec.rb +130 -0
  47. data/spec/piglet/interpreter_spec.rb +413 -0
  48. data/spec/piglet/relation_spec.rb +79 -0
  49. data/spec/piglet/split_spec.rb +34 -0
  50. data/spec/piglet_spec.rb +7 -0
  51. data/spec/spec.opts +3 -0
  52. data/spec/spec_helper.rb +14 -0
  53. metadata +123 -0
@@ -0,0 +1,21 @@
1
module Piglet
  # An operator written in front of an expression, e.g. a unary minus or NOT.
  class FieldPrefixExpression # :nodoc:
    include FieldExpressionFunctions

    # operator::      rendered through string interpolation in #to_s
    # expression::    the operand, passed through +parenthesise+ in #to_s
    # space_between:: when truthy, one space separates operator and operand
    def initialize(operator, expression, space_between=true)
      @operator = operator
      @expression = expression
      @space_between = space_between
    end

    # Prefix expressions always report themselves as simple.
    def simple?
      true
    end

    # Renders the operator followed by the operand.
    #
    # NOTE(review): +parenthesise+ is not defined in this class; presumably it
    # is provided by FieldExpressionFunctions -- confirm there.
    def to_s
      separator = @space_between ? ' ' : ''
      "#{@operator}#{separator}#{parenthesise(@expression)}"
    end
  end
end
@@ -0,0 +1,11 @@
1
module Piglet
  # Pairs a field expression with the name it should be given in the output.
  class FieldRename # :nodoc:
    # new_name::         the name that follows AS
    # field_expression:: the expression being renamed
    def initialize(new_name, field_expression)
      @new_name = new_name
      @field_expression = field_expression
    end

    # Renders "<expression> AS <new name>".
    def to_s
      renamed = @field_expression.to_s
      renamed + " AS #{@new_name}"
    end
  end
end
@@ -0,0 +1,17 @@
1
module Piglet
  # An operator written after an expression.
  class FieldSuffixExpression # :nodoc:
    include FieldExpressionFunctions

    # operator::   rendered after the operand in #to_s
    # expression:: the operand, passed through +parenthesise+ in #to_s
    def initialize(operator, expression)
      @operator = operator
      @expression = expression
    end

    # Suffix expressions never report themselves as simple.
    def simple?
      false
    end

    # Renders the operand, a single space, then the operator.
    #
    # NOTE(review): +parenthesise+ is not defined in this class; presumably it
    # is provided by FieldExpressionFunctions -- confirm there.
    def to_s
      operand = parenthesise(@expression)
      "#{operand} #{@operator}"
    end
  end
end
@@ -0,0 +1,13 @@
1
module Piglet
  # The FILTER operator: one source relation and one filter expression.
  class Filter # :nodoc:
    include Relation

    # relation::   the relation being filtered (becomes the only source)
    # expression:: rendered after BY through string interpolation
    def initialize(relation, expression)
      @sources = [relation]
      @expression = expression
    end

    # Renders "FILTER <source alias> BY <expression>".
    def to_s
      source_alias = @sources.first.alias
      "FILTER #{source_alias} BY #{@expression}"
    end
  end
end
@@ -0,0 +1,19 @@
1
module Piglet
  # The FOREACH ... GENERATE operator.
  class Foreach # :nodoc:
    include Relation

    # relation::          the relation being iterated (becomes the only source)
    # field_expressions:: a single expression or a (possibly nested) array of
    #                     expressions; the list is flattened
    def initialize(relation, field_expressions)
      @sources = [relation]
      @field_expressions = [field_expressions].flatten
    end

    # Renders "FOREACH <source alias> GENERATE <expr>, <expr>, ...".
    def to_s
      # The alias is resolved before the expressions are rendered, as in the
      # original interpolation order.
      source_alias = @sources.first.alias
      generated = @field_expressions.map(&:to_s).join(', ')
      "FOREACH #{source_alias} GENERATE #{generated}"
    end
  end
end
@@ -0,0 +1,21 @@
1
module Piglet
  # The GROUP operator.
  class Group # :nodoc:
    include Relation

    # relation:: the relation being grouped (becomes the only source)
    # grouping:: list of grouping keys; must respond to size, join and first
    # options::  may be nil; only the :parallel entry is read
    def initialize(relation, grouping, options={})
      options ||= {}
      @sources = [relation]
      @grouping = grouping
      @parallel = options[:parallel]
    end

    # Renders "GROUP <alias> BY <key>" for one key, "GROUP <alias> BY (a, b)"
    # for several, followed by " PARALLEL <n>" when :parallel was given.
    def to_s
      source_alias = @sources.first.alias
      keys = @grouping.size > 1 ? "(#{@grouping.join(', ')})" : @grouping.first.to_s
      statement = "GROUP #{source_alias} BY #{keys}"
      statement << " PARALLEL #{@parallel}" if @parallel
      statement
    end
  end
end
@@ -0,0 +1,5 @@
1
module Piglet
  # The ILLUSTRATE statement. The class adds nothing of its own: constructor,
  # +relation+ reader and +to_s+ all come from the Storing mixin.
  class Illustrate # :nodoc:
    include Storing
  end
end
@@ -0,0 +1,108 @@
1
+ require 'set'
2
+
3
+
4
module Piglet
  # Evaluates a block written in the Piglet DSL and renders the statements it
  # declares as Pig Latin.
  #
  # Only output statements (STORE, DUMP, ILLUSTRATE, DESCRIBE, EXPLAIN) are
  # recorded; the relations they depend on are discovered by following each
  # relation's +sources+ when #to_pig_latin is called.
  class Interpreter
    # Creates an interpreter and, when a block is given, interprets it
    # immediately.
    def initialize(&block)
      @stores = [ ]

      interpret(&block) if block_given?
    end

    # Evaluates the block with this interpreter as +self+, so the DSL methods
    # below can be called without a receiver. Returns the interpreter.
    def interpret(&block)
      instance_eval(&block) if block_given?

      self
    end

    # Renders every recorded output statement, each preceded by the
    # assignments of the relations it needs, as ";\n"-terminated statements.
    # Returns an empty string when nothing has been recorded.
    #
    # A relation is assigned at most once, even when it is reachable through
    # several paths (e.g. a relation unioned with a filter of itself) or
    # is used by several output statements.
    def to_pig_latin
      return '' if @stores.empty?

      handled_relations = Set.new
      statements = [ ]

      @stores.each do |store|
        unless store.relation.nil?
          assignments(store.relation, handled_relations).each do |assignment|
            # Set#add? returns nil when the target is already present. That
            # filters out the duplicates #assignments produces for relations
            # reachable through more than one path.
            statements << assignment if handled_relations.add?(assignment.target)
          end
        end
        statements << store
      end

      statements.flatten.map { |s| s.to_s }.join(";\n") + ";\n"
    end

  protected

    # LOAD
    #
    #   load('some/path') # => LOAD 'some/path'
    #   load('some/path', :using => 'Xyz') # => LOAD 'some/path' USING Xyz
    #   load('some/path', :using => :pig_storage) # => LOAD 'some/path' USING PigStorage
    #   load('some/path', :schema => [:a, :b]) # => LOAD 'some/path' AS (a, b)
    #   load('some/path', :schema => %w(a b c d)) # => LOAD 'some/path' AS (a, b, c, d)
    #   load('some/path', :schema => [%w(a chararray), %w(b int)]) # => LOAD 'some/path' AS (a:chararray, b:int)
    #
    #--
    #
    # NOTE: the syntax load('path', :schema => {:a => :chararray, :b => :int})
    # would be nice, but the order of the keys can't be guaranteed in Ruby 1.8.
    def load(path, options={})
      Load.new(path, options)
    end

    # STORE
    #
    #   store(x, 'some/path') # => STORE x INTO 'some/path'
    #   store(x, 'some/path', :using => 'Xyz') # => STORE x INTO 'some/path' USING Xyz
    #   store(x, 'some/path', :using => :pig_storage) # => STORE x INTO 'some/path' USING PigStorage
    def store(relation, path, options={})
      @stores << Store.new(relation, path, options)
    end

    # DUMP
    #
    #   dump(x) # => DUMP x
    def dump(relation)
      @stores << Dump.new(relation)
    end

    # ILLUSTRATE
    #
    #   illustrate(x) # => ILLUSTRATE x
    def illustrate(relation)
      @stores << Illustrate.new(relation)
    end

    # DESCRIBE
    #
    #   describe(x) # => DESCRIBE x
    def describe(relation)
      @stores << Describe.new(relation)
    end

    # EXPLAIN
    #
    #   explain    # => EXPLAIN
    #   explain(x) # => EXPLAIN(x)
    def explain(relation=nil)
      @stores << Explain.new(relation)
    end

  private

    # Returns the assignments needed to define +relation+, sources first and
    # the relation itself last. Relations already in +ignore_set+ contribute
    # nothing. The set is not modified here, so a relation reachable through
    # several paths appears several times in the result; #to_pig_latin drops
    # the repeats.
    def assignments(relation, ignore_set)
      return [] if ignore_set.include?(relation)
      assignment = Assignment.new(relation)
      if relation.sources
        (relation.sources.map { |source| assignments(source, ignore_set) } + [assignment]).flatten
      else
        [assignment]
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
module Piglet
  # The JOIN operator.
  class Join # :nodoc:
    include Relation

    # relation::    accepted for symmetry with the other operators but not
    #               read; the joined relations are taken from +description+
    # description:: a Hash whose Relation keys map each joined relation to its
    #               join field, and which may also carry :using and :parallel
    def initialize(relation, description)
      # Build the relation => field map entry by entry. The previous
      # Hash[*select{...}.flatten] form relied on the version-specific return
      # type of Hash#select, and where that is an array of pairs it also
      # flattened array-valued join fields.
      @join_fields = {}
      description.each do |key, value|
        @join_fields[key] = value if key.is_a?(Relation)
      end
      @sources = @join_fields.keys
      @using = description[:using]
      @parallel = description[:parallel]
    end

    # Renders "JOIN <alias> BY <field>, ..." followed by the optional
    # USING "<strategy>" and PARALLEL <n> clauses.
    def to_s
      joins = @sources.map { |s| "#{s.alias} BY #{@join_fields[s]}" }.join(', ')
      str = "JOIN #{joins}"
      str << " USING \"#{@using.to_s}\"" if @using
      str << " PARALLEL #{@parallel}" if @parallel
      str
    end
  end
end
@@ -0,0 +1,13 @@
1
module Piglet
  # The LIMIT operator.
  class Limit # :nodoc:
    include Relation

    # relation:: the relation being limited (becomes the only source)
    # n::        rendered after the alias through string interpolation
    def initialize(relation, n)
      @sources = [relation]
      @n = n
    end

    # Renders "LIMIT <source alias> <n>".
    def to_s
      source_alias = @sources.first.alias
      "LIMIT #{source_alias} #{@n}"
    end
  end
end
@@ -0,0 +1,31 @@
1
module Piglet
  # The LOAD operator. A Load never sets +sources+, so it is a leaf when the
  # interpreter walks relation dependencies.
  class Load # :nodoc:
    include Relation
    include LoadAndStore

    # path::    inserted between single quotes after LOAD
    # options:: may be nil; reads :using (load function) and :schema (list of
    #           field names or [name, type] pairs)
    def initialize(path, options={})
      options ||= {}
      @path = path
      @using = options[:using]
      @schema = options[:schema]
    end

    # Renders "LOAD '<path>'" followed by the optional USING and AS clauses.
    def to_s
      clauses = ["LOAD '#{@path}'"]
      clauses << "USING #{resolve_load_store_function(@using)}" if @using
      clauses << "AS (#{schema_string})" if @schema
      clauses.join(' ')
    end

  private

    # Comma-separated schema fields. Enumerable fields have their parts joined
    # with ':'; anything else is rendered with to_s.
    def schema_string
      rendered = @schema.map do |field|
        field.is_a?(Enumerable) ? field.map(&:to_s).join(':') : field.to_s
      end
      rendered.join(', ')
    end
  end
end
@@ -0,0 +1,16 @@
1
module Piglet
  # Shared by the load and store statements: maps short symbolic names to the
  # names of Pig's built-in load/store functions.
  module LoadAndStore # :nodoc:
    LOAD_STORE_FUNCTIONS = {
      :binary_serializer   => 'BinarySerializer',
      :binary_deserializer => 'BinaryDeserializer',
      :bin_storage         => 'BinStorage',
      :pig_storage         => 'PigStorage',
      :pig_dump            => 'PigDump',
      :text_loader         => 'TextLoader'
    }

    # Returns the function name registered for +name+, or +name+ rendered with
    # to_s when it is not one of the known keys.
    def resolve_load_store_function(name)
      LOAD_STORE_FUNCTIONS.fetch(name) { name.to_s }
    end
  end
end
@@ -0,0 +1,29 @@
1
module Piglet
  # The ORDER operator.
  class Order # :nodoc:
    include Relation

    # relation:: the relation being ordered (becomes the only source)
    # fields::   one field or an Enumerable of fields; each field is either a
    #            name or a [name, direction] pair
    # options::  may be nil; only the :parallel entry is read
    def initialize(relation, fields, options={})
      options ||= {}
      @sources, @parallel = [relation], options[:parallel]
      @fields = fields.is_a?(Enumerable) ? fields : [fields]
    end

    # Renders "ORDER <alias> BY <field> [ASC|DESC], ..." followed by
    # " PARALLEL <n>" when :parallel was given. The option used to be stored
    # but never rendered; Group and Join already emit it this way.
    def to_s
      str = "ORDER #{@sources.first.alias} BY #{field_strings}"
      str << " PARALLEL #{@parallel}" if @parallel
      str
    end

  private

    # Comma-separated rendering of all order fields.
    def field_strings
      @fields.map { |f| field_string(f) }.join(', ')
    end

    # A [name, direction] pair becomes "name DIRECTION" (direction upcased);
    # anything else is rendered with to_s.
    def field_string(f)
      if f.is_a?(Enumerable)
        "#{f[0]} #{f[1].to_s.upcase}"
      else
        f.to_s
      end
    end
  end
end
@@ -0,0 +1,177 @@
1
module Piglet
  # Mixin that gives an object the Pig Latin relational operators. Every
  # operator returns a new object describing the operation; nothing is
  # evaluated here.
  #
  # Any zero-argument method call whose name is a plain word and which is not
  # otherwise defined is treated as a reference to a field of the relation and
  # returns a Field.
  module Relation
    # Conversion hooks that Ruby itself probes for (Array#flatten asks each
    # element for +to_ary+, splat asks for +to_a+, and so on). They must never
    # be answered with a Field, otherwise flattening or splatting a list of
    # relations raises a TypeError.
    NON_FIELD_METHODS = [:to_a, :to_ary, :to_hash, :to_int, :to_io, :to_proc, :to_str].freeze

    attr_reader :sources

    # Returns the next name of the form "relation_<n>", counting up from 1.
    # The counter lives on the Relation module itself, so names are unique
    # across all relations in the process.
    def self.next_alias
      @counter ||= 0
      @counter += 1
      "relation_#{@counter}"
    end

    # The name this relation will get in Pig Latin. The name is generated the
    # first time it is asked for, and will be unique.
    def alias
      @alias ||= Relation.next_alias
    end

    # GROUP
    #
    #   x.group(:a)                           # => GROUP x BY a
    #   x.group(:a, :b, :c)                   # => GROUP x BY (a, b, c)
    #   x.group([:a, :b, :c], :parallel => 3) # => GROUP x BY (a, b, c) PARALLEL 3
    def group(*args)
      grouping, options = split_at_options(args)
      Group.new(self, [grouping].flatten, options)
    end

    # DISTINCT
    #
    #   x.distinct                 # => DISTINCT x
    #   x.distinct(:parallel => 5) # => DISTINCT x PARALLEL 5
    def distinct(options={})
      Distinct.new(self, options)
    end

    # COGROUP
    #
    #   x.cogroup(x => :a, y => :b)                 # => COGROUP x BY a, y BY b
    #   x.cogroup(x => :a, y => :b, z => :c)        # => COGROUP x BY a, y BY b, z BY c
    #   x.cogroup(x => [:a, :b], y => [:c, :d])     # => COGROUP x BY (a, b), y BY (c, d)
    #   x.cogroup(x => :a, y => [:b, :inner])       # => COGROUP x BY a, y BY b INNER
    #   x.cogroup(x => :a, y => :b, :parallel => 5) # => COGROUP x BY a, y BY b PARALLEL 5
    def cogroup(description)
      Cogroup.new(self, description)
    end

    # CROSS
    #
    #   x.cross(y)                      # => CROSS x, y
    #   x.cross(y, z, w)                # => CROSS x, y, z, w
    #   x.cross([y, z], :parallel => 5) # => CROSS x, y, z PARALLEL 5
    def cross(*args)
      relations, options = split_at_options(args)
      Cross.new(([self] + relations).flatten, options)
    end

    # FILTER
    #
    #   x.filter { |r| r.a == r.b }            # => FILTER x BY a == b
    #   x.filter { |r| r.a > r.b && r.c != 3 } # => FILTER x BY a > b AND c != 3
    def filter
      Filter.new(self, yield(self))
    end

    # FOREACH ... GENERATE
    #
    #   x.foreach { |r| r.a }            # => FOREACH x GENERATE a
    #   x.foreach { |r| [r.a, r.b] }     # => FOREACH x GENERATE a, b
    #   x.foreach { |r| r.a.max }        # => FOREACH x GENERATE MAX(a)
    #   x.foreach { |r| r.a.avg.as(:b) } # => FOREACH x GENERATE AVG(a) AS b
    #
    #--
    #
    # TODO: FOREACH a { b GENERATE c }
    def foreach
      Foreach.new(self, yield(self))
    end

    # JOIN
    #
    #   x.join(x => :a, y => :b)                        # => JOIN x BY a, y BY b
    #   x.join(x => :a, y => :b, z => :c)               # => JOIN x BY a, y BY b, z BY c
    #   x.join(x => :a, y => :b, :using => :replicated) # => JOIN x BY a, y BY b USING "replicated"
    #   x.join(x => :a, y => :b, :parallel => 5)        # => JOIN x BY a, y BY b PARALLEL 5
    def join(description)
      Join.new(self, description)
    end

    # LIMIT
    #
    #   x.limit(10) # => LIMIT x 10
    def limit(n)
      Limit.new(self, n)
    end

    # ORDER
    #
    #   x.order(:a)                      # => ORDER x BY a
    #   x.order(:a, :b)                  # => ORDER x BY a, b
    #   x.order([:a, :asc], [:b, :desc]) # => ORDER x BY a ASC, b DESC
    #   x.order(:a, :parallel => 5)      # => ORDER x BY a PARALLEL 5
    #
    #--
    #
    # NOTE: the syntax x.order(:a => :asc, :b => :desc) would be nice, but in
    # Ruby 1.8 the order of the keys cannot be guaranteed.
    def order(*args)
      fields, options = split_at_options(args)
      fields = *fields
      Order.new(self, fields, options)
    end

    # SAMPLE
    #
    #   x.sample(5) # => SAMPLE x 5
    def sample(n)
      Sample.new(self, n)
    end

    # SPLIT
    #
    #   y, z = x.split { |r| [r.a <= 3, r.b > 4] } # => SPLIT x INTO y IF a <= 3, z IF b > 4
    def split
      Split.new(self, yield(self)).shards
    end

    # STREAM
    #
    #   x.stream(x, 'cut -f 3')                         # => STREAM x THROUGH `cut -f 3`
    #   x.stream([x, y], 'cut -f 3')                    # => STREAM x, y THROUGH `cut -f 3`
    #   x.stream(x, 'cut -f 3', :schema => [%w(a int)]) # => STREAM x THROUGH `cut -f 3` AS (a:int)
    #
    # Not implemented yet: always raises NotSupportedError.
    #
    #--
    #
    # TODO: how to handle DEFINE'd commands?
    def stream(relations, command, options={})
      raise NotSupportedError
    end

    # UNION
    #
    #   x.union(y)    # => UNION x, y
    #   x.union(y, z) # => UNION x, y, z
    def union(*relations)
      Union.new(*([self] + relations))
    end

    # Treats an otherwise undefined zero-argument call with a plain-word name
    # as a field reference and returns a Field for it. Calls with arguments,
    # names containing other characters (e.g. ending in "=" or "?") and Ruby's
    # conversion hooks (see NON_FIELD_METHODS) are passed to +super+, which
    # raises NoMethodError.
    def method_missing(name, *args)
      if args.empty? && name.to_s =~ /\A\w+\z/ && !NON_FIELD_METHODS.include?(name.to_sym)
        Field.new(name, self)
      else
        super
      end
    end

    # Keeps respond_to? consistent with #method_missing: true for every name
    # that would be answered with a Field.
    def respond_to_missing?(name, include_private=false)
      (name.to_s =~ /\A\w+\z/ && !NON_FIELD_METHODS.include?(name.to_sym)) ? true : super
    end

    # Positional field reference.
    #
    #   x[0] # => $0
    def [](n)
      Field.new("\$#{n}", self)
    end

    # Relations hash by alias, so they can be used as Hash keys and Set
    # members. Note that asking for the hash generates the alias.
    def hash
      self.alias.hash
    end

    # Two relations are eql? when they have the same alias.
    def eql?(other)
      other.is_a?(Relation) && other.alias == self.alias
    end

  private

    # Splits an argument list into [positional arguments, options], where
    # options is the trailing Hash, or nil when the last argument is not a
    # Hash.
    def split_at_options(parameters)
      if parameters.last.is_a? Hash
        [parameters[0..-2], parameters.last]
      else
        [parameters, nil]
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module Piglet
  # The SAMPLE operator.
  class Sample # :nodoc:
    include Relation

    # relation:: the relation being sampled (becomes the only source)
    # n::        rendered after the alias through string interpolation
    def initialize(relation, n)
      @sources = [relation]
      @n = n
    end

    # Renders "SAMPLE <source alias> <n>".
    def to_s
      source_alias = @sources.first.alias
      "SAMPLE #{source_alias} #{@n}"
    end
  end
end
@@ -0,0 +1,41 @@
1
module Piglet
  # The SPLIT operator. One RelationShard is created per condition; the shards
  # are the relations that user code continues to work with.
  class Split # :nodoc:
    include Relation

    # relation::    the relation being split (becomes the only source)
    # expressions:: list of conditions, one per output shard
    def initialize(relation, expressions)
      @sources, @expressions = [relation], expressions
      @shard_pairs = create_shards
    end

    # The output relations, in the same order as the conditions were given.
    def shards
      @shard_pairs.map { |shard, _expression| shard }
    end

    # Renders "SPLIT <alias> INTO <shard> IF <condition>, ...".
    def to_s
      "SPLIT #{@sources.first.alias} INTO #{split_strings}"
    end

  private

    # Pairs every condition with a fresh shard. An ordered array of pairs is
    # used instead of a Hash built with Hash[*pairs.flatten]: the deep flatten
    # probes shards and conditions for array conversion, and Hash key order is
    # not guaranteed on Ruby 1.8, which would break
    # "y, z = x.split { ... }".
    def create_shards
      @expressions.map { |expr| [RelationShard.new(self), expr] }
    end

    # Comma-separated "<shard alias> IF <condition>" clauses.
    def split_strings
      @shard_pairs.map { |shard, expression| "#{shard.alias} IF #{expression}" }.join(', ')
    end
  end

  # One output of a Split. Its only source is the Split that created it, and
  # it renders as its own alias.
  class RelationShard # :nodoc:
    include Relation

    def initialize(split)
      @sources = [split]
    end

    def to_s
      self.alias
    end
  end
end
@@ -0,0 +1,17 @@
1
module Piglet
  # The STORE statement.
  class Store # :nodoc:
    include LoadAndStore
    include Storing

    # relation:: the relation to store
    # path::     inserted between single quotes after INTO
    # options::  only the :using entry (store function) is read
    def initialize(relation, path, options={})
      @relation = relation
      @path = path
      @using = options[:using]
    end

    # Extends the inherited rendering with " INTO '<path>'" and, when a store
    # function was given, " USING <function>".
    def to_s
      head = super
      using_clause = @using ? " USING #{resolve_load_store_function(@using)}" : ''
      "#{head} INTO '#{@path}'#{using_clause}"
    end
  end
end
@@ -0,0 +1,13 @@
1
module Piglet
  # Shared behaviour of statements that act on a single relation. The Pig
  # Latin keyword is derived from the name of the including class.
  module Storing # :nodoc:
    attr_reader :relation

    def initialize(relation)
      @relation = relation
    end

    # Renders "<KEYWORD> <relation alias>", where the keyword is the last
    # segment of the class name, upcased (Piglet::Dump gives "DUMP").
    def to_s
      keyword = self.class.name.split(/::/).last.upcase
      "#{keyword} #{@relation.alias}"
    end
  end
end
@@ -0,0 +1,5 @@
1
module Piglet
  # Placeholder for the STREAM operator. It only mixes in Relation and
  # defines no constructor or rendering of its own.
  class Stream # :nodoc:
    include Relation
  end
end
@@ -0,0 +1,19 @@
1
module Piglet
  # The UNION operator.
  class Union # :nodoc:
    include Relation

    # relations:: every relation taking part in the union; all become sources
    def initialize(*relations)
      @sources = relations
    end

    # Renders "UNION <alias>, <alias>, ...".
    def to_s
      aliases = @sources.map(&:alias)
      "UNION #{aliases.join(', ')}"
    end
  end
end
data/lib/piglet.rb ADDED
@@ -0,0 +1,45 @@
1
module Piglet
  VERSION = '0.1.0'

  # Registers an autoload for every file under piglet/. The constant name is
  # the file name with each underscore-separated part capitalised, e.g.
  # "field_infix_expression" is registered as FieldInfixExpression and loaded
  # from "piglet/field_infix_expression".
  %w[
    assignment
    cogroup
    cross
    describe
    distinct
    dump
    explain
    field
    field_expression_functions
    field_function_expression
    field_infix_expression
    field_prefix_expression
    field_rename
    field_suffix_expression
    filter
    foreach
    group
    illustrate
    interpreter
    join
    limit
    load
    load_and_store
    order
    relation
    sample
    split
    store
    storing
    stream
    union
  ].each do |file_name|
    constant_name = file_name.split('_').map(&:capitalize).join.to_sym
    autoload constant_name, "piglet/#{file_name}"
  end

  # Raised by operations that are declared in the DSL but not implemented.
  class NotSupportedError < StandardError; end
end