piglet 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. data/.document +5 -0
  2. data/.gitignore +22 -0
  3. data/LICENSE +20 -0
  4. data/README.rdoc +293 -0
  5. data/Rakefile +50 -0
  6. data/bin/piglet +9 -0
  7. data/examples/analysis.rb +311 -0
  8. data/examples/scratch.rb +11 -0
  9. data/examples/spike1.rb +43 -0
  10. data/examples/spike2.rb +40 -0
  11. data/examples/test1.rb +3 -0
  12. data/examples/test2.rb +5 -0
  13. data/examples/test3.rb +4 -0
  14. data/lib/piglet/assignment.rb +13 -0
  15. data/lib/piglet/cogroup.rb +31 -0
  16. data/lib/piglet/cross.rb +22 -0
  17. data/lib/piglet/describe.rb +5 -0
  18. data/lib/piglet/distinct.rb +16 -0
  19. data/lib/piglet/dump.rb +5 -0
  20. data/lib/piglet/explain.rb +13 -0
  21. data/lib/piglet/field.rb +40 -0
  22. data/lib/piglet/field_expression_functions.rb +62 -0
  23. data/lib/piglet/field_function_expression.rb +19 -0
  24. data/lib/piglet/field_infix_expression.rb +17 -0
  25. data/lib/piglet/field_prefix_expression.rb +21 -0
  26. data/lib/piglet/field_rename.rb +11 -0
  27. data/lib/piglet/field_suffix_expression.rb +17 -0
  28. data/lib/piglet/filter.rb +13 -0
  29. data/lib/piglet/foreach.rb +19 -0
  30. data/lib/piglet/group.rb +21 -0
  31. data/lib/piglet/illustrate.rb +5 -0
  32. data/lib/piglet/interpreter.rb +108 -0
  33. data/lib/piglet/join.rb +20 -0
  34. data/lib/piglet/limit.rb +13 -0
  35. data/lib/piglet/load.rb +31 -0
  36. data/lib/piglet/load_and_store.rb +16 -0
  37. data/lib/piglet/order.rb +29 -0
  38. data/lib/piglet/relation.rb +177 -0
  39. data/lib/piglet/sample.rb +13 -0
  40. data/lib/piglet/split.rb +41 -0
  41. data/lib/piglet/store.rb +17 -0
  42. data/lib/piglet/storing.rb +13 -0
  43. data/lib/piglet/stream.rb +5 -0
  44. data/lib/piglet/union.rb +19 -0
  45. data/lib/piglet.rb +45 -0
  46. data/spec/piglet/field_spec.rb +130 -0
  47. data/spec/piglet/interpreter_spec.rb +413 -0
  48. data/spec/piglet/relation_spec.rb +79 -0
  49. data/spec/piglet/split_spec.rb +34 -0
  50. data/spec/piglet_spec.rb +7 -0
  51. data/spec/spec.opts +3 -0
  52. data/spec/spec_helper.rb +14 -0
  53. metadata +123 -0
@@ -0,0 +1,21 @@
module Piglet
  # An expression rendered as an operator followed by its operand.
  class FieldPrefixExpression # :nodoc:
    include FieldExpressionFunctions

    # operator::      text rendered in front of the operand
    # expression::    the operand; passed through +parenthesise+ when rendered
    # space_between:: when true (the default) one space separates the operator
    #                 from the operand, otherwise they are written back to back
    def initialize(operator, expression, space_between=true)
      @operator = operator
      @expression = expression
      @space_between = space_between
    end

    # Prefix expressions always report themselves as simple.
    def simple?
      true
    end

    # Renders the operator, the optional separating space and the operand.
    def to_s
      separator = @space_between ? ' ' : ''
      "#{@operator}#{separator}#{parenthesise(@expression)}"
    end
  end
end
@@ -0,0 +1,11 @@
module Piglet
  # Pairs a field expression with the name it should be given via +AS+.
  class FieldRename # :nodoc:
    # new_name::         the name placed after +AS+
    # field_expression:: the expression being renamed
    def initialize(new_name, field_expression)
      @new_name = new_name
      @field_expression = field_expression
    end

    # Renders "<expression> AS <new name>".
    def to_s
      format('%s AS %s', @field_expression, @new_name)
    end
  end
end
@@ -0,0 +1,17 @@
module Piglet
  # An expression rendered as an operand followed by an operator.
  class FieldSuffixExpression # :nodoc:
    include FieldExpressionFunctions

    # operator::   text rendered after the operand
    # expression:: the operand; passed through +parenthesise+ when rendered
    def initialize(operator, expression)
      @operator = operator
      @expression = expression
    end

    # Suffix expressions never report themselves as simple.
    def simple?
      false
    end

    # Renders the operand, one space, and then the operator.
    def to_s
      operand = parenthesise(@expression)
      "#{operand} #{@operator}"
    end
  end
end
@@ -0,0 +1,13 @@
module Piglet
  # FILTER statement: a single source relation plus the filter expression.
  class Filter # :nodoc:
    include Relation

    # relation::   the relation being filtered (becomes the only source)
    # expression:: the condition rendered after +BY+
    def initialize(relation, expression)
      @sources = [relation]
      @expression = expression
    end

    # Renders "FILTER <source alias> BY <expression>".
    def to_s
      source = @sources.first
      "FILTER #{source.alias} BY #{@expression}"
    end
  end
end
@@ -0,0 +1,19 @@
module Piglet
  # FOREACH ... GENERATE statement over a single source relation.
  class Foreach # :nodoc:
    include Relation

    # relation::          the relation being iterated (becomes the only source)
    # field_expressions:: one expression or an array of expressions; it is
    #                     always normalised to a flat array
    def initialize(relation, field_expressions)
      @sources = [relation]
      @field_expressions = [field_expressions].flatten
    end

    # Renders "FOREACH <source alias> GENERATE <expr>, <expr>, ...".
    def to_s
      source = @sources.first
      "FOREACH #{source.alias} GENERATE #{generated_fields}"
    end

    private

    # The comma separated list of generated expressions.
    def generated_fields
      rendered = @field_expressions.map { |expression| expression.to_s }
      rendered.join(', ')
    end
  end
end
@@ -0,0 +1,21 @@
module Piglet
  # GROUP statement over a single source relation.
  class Group # :nodoc:
    include Relation

    # relation:: the relation being grouped (becomes the only source)
    # grouping:: collection of grouping keys
    # options::  may contain +:parallel+; +nil+ is treated as no options
    def initialize(relation, grouping, options={})
      opts = options || {}
      @sources = [relation]
      @grouping = grouping
      @parallel = opts[:parallel]
    end

    # Renders "GROUP <alias> BY <key>" for a single key or
    # "GROUP <alias> BY (<k1>, <k2>)" for several, followed by an optional
    # PARALLEL clause.
    def to_s
      parts = ["GROUP #{@sources.first.alias} BY "]
      parts << (@grouping.size > 1 ? "(#{@grouping.join(', ')})" : @grouping.first.to_s)
      parts << " PARALLEL #{@parallel}" if @parallel
      parts.join
    end
  end
end
@@ -0,0 +1,5 @@
module Piglet
  # ILLUSTRATE statement. The class adds nothing of its own: construction and
  # rendering are taken from the Storing mixin.
  class Illustrate # :nodoc:
    include Storing
  end
end
@@ -0,0 +1,108 @@
1
+ require 'set'
2
+
3
+
module Piglet
  # Evaluates a block of Piglet DSL code and renders the statements it
  # registered (STORE, DUMP, ILLUSTRATE, DESCRIBE, EXPLAIN) as Pig Latin,
  # preceded by the assignments of every relation those statements depend on.
  class Interpreter
    # Creates an interpreter and, when a block is given, interprets it
    # straight away.
    def initialize(&block)
      @stores = [ ]

      interpret(&block) if block_given?
    end

    # Evaluates the block in the context of this interpreter, so that the DSL
    # methods below (+load+, +store+, +dump+ ...) can be called without a
    # receiver. Returns the interpreter itself.
    def interpret(&block)
      instance_eval(&block) if block_given?

      self
    end

    # Renders everything registered so far as Pig Latin. Each statement is
    # terminated by ";\n". Returns an empty string when nothing has been
    # registered.
    #
    # For every registered statement the assignments of the relations it
    # depends on are emitted first, sources before the relations built from
    # them. Every relation is assigned exactly once, even when it is reachable
    # through several paths (for example <code>b.union(c)</code> where both
    # +b+ and +c+ derive from the same +a+) or is shared by several statements.
    def to_pig_latin
      return '' if @stores.empty?

      handled_relations = Set.new
      statements = [ ]

      @stores.each do |store|
        unless store.relation.nil?
          assignments(store.relation, handled_relations).each do |assignment|
            # Set#add? returns nil when the relation has already been emitted,
            # which drops the duplicates that a shared source produces.
            statements << assignment if handled_relations.add?(assignment.target)
          end
        end
        statements << store
      end

      statements.map { |s| s.to_s }.join(";\n") + ";\n"
    end

  protected

    # LOAD
    #
    #   load('some/path') # => LOAD 'some/path'
    #   load('some/path', :using => 'Xyz') # => LOAD 'some/path' USING Xyz
    #   load('some/path', :using => :pig_storage) # => LOAD 'some/path' USING PigStorage
    #   load('some/path', :schema => [:a, :b]) # => LOAD 'some/path' AS (a, b)
    #   load('some/path', :schema => %w(a b c d)) # => LOAD 'some/path' AS (a, b, c, d)
    #   load('some/path', :schema => [%w(a chararray), %w(b int)]) # => LOAD 'some/path' AS (a:chararray, b:int)
    #
    #--
    #
    # NOTE: the syntax load('path', :schema => {:a => :chararray, :b => :int})
    # would be nice, but the order of the keys can't be guaranteed in Ruby 1.8.
    def load(path, options={})
      Load.new(path, options)
    end

    # STORE
    #
    #   store(x, 'some/path') # => STORE x INTO 'some/path'
    #   store(x, 'some/path', :using => 'Xyz') # => STORE x INTO 'some/path' USING Xyz
    #   store(x, 'some/path', :using => :pig_storage) # => STORE x INTO 'some/path' USING PigStorage
    def store(relation, path, options={})
      @stores << Store.new(relation, path, options)
    end

    # DUMP
    #
    #   dump(x) # => DUMP x
    def dump(relation)
      @stores << Dump.new(relation)
    end

    # ILLUSTRATE
    #
    #   illustrate(x) # => ILLUSTRATE x
    def illustrate(relation)
      @stores << Illustrate.new(relation)
    end

    # DESCRIBE
    #
    #   describe(x) # => DESCRIBE x
    def describe(relation)
      @stores << Describe.new(relation)
    end

    # EXPLAIN
    #
    #   explain    # => EXPLAIN
    #   explain(x) # => EXPLAIN(x)
    def explain(relation=nil)
      @stores << Explain.new(relation)
    end

  private

    # Returns the assignments needed to define +relation+, sources first and
    # the relation's own assignment last. Relations found in +ignore_set+ (and
    # therefore everything upstream of them) are skipped. The result may list
    # a shared source more than once; the caller removes those duplicates.
    def assignments(relation, ignore_set)
      return [] if ignore_set.include?(relation)
      assignment = Assignment.new(relation)
      if relation.sources
        (relation.sources.map { |source| assignments(source, ignore_set) } + [assignment]).flatten
      else
        [assignment]
      end
    end
  end
end
@@ -0,0 +1,20 @@
module Piglet
  # JOIN statement built from a description hash that maps relations to the
  # field they are joined on, plus the optional +:using+ and +:parallel+ keys.
  class Join # :nodoc:
    include Relation

    # relation::    accepted for symmetry with the other statements; the
    #               joined relations are taken from +description+ only
    # description:: hash whose Relation keys map to join fields; the +:using+
    #               and +:parallel+ entries are read as options
    def initialize(relation, description)
      @join_fields = description.inject({}) do |fields, (key, value)|
        fields[key] = value if key.is_a?(Relation)
        fields
      end
      @sources = @join_fields.keys
      @using = description[:using]
      @parallel = description[:parallel]
    end

    # Renders "JOIN <alias> BY <field>, ..." followed by the optional USING
    # and PARALLEL clauses.
    def to_s
      by_clauses = @sources.map { |source| "#{source.alias} BY #{@join_fields[source]}" }
      parts = ["JOIN #{by_clauses.join(', ')}"]
      parts << " USING \"#{@using.to_s}\"" if @using
      parts << " PARALLEL #{@parallel}" if @parallel
      parts.join
    end
  end
end
@@ -0,0 +1,13 @@
module Piglet
  # LIMIT statement over a single source relation.
  class Limit # :nodoc:
    include Relation

    # relation:: the relation being limited (becomes the only source)
    # n::        the value rendered after the relation's alias
    def initialize(relation, n)
      @sources = [relation]
      @n = n
    end

    # Renders "LIMIT <source alias> <n>".
    def to_s
      source = @sources.first
      "LIMIT #{source.alias} #{@n}"
    end
  end
end
@@ -0,0 +1,31 @@
module Piglet
  # LOAD statement: a path plus an optional load function and schema.
  class Load # :nodoc:
    include Relation
    include LoadAndStore

    # path::    rendered between single quotes after LOAD
    # options:: may contain +:using+ and +:schema+; +nil+ is treated as no
    #           options
    def initialize(path, options={})
      opts = options || {}
      @path = path
      @using = opts[:using]
      @schema = opts[:schema]
    end

    # Renders "LOAD '<path>'" followed by the optional USING and AS clauses.
    def to_s
      clauses = ["LOAD '#{@path}'"]
      clauses << " USING #{resolve_load_store_function(@using)}" if @using
      clauses << " AS (#{schema_string})" if @schema
      clauses.join
    end

  private

    # Comma separated schema fields. An enumerable field is rendered with its
    # parts joined by a colon (name:type); anything else is rendered as is.
    def schema_string
      columns = @schema.map do |field|
        field.is_a?(Enumerable) ? field.map { |part| part.to_s }.join(':') : field.to_s
      end
      columns.join(', ')
    end
  end
end
@@ -0,0 +1,16 @@
module Piglet
  # Shared by the statements that accept a +:using+ option: translates the
  # symbolic names of known load/store functions into their Pig names.
  module LoadAndStore # :nodoc:
    LOAD_STORE_FUNCTIONS = {
      :binary_serializer   => 'BinarySerializer',
      :binary_deserializer => 'BinaryDeserializer',
      :bin_storage         => 'BinStorage',
      :pig_storage         => 'PigStorage',
      :pig_dump            => 'PigDump',
      :text_loader         => 'TextLoader'
    }

    # Returns the Pig name registered for +name+, or +name+ converted to a
    # string when it is not one of the known functions.
    def resolve_load_store_function(name)
      LOAD_STORE_FUNCTIONS.fetch(name) { name.to_s }
    end
  end
end
@@ -0,0 +1,29 @@
module Piglet
  # ORDER statement over a single source relation.
  class Order # :nodoc:
    include Relation

    # relation:: the relation being ordered (becomes the only source)
    # fields::   a single field or an enumerable of fields; each field may
    #            itself be a [name, direction] pair
    # options::  may contain +:parallel+; +nil+ is treated as no options
    def initialize(relation, fields, options)
      options ||= {}
      @sources, @parallel = [relation], options[:parallel]
      @fields = fields.is_a?(Enumerable) ? fields : [fields]
    end

    # Renders "ORDER <alias> BY <field>, <field> DESC, ..." followed by
    # " PARALLEL <n>" when the +:parallel+ option was given.
    def to_s
      str = "ORDER #{@sources.first.alias} BY #{field_strings}"
      # The option was stored by the constructor but previously never
      # rendered, unlike the PARALLEL clause of Group.
      str << " PARALLEL #{@parallel}" if @parallel
      str
    end

  private

    # The comma separated list of order fields.
    def field_strings
      @fields.map { |f| field_string(f) }.join(', ')
    end

    # Renders a [name, direction] pair as "name DIRECTION" (direction in upper
    # case) and any other field through +to_s+.
    def field_string(f)
      if f.is_a?(Enumerable)
        "#{f[0]} #{f[1].to_s.upcase}"
      else
        f.to_s
      end
    end
  end
end
@@ -0,0 +1,177 @@
module Piglet
  # Mixin shared by everything that represents a Pig Latin relation. It
  # supplies the alias used when the relation is written out, the operators
  # that derive new relations from this one, and field access through
  # <code>relation.field_name</code> and <code>relation[index]</code>.
  module Relation
    # Method names Ruby itself probes for when it attempts an implicit
    # conversion (for example Array#flatten probes +to_ary+). They must never
    # be answered with a Field: the caller expects either "no such method" or
    # an object of the converted type and raises TypeError for anything else.
    IMPLICIT_CONVERSIONS = %w(to_ary to_str to_hash to_proc to_io to_int to_sym to_path to_regexp).freeze

    attr_reader :sources

    # The name this relation will get in Pig Latin. The name is generated the
    # first time it is asked for and is unique.
    def alias
      @alias ||= Relation.next_alias
    end

    # GROUP
    #
    #   x.group(:a)                           # => GROUP x BY a
    #   x.group(:a, :b, :c)                   # => GROUP x BY (a, b, c)
    #   x.group([:a, :b, :c], :parallel => 3) # => GROUP x BY (a, b, c) PARALLEL 3
    def group(*args)
      grouping, options = split_at_options(args)
      Group.new(self, [grouping].flatten, options)
    end

    # DISTINCT
    #
    #   x.distinct                 # => DISTINCT x
    #   x.distinct(:parallel => 5) # => DISTINCT x PARALLEL 5
    def distinct(options={})
      Distinct.new(self, options)
    end

    # COGROUP
    #
    #   x.cogroup(x => :a, y => :b)                 # => COGROUP x BY a, y BY b
    #   x.cogroup(x => :a, y => :b, z => :c)        # => COGROUP x BY a, y BY b, z BY c
    #   x.cogroup(x => [:a, :b], y => [:c, :d])     # => COGROUP x BY (a, b), y BY (c, d)
    #   x.cogroup(x => :a, y => [:b, :inner])       # => COGROUP x BY a, y BY b INNER
    #   x.cogroup(x => :a, y => :b, :parallel => 5) # => COGROUP x BY a, y BY b PARALLEL 5
    def cogroup(description)
      Cogroup.new(self, description)
    end

    # CROSS
    #
    #   x.cross(y)                      # => CROSS x, y
    #   x.cross(y, z, w)                # => CROSS x, y, z, w
    #   x.cross([y, z], :parallel => 5) # => CROSS x, y, z PARALLEL 5
    def cross(*args)
      relations, options = split_at_options(args)
      Cross.new(([self] + relations).flatten, options)
    end

    # FILTER
    #
    #   x.filter { |r| r.a == r.b }            # => FILTER x BY a == b
    #   x.filter { |r| r.a > r.b && r.c != 3 } # => FILTER x BY a > b AND c != 3
    def filter
      Filter.new(self, yield(self))
    end

    # FOREACH ... GENERATE
    #
    #   x.foreach { |r| r.a }            # => FOREACH x GENERATE a
    #   x.foreach { |r| [r.a, r.b] }     # => FOREACH x GENERATE a, b
    #   x.foreach { |r| r.a.max }        # => FOREACH x GENERATE MAX(a)
    #   x.foreach { |r| r.a.avg.as(:b) } # => FOREACH x GENERATE AVG(a) AS b
    #
    #--
    #
    # TODO: FOREACH a { b GENERATE c }
    def foreach
      Foreach.new(self, yield(self))
    end

    # JOIN
    #
    #   x.join(x => :a, y => :b)                        # => JOIN x BY a, y BY b
    #   x.join(x => :a, y => :b, z => :c)               # => JOIN x BY a, y BY b, z BY c
    #   x.join(x => :a, y => :b, :using => :replicated) # => JOIN x BY a, y BY b USING "replicated"
    #   x.join(x => :a, y => :b, :parallel => 5)        # => JOIN x BY a, y BY b PARALLEL 5
    def join(description)
      Join.new(self, description)
    end

    # LIMIT
    #
    #   x.limit(10) # => LIMIT x 10
    def limit(n)
      Limit.new(self, n)
    end

    # ORDER
    #
    #   x.order(:a)                      # => ORDER x BY a
    #   x.order(:a, :b)                  # => ORDER x BY a, b
    #   x.order([:a, :asc], [:b, :desc]) # => ORDER x BY a ASC, b DESC
    #   x.order(:a, :parallel => 5)      # => ORDER x BY a PARALLEL 5
    #
    #--
    #
    # NOTE: the syntax x.order(:a => :asc, :b => :desc) would be nice, but in
    # Ruby 1.8 the order of the keys cannot be guaranteed.
    def order(*args)
      fields, options = split_at_options(args)
      # Ruby 1.8 unwraps a one-element array here; on later versions the
      # splat assignment leaves the array as it is. Order accepts both shapes.
      fields = *fields
      Order.new(self, fields, options)
    end

    # SAMPLE
    #
    #   x.sample(5) # => SAMPLE x 5
    def sample(n)
      Sample.new(self, n)
    end

    # SPLIT
    #
    #   y, z = x.split { |r| [r.a <= 3, r.b > 4] } # => SPLIT x INTO y IF a <= 3, z IF b > 4
    def split
      Split.new(self, yield(self)).shards
    end

    # STREAM
    #
    # Not implemented yet: every call raises NotSupportedError. The intended
    # syntax is
    #
    #   x.stream(x, 'cut -f 3')                         # => STREAM x THROUGH `cut -f 3`
    #   x.stream([x, y], 'cut -f 3')                    # => STREAM x, y THROUGH `cut -f 3`
    #   x.stream(x, 'cut -f 3', :schema => [%w(a int)]) # => STREAM x THROUGH `cut -f 3` AS (a:int)
    #
    #--
    #
    # TODO: how to handle DEFINE'd commands?
    def stream(relations, command, options={})
      raise NotSupportedError
    end

    # UNION
    #
    #   x.union(y)    # => UNION x, y
    #   x.union(y, z) # => UNION x, y, z
    def union(*relations)
      Union.new(*([self] + relations))
    end

    # Turns any argument-less call to an unknown method whose name consists of
    # word characters only into a reference to the field of that name.
    #
    # Implicit conversion probes (see IMPLICIT_CONVERSIONS) are passed on to
    # +super+ instead. Answering them with a Field makes Array#flatten, which
    # #cross applies to a list of relations, raise TypeError on Ruby 1.9 and
    # later.
    def method_missing(name, *args)
      field_name = name.to_s
      if args.empty? && field_name =~ /\A\w+\z/ && !IMPLICIT_CONVERSIONS.include?(field_name)
        Field.new(name, self)
      else
        super
      end
    end

    # Positional field reference: <code>x[0]</code> is the field <code>$0</code>.
    def [](n)
      Field.new("\$#{n}", self)
    end

    # Relations are hashed by alias so they can be used as hash keys and set
    # members. Note that asking for the hash generates the alias if the
    # relation does not have one yet.
    def hash
      self.alias.hash
    end

    # Two relations are +eql?+ when they have the same alias.
    def eql?(other)
      other.is_a?(Relation) && other.alias == self.alias
    end

  private

    # Splits an argument list into [positional arguments, options]. A trailing
    # Hash is taken to be the options; without one the options are +nil+.
    def split_at_options(parameters)
      if parameters.last.is_a? Hash
        [parameters[0..-2], parameters.last]
      else
        [parameters, nil]
      end
    end

    # Returns the next unused alias ("relation_1", "relation_2", ...). The
    # counter lives on the Relation module itself. Being a singleton method it
    # is not affected by the +private+ above.
    def self.next_alias
      @counter ||= 0
      @counter += 1
      "relation_#{@counter}"
    end
  end
end
@@ -0,0 +1,13 @@
module Piglet
  # SAMPLE statement over a single source relation.
  class Sample # :nodoc:
    include Relation

    # relation:: the relation being sampled (becomes the only source)
    # n::        the value rendered after the relation's alias
    def initialize(relation, n)
      @sources = [relation]
      @n = n
    end

    # Renders "SAMPLE <source alias> <n>".
    def to_s
      source = @sources.first
      "SAMPLE #{source.alias} #{@n}"
    end
  end
end
@@ -0,0 +1,41 @@
module Piglet
  # SPLIT statement: divides one source relation into several shards, one per
  # expression.
  class Split # :nodoc:
    include Relation

    # relation::    the relation being split (becomes the only source)
    # expressions:: one condition per shard, in the order in which the shards
    #               are to be returned
    def initialize(relation, expressions)
      @sources, @expressions = [relation], expressions
      @shards = [ ]
      @shard_map = create_shards
    end

    # The shards, one per expression and in the same order as the expressions
    # were given, so that <code>y, z = x.split { ... }</code> pairs each
    # variable with the intended condition. A fresh array is returned on every
    # call.
    def shards
      @shards.dup
    end

    # Renders "SPLIT <source alias> INTO <shard> IF <expr>, ...".
    def to_s
      "SPLIT #{@sources.first.alias} INTO #{split_strings}"
    end

  private

    # Creates one RelationShard per expression. The shards are recorded in
    # @shards in creation order and the returned hash maps each shard to its
    # expression.
    #
    # The hash is filled entry by entry rather than through
    # Hash[*pairs.flatten]: flattening probes every element for +to_ary+,
    # which a relation's method_missing answers with a Field, and the order of
    # Hash#keys is not guaranteed on Ruby 1.8.
    def create_shards
      @expressions.inject({}) do |map, expr|
        shard = RelationShard.new(self)
        @shards << shard
        map[shard] = expr
        map
      end
    end

    # The "<shard alias> IF <expression>" clauses, comma separated.
    def split_strings
      @shards.map { |shard| "#{shard.alias} IF #{@shard_map[shard]}" }.join(', ')
    end
  end

  # One of the output relations of a Split. Its only source is the split that
  # created it and it renders as its own alias.
  class RelationShard # :nodoc:
    include Relation

    def initialize(split)
      @sources = [split]
    end

    def to_s
      self.alias
    end
  end
end
@@ -0,0 +1,17 @@
module Piglet
  # STORE statement: writes a relation to a path, optionally through a store
  # function.
  class Store # :nodoc:
    include LoadAndStore
    include Storing

    # relation:: the relation to store
    # path::     rendered between single quotes after INTO
    # options::  may contain +:using+; +nil+ is treated as no options, as in
    #            Load, Group and Order
    def initialize(relation, path, options={})
      options ||= {}
      @relation, @path, @using = relation, path, options[:using]
    end

    # Renders the statement produced by Storing#to_s followed by
    # " INTO '<path>'" and the optional USING clause.
    def to_s
      str = super
      str << " INTO '#{@path}'"
      str << " USING #{resolve_load_store_function(@using)}" if @using
      str
    end
  end
end
@@ -0,0 +1,13 @@
module Piglet
  # Mixin for statements that act on a single relation and are named after
  # the class that includes it.
  module Storing # :nodoc:
    attr_reader :relation

    # relation:: the relation the statement applies to
    def initialize(relation)
      @relation = relation
    end

    # Renders the including class's unqualified name in upper case, followed
    # by the alias of the relation.
    def to_s
      keyword = self.class.name.split(/::/).last.upcase
      "#{keyword} #{@relation.alias}"
    end
  end
end
@@ -0,0 +1,5 @@
module Piglet
  # Placeholder for the STREAM statement. It only mixes in Relation and
  # defines no constructor or rendering of its own.
  class Stream # :nodoc:
    include Relation
  end
end
@@ -0,0 +1,19 @@
module Piglet
  # UNION statement over any number of source relations.
  class Union # :nodoc:
    include Relation

    # relations:: the relations to combine; all of them become sources
    def initialize(*relations)
      @sources = relations
    end

    # Renders "UNION <alias>, <alias>, ...".
    def to_s
      aliases = @sources.map { |source| source.alias }
      "UNION #{aliases.join(', ')}"
    end
  end
end
data/lib/piglet.rb ADDED
@@ -0,0 +1,45 @@
# Top-level namespace of the library. Every constant that lives in its own
# file under piglet/ is registered for autoloading, so a file is only read
# the first time its constant is referenced.
module Piglet
  VERSION = '0.1.0'

  %w(
    assignment
    cogroup
    cross
    describe
    distinct
    dump
    explain
    field
    field_expression_functions
    field_function_expression
    field_infix_expression
    field_prefix_expression
    field_rename
    field_suffix_expression
    filter
    foreach
    group
    illustrate
    interpreter
    join
    limit
    load
    load_and_store
    order
    relation
    sample
    split
    store
    storing
    stream
    union
  ).each do |file_name|
    # field_infix_expression => :FieldInfixExpression
    constant_name = file_name.split('_').map { |part| part.capitalize }.join
    autoload constant_name.to_sym, "piglet/#{file_name}"
  end

  # Raised by operations that are declared but not implemented.
  class NotSupportedError < StandardError; end
end