piglet 0.1.2 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/README.rdoc +15 -0
- data/lib/piglet.rb +12 -4
- data/lib/piglet/field/binary_conditional.rb +4 -3
- data/lib/piglet/field/call_expression.rb +6 -6
- data/lib/piglet/field/field.rb +134 -0
- data/lib/piglet/field/infix_expression.rb +25 -2
- data/lib/piglet/field/literal.rb +19 -2
- data/lib/piglet/field/prefix_expression.rb +4 -2
- data/lib/piglet/field/reference.rb +8 -3
- data/lib/piglet/field/rename.rb +5 -3
- data/lib/piglet/field/suffix_expression.rb +4 -2
- data/lib/piglet/inout/load.rb +5 -0
- data/lib/piglet/interpreter.rb +2 -4
- data/lib/piglet/relation/cogroup.rb +15 -0
- data/lib/piglet/relation/cross.rb +5 -0
- data/lib/piglet/relation/foreach.rb +5 -0
- data/lib/piglet/relation/group.rb +16 -0
- data/lib/piglet/relation/join.rb +5 -0
- data/lib/piglet/relation/relation.rb +17 -2
- data/lib/piglet/relation/union.rb +1 -1
- data/lib/piglet/schema/bag.rb +21 -0
- data/lib/piglet/schema/tuple.rb +111 -0
- data/spec/piglet/field/binary_conditional_spec.rb +47 -0
- data/spec/piglet/field/field_spec.rb +103 -0
- data/spec/piglet/field/infix_expression_spec.rb +69 -0
- data/spec/piglet/field/literal_spec.rb +27 -0
- data/spec/piglet/field/reference_spec.rb +15 -1
- data/spec/piglet/interpreter_spec.rb +8 -395
- data/spec/piglet/relation/relation_spec.rb +4 -0
- data/spec/piglet/relation/union_spec.rb +37 -0
- data/spec/piglet/schema/tuple_spec.rb +121 -0
- data/spec/piglet_spec.rb +664 -0
- metadata +17 -3
- data/lib/piglet/field/operators.rb +0 -80
data/.gitignore
CHANGED
data/README.rdoc
CHANGED
@@ -209,6 +209,21 @@ and then use them just as any other operator:
|
|
209
209
|
|
210
210
|
nifty, huh?
|
211
211
|
|
212
|
+
=== Types & schemas
|
213
|
+
|
214
|
+
Piglet knows the schema of relations, so you can do something else that Pig lacks: introspection. This lets you do things like this code, which counts the unique values of all fields in a relation:
|
215
|
+
|
216
|
+
relation = load('in', :schema => [:a, :b, :c])
|
217
|
+
relation.schema.field_names.each do |field|
|
218
|
+
grouped = relation.group(field)
|
219
|
+
counted = grouped.foreach { |r| [r[1].count] }
|
220
|
+
store(counted, "out-#{field}")
|
221
|
+
end
|
222
|
+
|
223
|
+
This feature obviously only works if you have specified a schema in the call to #load.
|
224
|
+
|
225
|
+
There are currently many limitations to this feature, so use it with caution. Since the schema support isn't completely reliable, Piglet does not enforce schemas, and it does not warn you if you try to access a field that doesn't exist. If it did, it would probably be more annoying and limiting than it would be worth.
|
226
|
+
|
212
227
|
== Limitations
|
213
228
|
|
214
229
|
The aim is to support most of Pig Latin, but currently there are some limitations.
|
data/lib/piglet.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
# :main: README.rdoc
|
2
2
|
module Piglet # :nodoc:
|
3
|
-
VERSION = '0.1.2'
|
3
|
+
VERSION = '0.2.0'
|
4
|
+
|
5
|
+
class PigletError < StandardError; end
|
6
|
+
class NotSupportedError < PigletError; end
|
4
7
|
|
5
8
|
autoload :Interpreter, 'piglet/interpreter'
|
6
9
|
|
@@ -37,12 +40,17 @@ module Piglet # :nodoc:
|
|
37
40
|
autoload :CallExpression, 'piglet/field/call_expression'
|
38
41
|
autoload :InfixExpression, 'piglet/field/infix_expression'
|
39
42
|
autoload :Literal, 'piglet/field/literal'
|
40
|
-
autoload :
|
43
|
+
autoload :Field, 'piglet/field/field'
|
41
44
|
autoload :PrefixExpression, 'piglet/field/prefix_expression'
|
42
45
|
autoload :Reference, 'piglet/field/reference'
|
43
46
|
autoload :Rename, 'piglet/field/rename'
|
44
47
|
autoload :SuffixExpression, 'piglet/field/suffix_expression'
|
45
48
|
end
|
46
|
-
|
47
|
-
|
49
|
+
|
50
|
+
module Schema
|
51
|
+
autoload :Bag, 'piglet/schema/bag'
|
52
|
+
autoload :Tuple, 'piglet/schema/tuple'
|
53
|
+
|
54
|
+
class SchemaError < PigletError; end
|
55
|
+
end
|
48
56
|
end
|
@@ -1,12 +1,13 @@
|
|
1
1
|
module Piglet
|
2
2
|
module Field
|
3
|
-
include Operators
|
4
|
-
|
5
3
|
class BinaryConditional
|
4
|
+
include Field
|
5
|
+
|
6
6
|
def initialize(test, if_true, if_false)
|
7
7
|
@test, @if_true, @if_false = test, if_true, if_false
|
8
|
+
@type = expression_type(@if_true)
|
8
9
|
end
|
9
|
-
|
10
|
+
|
10
11
|
def to_s
|
11
12
|
"(#{@test} ? #{@if_true} : #{@if_false})"
|
12
13
|
end
|
@@ -1,20 +1,20 @@
|
|
1
1
|
module Piglet
|
2
2
|
module Field
|
3
3
|
class CallExpression # :nodoc:
|
4
|
-
include
|
4
|
+
include Field
|
5
5
|
|
6
|
-
def initialize(
|
6
|
+
def initialize(function_name, inner_expression, options=nil)
|
7
7
|
options ||= {}
|
8
|
-
@
|
9
|
-
@
|
8
|
+
@function_name, @inner_expression = function_name, inner_expression
|
9
|
+
@type = options[:type] || inner_expression.type
|
10
10
|
end
|
11
|
-
|
11
|
+
|
12
12
|
def simple?
|
13
13
|
false
|
14
14
|
end
|
15
15
|
|
16
16
|
def to_s
|
17
|
-
"#{@
|
17
|
+
"#{@function_name}(#{@inner_expression})"
|
18
18
|
end
|
19
19
|
end
|
20
20
|
end
|
@@ -0,0 +1,134 @@
|
|
1
|
+
module Piglet
|
2
|
+
module Field
|
3
|
+
module Field # :nodoc:
|
4
|
+
SYMBOLIC_OPERATORS = [:==, :>, :<, :>=, :<=, :%, :+, :-, :*, :/]
|
5
|
+
FUNCTIONS = [:avg, :count, :max, :min, :size, :sum, :tokenize]
|
6
|
+
|
7
|
+
attr_reader :name, :type
|
8
|
+
|
9
|
+
FUNCTIONS.each do |fun|
|
10
|
+
define_method(fun) do
|
11
|
+
CallExpression.new(fun.to_s.upcase, self, :type => function_return_type(fun, self.type))
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
def empty?
|
16
|
+
CallExpression.new('IsEmpty', self, :type => :boolean)
|
17
|
+
end
|
18
|
+
|
19
|
+
def diff(other)
|
20
|
+
raise NotSupportedError
|
21
|
+
end
|
22
|
+
|
23
|
+
def as(new_name)
|
24
|
+
Rename.new(new_name, self)
|
25
|
+
end
|
26
|
+
|
27
|
+
def not
|
28
|
+
PrefixExpression.new('NOT', self, true, :type => :boolean)
|
29
|
+
end
|
30
|
+
|
31
|
+
def null?
|
32
|
+
SuffixExpression.new('is null', self, :type => :boolean)
|
33
|
+
end
|
34
|
+
|
35
|
+
def not_null?
|
36
|
+
SuffixExpression.new('is not null', self, :type => :boolean)
|
37
|
+
end
|
38
|
+
|
39
|
+
def cast(type)
|
40
|
+
PrefixExpression.new("(#{type.to_s})", self, true, :type => type.to_sym)
|
41
|
+
end
|
42
|
+
|
43
|
+
def matches(pattern)
|
44
|
+
regex_options_pattern = /^\(\?.+?:(.*)\)$/
|
45
|
+
pattern = pattern.to_s.sub(regex_options_pattern, '\1') if pattern.is_a?(Regexp) && pattern.to_s =~ regex_options_pattern
|
46
|
+
InfixExpression.new('matches', self, "'#{pattern.to_s}'", :type => :boolean)
|
47
|
+
end
|
48
|
+
|
49
|
+
def neg
|
50
|
+
PrefixExpression.new('-', self, false, :type => self.type)
|
51
|
+
end
|
52
|
+
|
53
|
+
def ne(other)
|
54
|
+
InfixExpression.new('!=', self, other, :type => :boolean)
|
55
|
+
end
|
56
|
+
|
57
|
+
def and(other)
|
58
|
+
InfixExpression.new('AND', self, other, :type => :boolean)
|
59
|
+
end
|
60
|
+
|
61
|
+
def or(other)
|
62
|
+
InfixExpression.new('OR', self, other, :type => :boolean)
|
63
|
+
end
|
64
|
+
|
65
|
+
SYMBOLIC_OPERATORS.each do |op|
|
66
|
+
define_method(op) do |other|
|
67
|
+
InfixExpression.new(op.to_s, self, other, :type => symbolic_operator_return_type(op, self, other))
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
protected
|
72
|
+
|
73
|
+
def parenthesise(expr)
|
74
|
+
if expr.respond_to?(:simple?) && ! expr.simple?
|
75
|
+
"(#{expr})"
|
76
|
+
else
|
77
|
+
expr.to_s
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
def escape(str)
|
82
|
+
str.gsub(/("|'|\\)/) { |m| "\\#{$1}" }
|
83
|
+
end
|
84
|
+
|
85
|
+
def function_return_type(function_name, expression_type)
|
86
|
+
case function_name
|
87
|
+
when :avg, :sum
|
88
|
+
case expression_type
|
89
|
+
when :int, :long
|
90
|
+
:long
|
91
|
+
when :float, :double, :bytearray
|
92
|
+
:double
|
93
|
+
else
|
94
|
+
nil
|
95
|
+
end
|
96
|
+
when :count, :size
|
97
|
+
:long
|
98
|
+
when :max, :min
|
99
|
+
expression_type
|
100
|
+
when :tokenize
|
101
|
+
:bag
|
102
|
+
else
|
103
|
+
nil
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
def symbolic_operator_return_type(operator, left_expression, right_expression)
|
108
|
+
case operator
|
109
|
+
when :==, :>, :<, :>=, :<=
|
110
|
+
:boolean
|
111
|
+
when :%
|
112
|
+
:int
|
113
|
+
else # :+, :-, :*, :/
|
114
|
+
nil
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
118
|
+
def expression_type(expression)
|
119
|
+
case expression
|
120
|
+
when Field
|
121
|
+
expression.type
|
122
|
+
when Integer
|
123
|
+
:int
|
124
|
+
when Numeric
|
125
|
+
:float
|
126
|
+
when true, false
|
127
|
+
:boolean
|
128
|
+
else
|
129
|
+
nil
|
130
|
+
end
|
131
|
+
end
|
132
|
+
end
|
133
|
+
end
|
134
|
+
end
|
@@ -1,10 +1,16 @@
|
|
1
1
|
module Piglet
|
2
2
|
module Field
|
3
3
|
class InfixExpression # :nodoc:
|
4
|
-
include
|
4
|
+
include Field
|
5
5
|
|
6
|
-
def initialize(operator, left_expression, right_expression)
|
6
|
+
def initialize(operator, left_expression, right_expression, options=nil)
|
7
|
+
options ||= {}
|
7
8
|
@operator, @left_expression, @right_expression = operator, left_expression, right_expression
|
9
|
+
if options[:type]
|
10
|
+
@type = options[:type]
|
11
|
+
else
|
12
|
+
@type = determine_type(@left_expression, @right_expression)
|
13
|
+
end
|
8
14
|
end
|
9
15
|
|
10
16
|
def simple?
|
@@ -14,6 +20,23 @@ module Piglet
|
|
14
20
|
def to_s
|
15
21
|
"#{parenthesise(@left_expression)} #{@operator} #{parenthesise(@right_expression)}"
|
16
22
|
end
|
23
|
+
|
24
|
+
private
|
25
|
+
|
26
|
+
def determine_type(left, right)
|
27
|
+
left_type = expression_type(left)
|
28
|
+
right_type = expression_type(right)
|
29
|
+
|
30
|
+
if left_type == :double || right_type == :double
|
31
|
+
:double
|
32
|
+
elsif left_type == :float || right_type == :float
|
33
|
+
:float
|
34
|
+
elsif left_type == :long || right_type == :long
|
35
|
+
:long
|
36
|
+
else
|
37
|
+
left_type
|
38
|
+
end
|
39
|
+
end
|
17
40
|
end
|
18
41
|
end
|
19
42
|
end
|
data/lib/piglet/field/literal.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
module Piglet
|
2
2
|
module Field
|
3
3
|
class Literal
|
4
|
-
include
|
4
|
+
include Field
|
5
5
|
|
6
|
-
def initialize(obj)
|
6
|
+
def initialize(obj, options=nil)
|
7
|
+
options ||= {}
|
7
8
|
@obj = obj
|
9
|
+
@type = options[:type] || literal_type(obj)
|
8
10
|
end
|
9
11
|
|
10
12
|
def to_s
|
@@ -15,6 +17,21 @@ module Piglet
|
|
15
17
|
"'#{escape(@obj.to_s)}'"
|
16
18
|
end
|
17
19
|
end
|
20
|
+
|
21
|
+
private
|
22
|
+
|
23
|
+
def literal_type(obj)
|
24
|
+
case obj
|
25
|
+
when String
|
26
|
+
:chararray
|
27
|
+
when Integer
|
28
|
+
:int
|
29
|
+
when Numeric
|
30
|
+
:double
|
31
|
+
else
|
32
|
+
:bytearray
|
33
|
+
end
|
34
|
+
end
|
18
35
|
end
|
19
36
|
end
|
20
37
|
end
|
@@ -1,10 +1,12 @@
|
|
1
1
|
module Piglet
|
2
2
|
module Field
|
3
3
|
class PrefixExpression # :nodoc:
|
4
|
-
include
|
4
|
+
include Field
|
5
5
|
|
6
|
-
def initialize(operator, expression, space_between=true)
|
6
|
+
def initialize(operator, expression, space_between=true, options=nil)
|
7
|
+
options ||= {}
|
7
8
|
@operator, @expression, @space_between = operator, expression, space_between
|
9
|
+
@type = options[:type] || expression.type
|
8
10
|
end
|
9
11
|
|
10
12
|
def simple?
|
@@ -1,28 +1,33 @@
|
|
1
1
|
module Piglet
|
2
2
|
module Field
|
3
3
|
class Reference # :nodoc:
|
4
|
-
include
|
4
|
+
include Field
|
5
5
|
|
6
6
|
def initialize(name, relation=nil, options=nil)
|
7
7
|
options ||= {}
|
8
8
|
@name, @parent = name, relation
|
9
9
|
@explicit_ancestry = options[:explicit_ancestry] || false
|
10
|
+
@type = options[:type]
|
10
11
|
end
|
11
12
|
|
12
13
|
def simple?
|
13
14
|
true
|
14
15
|
end
|
15
16
|
|
17
|
+
def field(name)
|
18
|
+
Reference.new(name, self, :explicit_ancestry => true)
|
19
|
+
end
|
20
|
+
|
16
21
|
def method_missing(name, *args)
|
17
22
|
if name.to_s =~ /^\w+$/ && args.empty?
|
18
|
-
|
23
|
+
field(name)
|
19
24
|
else
|
20
25
|
super
|
21
26
|
end
|
22
27
|
end
|
23
28
|
|
24
29
|
def [](n)
|
25
|
-
|
30
|
+
field("\$#{n}")
|
26
31
|
end
|
27
32
|
|
28
33
|
def to_s
|
data/lib/piglet/field/rename.rb
CHANGED
@@ -1,12 +1,14 @@
|
|
1
1
|
module Piglet
|
2
2
|
module Field
|
3
3
|
class Rename # :nodoc:
|
4
|
+
attr_reader :name, :type
|
5
|
+
|
4
6
|
def initialize(new_name, field_expression)
|
5
|
-
@
|
7
|
+
@name, @field_expression, @type = new_name, field_expression, field_expression.type
|
6
8
|
end
|
7
|
-
|
9
|
+
|
8
10
|
def to_s
|
9
|
-
"#{@field_expression} AS #{@
|
11
|
+
"#{@field_expression} AS #{@name}"
|
10
12
|
end
|
11
13
|
end
|
12
14
|
end
|
@@ -1,10 +1,12 @@
|
|
1
1
|
module Piglet
|
2
2
|
module Field
|
3
3
|
class SuffixExpression # :nodoc:
|
4
|
-
include
|
4
|
+
include Field
|
5
5
|
|
6
|
-
def initialize(operator, expression)
|
6
|
+
def initialize(operator, expression, options=nil)
|
7
|
+
options ||= {}
|
7
8
|
@operator, @expression = operator, expression
|
9
|
+
@type = options[:type] || expression.type
|
8
10
|
end
|
9
11
|
|
10
12
|
def simple?
|
data/lib/piglet/inout/load.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
module Piglet
|
2
2
|
module Inout
|
3
3
|
class Load # :nodoc:
|
4
|
+
include Piglet::Relation::Relation
|
4
5
|
include StorageTypes
|
5
6
|
|
6
7
|
def initialize(path, options={})
|
@@ -8,6 +9,10 @@ module Piglet
|
|
8
9
|
@path, @using, @schema = path, options[:using], options[:schema]
|
9
10
|
end
|
10
11
|
|
12
|
+
def schema
|
13
|
+
Piglet::Schema::Tuple.parse(@schema) if @schema
|
14
|
+
end
|
15
|
+
|
11
16
|
def to_s
|
12
17
|
str = "LOAD '#{@path}'"
|
13
18
|
str << " USING #{resolve_load_store_function(@using)}" if @using
|
data/lib/piglet/interpreter.rb
CHANGED
@@ -27,7 +27,6 @@ module Piglet
|
|
27
27
|
unless store.relation.nil?
|
28
28
|
assignments(store.relation, handled_relations).each do |assignment|
|
29
29
|
statements << assignment
|
30
|
-
handled_relations << assignment.target
|
31
30
|
end
|
32
31
|
end
|
33
32
|
statements << store
|
@@ -52,9 +51,7 @@ module Piglet
|
|
52
51
|
# NOTE: the syntax load('path', :schema => {:a => :chararray, :b => :int})
|
53
52
|
# would be nice, but the order of the keys can't be guaranteed in Ruby 1.8.
|
54
53
|
def load(path, options={})
|
55
|
-
|
56
|
-
load.extend Piglet::Relation::Relation
|
57
|
-
load
|
54
|
+
Inout::Load.new(path, options)
|
58
55
|
end
|
59
56
|
|
60
57
|
# STORE
|
@@ -116,6 +113,7 @@ module Piglet
|
|
116
113
|
def assignments(relation, ignore_set)
|
117
114
|
return [] if ignore_set.include?(relation)
|
118
115
|
assignment = Assignment.new(relation)
|
116
|
+
ignore_set << relation
|
119
117
|
if relation.sources
|
120
118
|
(relation.sources.map { |source| assignments(source, ignore_set) } + [assignment]).flatten
|
121
119
|
else
|