piglet 0.1.2 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/README.rdoc +15 -0
- data/lib/piglet.rb +12 -4
- data/lib/piglet/field/binary_conditional.rb +4 -3
- data/lib/piglet/field/call_expression.rb +6 -6
- data/lib/piglet/field/field.rb +134 -0
- data/lib/piglet/field/infix_expression.rb +25 -2
- data/lib/piglet/field/literal.rb +19 -2
- data/lib/piglet/field/prefix_expression.rb +4 -2
- data/lib/piglet/field/reference.rb +8 -3
- data/lib/piglet/field/rename.rb +5 -3
- data/lib/piglet/field/suffix_expression.rb +4 -2
- data/lib/piglet/inout/load.rb +5 -0
- data/lib/piglet/interpreter.rb +2 -4
- data/lib/piglet/relation/cogroup.rb +15 -0
- data/lib/piglet/relation/cross.rb +5 -0
- data/lib/piglet/relation/foreach.rb +5 -0
- data/lib/piglet/relation/group.rb +16 -0
- data/lib/piglet/relation/join.rb +5 -0
- data/lib/piglet/relation/relation.rb +17 -2
- data/lib/piglet/relation/union.rb +1 -1
- data/lib/piglet/schema/bag.rb +21 -0
- data/lib/piglet/schema/tuple.rb +111 -0
- data/spec/piglet/field/binary_conditional_spec.rb +47 -0
- data/spec/piglet/field/field_spec.rb +103 -0
- data/spec/piglet/field/infix_expression_spec.rb +69 -0
- data/spec/piglet/field/literal_spec.rb +27 -0
- data/spec/piglet/field/reference_spec.rb +15 -1
- data/spec/piglet/interpreter_spec.rb +8 -395
- data/spec/piglet/relation/relation_spec.rb +4 -0
- data/spec/piglet/relation/union_spec.rb +37 -0
- data/spec/piglet/schema/tuple_spec.rb +121 -0
- data/spec/piglet_spec.rb +664 -0
- metadata +17 -3
- data/lib/piglet/field/operators.rb +0 -80
@@ -74,6 +74,10 @@ describe Piglet::Relation::Relation do
|
|
74
74
|
it 'returns fields with positional notation' do
|
75
75
|
@relation[1].to_s.should eql('$1')
|
76
76
|
end
|
77
|
+
|
78
|
+
it 'returns fields through a direct call to #field' do
|
79
|
+
@relation.field(:a).to_s.should eql('a')
|
80
|
+
end
|
77
81
|
end
|
78
82
|
|
79
83
|
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../spec_helper')
|
2
|
+
|
3
|
+
|
4
|
+
describe Piglet::Relation::Union do
|
5
|
+
|
6
|
+
before do
|
7
|
+
@relation1 = Object.new
|
8
|
+
@relation1.extend Piglet::Relation::Relation
|
9
|
+
@relation2 = mock('relation2')
|
10
|
+
@relation3 = mock('relation3')
|
11
|
+
@relation1.stub!(:alias).and_return('relation1')
|
12
|
+
@relation2.stub!(:alias).and_return('relation2')
|
13
|
+
@relation3.stub!(:alias).and_return('relation3')
|
14
|
+
end
|
15
|
+
|
16
|
+
describe '#to_s' do
|
17
|
+
it 'outputs the names of all the relations (given as separate arguments)' do
|
18
|
+
pig_latin = @relation1.union(@relation2, @relation3).to_s
|
19
|
+
pig_latin.should include('relation1')
|
20
|
+
pig_latin.should include('relation2')
|
21
|
+
pig_latin.should include('relation3')
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'outputs the names of all the relations (given as an array)' do
|
25
|
+
pig_latin = @relation1.union([@relation2, @relation3]).to_s
|
26
|
+
pig_latin.should include('relation1')
|
27
|
+
pig_latin.should include('relation2')
|
28
|
+
pig_latin.should include('relation3')
|
29
|
+
end
|
30
|
+
|
31
|
+
it 'outputs a UNION statement with the right number of relations' do
|
32
|
+
pig_latin = @relation1.union(@relation2, @relation3).to_s
|
33
|
+
pig_latin.should match(/UNION \w+, \w+, \w+/)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../spec_helper'
|
2
|
+
|
3
|
+
|
4
|
+
include Piglet::Schema
|
5
|
+
|
6
|
+
|
7
|
+
describe Tuple do
|
8
|
+
|
9
|
+
describe '.parse' do
|
10
|
+
it 'can parse a non-typed, single field description' do
|
11
|
+
tuple = Tuple.parse([:a])
|
12
|
+
tuple.field_names.should eql([:a])
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'can parse a non-typed, multiple field description' do
|
16
|
+
tuple = Tuple.parse([:a, :b, :c])
|
17
|
+
tuple.field_names.should eql([:a, :b, :c])
|
18
|
+
end
|
19
|
+
|
20
|
+
it 'can parse a typed, single field description' do
|
21
|
+
tuple = Tuple.parse([[:a, :chararray]])
|
22
|
+
tuple.field_names.should eql([:a])
|
23
|
+
tuple.field_type(:a).should eql(:chararray)
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'can parse a typed, multiple field description' do
|
27
|
+
tuple = Tuple.parse([[:a, :chararray], [:b, :double]])
|
28
|
+
tuple.field_names.should eql([:a, :b])
|
29
|
+
tuple.field_type(:a).should eql(:chararray)
|
30
|
+
tuple.field_type(:b).should eql(:double)
|
31
|
+
end
|
32
|
+
|
33
|
+
it 'can parse a mixed typed and non-typed field description' do
|
34
|
+
tuple = Tuple.parse([:a, [:b, :double]])
|
35
|
+
tuple.field_names.should eql([:a, :b])
|
36
|
+
tuple.field_type(:b).should eql(:double)
|
37
|
+
end
|
38
|
+
|
39
|
+
it 'defaults to :bytearray for untyped fields' do
|
40
|
+
tuple = Tuple.parse([:a])
|
41
|
+
tuple.field_type(:a).should eql(:bytearray)
|
42
|
+
end
|
43
|
+
|
44
|
+
it 'accepts a Tuple object as the type of a field' do
|
45
|
+
tuple = Tuple.parse([[:a, Tuple.parse([:c, :d])]])
|
46
|
+
tuple.field_type(:a).should be_a(Tuple)
|
47
|
+
tuple.field_type(:a).field_names.should eql([:c, :d])
|
48
|
+
end
|
49
|
+
|
50
|
+
it 'can parse a Tuple from a field typed as :tuple' do
|
51
|
+
tuple = Tuple.parse([[:a, :tuple, [:c, :d]]])
|
52
|
+
tuple.field_type(:a).should be_a(Tuple)
|
53
|
+
tuple.field_type(:a).field_names.should eql([:c, :d])
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'accepts a Bag object as the type of a field' do
|
57
|
+
tuple = Tuple.parse([[:a, Bag.new(Tuple.parse([:c, :d]))]])
|
58
|
+
tuple.field_type(:a).should be_a(Bag)
|
59
|
+
tuple.field_type(:a).field_names.should eql([:c, :d])
|
60
|
+
end
|
61
|
+
|
62
|
+
it 'can parse a Bag from a field typed as :bag' do
|
63
|
+
tuple = Tuple.parse([[:a, :bag, [:c, :d]]])
|
64
|
+
tuple.field_type(:a).should be_a(Bag)
|
65
|
+
tuple.field_type(:a).field_names.should eql([:c, :d])
|
66
|
+
end
|
67
|
+
|
68
|
+
it 'can parse a description that lacks field names (and fall back to making the fields accessible by index)' do
|
69
|
+
tuple = Tuple.parse([[nil, :chararray], [nil, :int]])
|
70
|
+
tuple.field_type(1).should eql(:int)
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
describe '#union' do
|
75
|
+
it 'creates a new tuple with the fields from two tuples' do
|
76
|
+
t1 = Tuple.parse([:a, :b, :c])
|
77
|
+
t2 = Tuple.parse([:d, :e, :f])
|
78
|
+
t3 = t1.union(t2)
|
79
|
+
t3.field_names.should eql([:a, :b, :c, :d, :e, :f])
|
80
|
+
end
|
81
|
+
|
82
|
+
it 'creates a new tuple with the fields from three tuples' do
|
83
|
+
t1 = Tuple.parse([:a, :b, :c])
|
84
|
+
t2 = Tuple.parse([:d, :e, :f])
|
85
|
+
t3 = Tuple.parse([:g, :h, :i])
|
86
|
+
t4 = t1.union(t2, t3)
|
87
|
+
t4.field_names.should eql([:a, :b, :c, :d, :e, :f, :g, :h, :i])
|
88
|
+
end
|
89
|
+
|
90
|
+
it 'creates a new tuple with the fields from three tuples (arguments as an array)' do
|
91
|
+
t1 = Tuple.parse([:a, :b, :c])
|
92
|
+
t2 = Tuple.parse([:d, :e, :f])
|
93
|
+
t3 = Tuple.parse([:g, :h, :i])
|
94
|
+
t4 = t1.union([t2, t3])
|
95
|
+
t4.field_names.should eql([:a, :b, :c, :d, :e, :f, :g, :h, :i])
|
96
|
+
end
|
97
|
+
|
98
|
+
it 'retains all the fields even if some have the same name' do
|
99
|
+
t1 = Tuple.parse([:a, :b, :c])
|
100
|
+
t2 = Tuple.parse([:b, :c, :d])
|
101
|
+
t3 = t1.union(t2)
|
102
|
+
t3.field_names.should eql([:a, :b, :c, :b, :c, :d])
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
describe '#to_s' do
|
107
|
+
it 'returns the schema string for a simple untyped schema' do
|
108
|
+
Tuple.parse([:a, :b]).to_s.should eql('(a:bytearray, b:bytearray)')
|
109
|
+
end
|
110
|
+
|
111
|
+
it 'returns the schema string for a simple typed schema' do
|
112
|
+
Tuple.parse([[:a, :chararray], [:b, :int]]).to_s.should eql('(a:chararray, b:int)')
|
113
|
+
end
|
114
|
+
|
115
|
+
it 'returns the schema string for a nested schema' do
|
116
|
+
description = [[:a, :tuple, [[:x, :int], [:y, :float]]], [:b, :bag, [[:w, :bytearray]]]]
|
117
|
+
Tuple.parse(description).to_s.should eql('(a:tuple (x:int, y:float), b:bag {w:bytearray})')
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
end
|
data/spec/piglet_spec.rb
CHANGED
@@ -3,5 +3,669 @@ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
|
3
3
|
|
4
4
|
describe Piglet do
|
5
5
|
|
6
|
+
before do
|
7
|
+
@interpreter = Piglet::Interpreter.new
|
8
|
+
end
|
9
|
+
|
10
|
+
context 'load & store operators:' do
|
11
|
+
describe 'LOAD' do
|
12
|
+
it 'outputs a LOAD statement' do
|
13
|
+
@interpreter.interpret { store(load('some/path'), 'out') }
|
14
|
+
@interpreter.to_pig_latin.should include("LOAD 'some/path'")
|
15
|
+
end
|
16
|
+
|
17
|
+
it 'outputs a LOAD statement without a USING clause if none specified' do
|
18
|
+
@interpreter.interpret { store(load('some/path'), 'out') }
|
19
|
+
@interpreter.to_pig_latin.should_not include('USING')
|
20
|
+
end
|
21
|
+
|
22
|
+
it 'outputs a LOAD statement with a USING clause with a specified function' do
|
23
|
+
@interpreter.interpret { store(load('some/path', :using => 'XYZ'), 'out') }
|
24
|
+
@interpreter.to_pig_latin.should include("LOAD 'some/path' USING XYZ;")
|
25
|
+
end
|
26
|
+
|
27
|
+
Piglet::Inout::StorageTypes::LOAD_STORE_FUNCTIONS.each do |symbolic_name, function|
|
28
|
+
it "knows that the load method :#{symbolic_name} means #{function}" do
|
29
|
+
@interpreter.interpret { store(load('some/path', :using => symbolic_name), 'out') }
|
30
|
+
@interpreter.to_pig_latin.should include("LOAD 'some/path' USING #{function};")
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
it 'outputs a LOAD statement with an AS clause' do
|
35
|
+
@interpreter.interpret { store(load('some/path', :schema => %w(a b c)), 'out') }
|
36
|
+
@interpreter.to_pig_latin.should include("LOAD 'some/path' AS (a, b, c);")
|
37
|
+
end
|
38
|
+
|
39
|
+
it 'outputs a LOAD statement with an AS clause with types' do
|
40
|
+
@interpreter.interpret { store(load('some/path', :schema => [:a, [:b, :chararray], :c]), 'out') }
|
41
|
+
@interpreter.to_pig_latin.should include("LOAD 'some/path' AS (a, b:chararray, c);")
|
42
|
+
end
|
43
|
+
|
44
|
+
it 'outputs a LOAD statement with an AS clause with types specified as both strings and symbols' do
|
45
|
+
@interpreter.interpret { store(load('some/path', :schema => [:a, %w(b chararray), :c]), 'out') }
|
46
|
+
@interpreter.to_pig_latin.should include("LOAD 'some/path' AS (a, b:chararray, c);")
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
describe 'STORE' do
|
51
|
+
it 'outputs a STORE statement' do
|
52
|
+
@interpreter.interpret { store(load('some/path'), 'out') }
|
53
|
+
@interpreter.to_pig_latin.should match(/STORE \w+ INTO 'out'/)
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'outputs a STORE statement without a USING clause if none specified' do
|
57
|
+
@interpreter.interpret { store(load('some/path'), 'out') }
|
58
|
+
@interpreter.to_pig_latin.should_not include("USING")
|
59
|
+
end
|
60
|
+
|
61
|
+
it 'outputs a STORE statement with a USING clause with a specified function' do
|
62
|
+
@interpreter.interpret { store(load('some/path'), 'out', :using => 'XYZ') }
|
63
|
+
@interpreter.to_pig_latin.should match(/STORE \w+ INTO 'out' USING XYZ/)
|
64
|
+
end
|
65
|
+
|
66
|
+
it 'knows that the load method :pig_storage means PigStorage' do
|
67
|
+
@interpreter.interpret { store(load('some/path'), 'out', :using => :pig_storage) }
|
68
|
+
@interpreter.to_pig_latin.should match(/STORE \w+ INTO 'out' USING PigStorage/)
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
describe 'DUMP' do
|
73
|
+
it 'outputs a DUMP statement' do
|
74
|
+
@interpreter.interpret { dump(load('some/path')) }
|
75
|
+
@interpreter.to_pig_latin.should match(/DUMP \w+/)
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
context 'diagnostic operators:' do
|
81
|
+
describe 'ILLUSTRATE' do
|
82
|
+
it 'outputs an ILLUSTRATE statement' do
|
83
|
+
@interpreter.interpret { illustrate(load('some/path')) }
|
84
|
+
@interpreter.to_pig_latin.should match(/ILLUSTRATE \w+/)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
describe 'DESCRIBE' do
|
89
|
+
it 'outputs a DESCRIBE statement' do
|
90
|
+
@interpreter.interpret { describe(load('some/path')) }
|
91
|
+
@interpreter.to_pig_latin.should match(/DESCRIBE \w+/)
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
describe 'EXPLAIN' do
|
96
|
+
it 'outputs an EXPLAIN statement' do
|
97
|
+
@interpreter.interpret { explain(load('some/path')) }
|
98
|
+
@interpreter.to_pig_latin.should match(/EXPLAIN \w+/)
|
99
|
+
end
|
100
|
+
|
101
|
+
it 'outputs an EXPLAIN statement without an alias' do
|
102
|
+
@interpreter.interpret { explain }
|
103
|
+
@interpreter.to_pig_latin.should match(/EXPLAIN;/)
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
context 'relation operators:' do
|
109
|
+
describe 'GROUP' do
|
110
|
+
it 'outputs a GROUP statement with one grouping field' do
|
111
|
+
@interpreter.interpret { store(load('in').group(:a), 'out') }
|
112
|
+
@interpreter.to_pig_latin.should match(/GROUP \w+ BY a/)
|
113
|
+
end
|
114
|
+
|
115
|
+
it 'outputs a GROUP statement with more than one grouping field' do
|
116
|
+
@interpreter.interpret { store(load('in').group(:a, :b, :c), 'out') }
|
117
|
+
@interpreter.to_pig_latin.should match(/GROUP \w+ BY \(a, b, c\)/)
|
118
|
+
end
|
119
|
+
|
120
|
+
it 'outputs a GROUP statement with a PARALLEL clause' do
|
121
|
+
@interpreter.interpret { store(load('in').group([:a, :b, :c], :parallel => 3), 'out') }
|
122
|
+
@interpreter.to_pig_latin.should match(/GROUP \w+ BY \(a, b, c\) PARALLEL 3/)
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
describe 'DISTINCT' do
|
127
|
+
it 'outputs a DISTINCT statement' do
|
128
|
+
@interpreter.interpret { store(load('in').distinct, 'out') }
|
129
|
+
@interpreter.to_pig_latin.should match(/DISTINCT \w+/)
|
130
|
+
end
|
131
|
+
|
132
|
+
it 'outputs a DISTINCT statement with a PARALLEL clause' do
|
133
|
+
@interpreter.interpret { store(load('in').distinct(:parallel => 4), 'out') }
|
134
|
+
@interpreter.to_pig_latin.should match(/DISTINCT \w+ PARALLEL 4/)
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
describe 'CROSS' do
|
139
|
+
it 'outputs a CROSS statement with two relations' do
|
140
|
+
@interpreter.interpret do
|
141
|
+
a = load('in1')
|
142
|
+
b = load('in2')
|
143
|
+
c = a.cross(b)
|
144
|
+
dump(c)
|
145
|
+
end
|
146
|
+
@interpreter.to_pig_latin.should match(/CROSS \w+, \w+/)
|
147
|
+
end
|
148
|
+
|
149
|
+
it 'outputs a CROSS statement with many relations' do
|
150
|
+
@interpreter.interpret do
|
151
|
+
a = load('in1')
|
152
|
+
b = load('in2')
|
153
|
+
c = load('in3')
|
154
|
+
d = load('in4')
|
155
|
+
e = a.cross(b, c, d)
|
156
|
+
dump(e)
|
157
|
+
end
|
158
|
+
@interpreter.to_pig_latin.should match(/CROSS \w+, \w+, \w+, \w+/)
|
159
|
+
end
|
160
|
+
|
161
|
+
it 'outputs a CROSS statement with a PARALLEL clause' do
|
162
|
+
@interpreter.interpret do
|
163
|
+
a = load('in1')
|
164
|
+
b = load('in2')
|
165
|
+
c = load('in3')
|
166
|
+
d = a.cross([b, c], :parallel => 4)
|
167
|
+
dump(d)
|
168
|
+
end
|
169
|
+
@interpreter.to_pig_latin.should match(/CROSS \w+, \w+, \w+ PARALLEL 4/)
|
170
|
+
end
|
171
|
+
end
|
172
|
+
|
173
|
+
describe 'UNION' do
|
174
|
+
it 'outputs a UNION statement with two relations' do
|
175
|
+
@interpreter.interpret do
|
176
|
+
a = load('in1')
|
177
|
+
b = load('in2')
|
178
|
+
c = a.union(b)
|
179
|
+
dump(c)
|
180
|
+
end
|
181
|
+
@interpreter.to_pig_latin.should match(/UNION \w+, \w+/)
|
182
|
+
end
|
183
|
+
|
184
|
+
it 'outputs a UNION statement with many relations' do
|
185
|
+
@interpreter.interpret do
|
186
|
+
a = load('in1')
|
187
|
+
b = load('in2')
|
188
|
+
c = load('in3')
|
189
|
+
d = load('in4')
|
190
|
+
e = a.union(b, c, d)
|
191
|
+
dump(e)
|
192
|
+
end
|
193
|
+
@interpreter.to_pig_latin.should match(/UNION \w+, \w+, \w+, \w+/)
|
194
|
+
end
|
195
|
+
end
|
196
|
+
|
197
|
+
describe 'SAMPLE' do
|
198
|
+
it 'outputs a SAMPLE statement' do
|
199
|
+
@interpreter.interpret { dump(load('in').sample(10)) }
|
200
|
+
@interpreter.to_pig_latin.should match(/SAMPLE \w+ 10/)
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
describe 'LIMIT' do
|
205
|
+
it 'outputs a LIMIT statement' do
|
206
|
+
@interpreter.interpret { dump(load('in').limit(42)) }
|
207
|
+
@interpreter.to_pig_latin.should match(/LIMIT \w+ 42/)
|
208
|
+
end
|
209
|
+
end
|
210
|
+
|
211
|
+
describe 'FOREACH … GENERATE' do
|
212
|
+
it 'outputs a FOREACH … GENERATE statement' do
|
213
|
+
@interpreter.interpret { dump(load('in').foreach { |r| :a }) }
|
214
|
+
@interpreter.to_pig_latin.should match(/FOREACH \w+ GENERATE a/)
|
215
|
+
end
|
216
|
+
|
217
|
+
it 'outputs a FOREACH … GENERATE statement with a list of fields' do
|
218
|
+
@interpreter.interpret { dump(load('in').foreach { |r| [:a, :b, :c] }) }
|
219
|
+
@interpreter.to_pig_latin.should match(/FOREACH \w+ GENERATE a, b, c/)
|
220
|
+
end
|
221
|
+
|
222
|
+
it 'outputs a FOREACH … GENERATE statement with fields resolved from the relation' do
|
223
|
+
@interpreter.interpret { dump(load('in').foreach { |r| [r.a, r.b, r.c] }) }
|
224
|
+
@interpreter.to_pig_latin.should match(/FOREACH (\w+) GENERATE a, b, c/)
|
225
|
+
end
|
226
|
+
|
227
|
+
it 'outputs a FOREACH … GENERATE statement with fields resolved from the relation with positional syntax' do
|
228
|
+
@interpreter.interpret { dump(load('in').foreach { |r| [r[0], r[1], r[2]] }) }
|
229
|
+
@interpreter.to_pig_latin.should match(/FOREACH (\w+) GENERATE \$0, \$1, \$2/)
|
230
|
+
end
|
231
|
+
|
232
|
+
it 'outputs a FOREACH … GENERATE statement with aggregate functions applied to the fields' do
|
233
|
+
@interpreter.interpret { dump(load('in').foreach { |r| [r.a.max, r.b.min, r.c.avg] }) }
|
234
|
+
@interpreter.to_pig_latin.should match(/FOREACH (\w+) GENERATE MAX\(a\), MIN\(b\), AVG\(c\)/)
|
235
|
+
end
|
236
|
+
|
237
|
+
it 'outputs a FOREACH … GENERATE statement with fields that access inner fields' do
|
238
|
+
@interpreter.interpret { dump(load('in').foreach { |r| [r.a.b, r.b.c, r.c.d] }) }
|
239
|
+
@interpreter.to_pig_latin.should match(/FOREACH (\w+) GENERATE a.b, b.c, c.d/)
|
240
|
+
end
|
241
|
+
|
242
|
+
it 'outputs a FOREACH … GENERATE statement that includes field aliasing' do
|
243
|
+
@interpreter.interpret { dump(load('in').foreach { |r| [r.a.b.as(:c), r.a.b.as(:d)] }) }
|
244
|
+
@interpreter.to_pig_latin.should match(/FOREACH (\w+) GENERATE a.b AS c, a.b AS d/)
|
245
|
+
end
|
246
|
+
end
|
247
|
+
|
248
|
+
describe 'FILTER' do
|
249
|
+
it 'outputs a FILTER statement' do
|
250
|
+
@interpreter.interpret { dump(load('in').filter { |r| r.a == 3 }) }
|
251
|
+
@interpreter.to_pig_latin.should match(/FILTER \w+ BY a == 3/)
|
252
|
+
end
|
253
|
+
|
254
|
+
it 'outputs a FILTER statement with a complex test' do
|
255
|
+
@interpreter.interpret { dump(load('in').filter { |r| (r.a > r.b).and(r.c.ne(3)) }) }
|
256
|
+
@interpreter.to_pig_latin.should match(/FILTER \w+ BY \(a > b\) AND \(c != 3\)/)
|
257
|
+
end
|
258
|
+
end
|
259
|
+
|
260
|
+
describe 'SPLIT' do
|
261
|
+
it 'outputs a SPLIT statement' do
|
262
|
+
@interpreter.interpret do
|
263
|
+
a, b = load('in').split { |r| [r.a >= 0, r.a < 0]}
|
264
|
+
dump(a)
|
265
|
+
dump(b)
|
266
|
+
end
|
267
|
+
@interpreter.to_pig_latin.should match(/SPLIT \w+ INTO \w+ IF a >= 0, \w+ IF a < 0/)
|
268
|
+
end
|
269
|
+
end
|
270
|
+
|
271
|
+
describe 'ORDER' do
|
272
|
+
it 'outputs an ORDER statement' do
|
273
|
+
@interpreter.interpret { dump(load('in').order(:a)) }
|
274
|
+
@interpreter.to_pig_latin.should match(/ORDER \w+ BY a/)
|
275
|
+
end
|
276
|
+
|
277
|
+
it 'outputs an ORDER statement with multiple fields' do
|
278
|
+
@interpreter.interpret { dump(load('in').order(:a, :b)) }
|
279
|
+
@interpreter.to_pig_latin.should match(/ORDER \w+ BY a, b/)
|
280
|
+
end
|
281
|
+
|
282
|
+
it 'outputs an ORDER statement with ASC and DESC' do
|
283
|
+
@interpreter.interpret { dump(load('in').order([:a, :asc], [:b, :desc])) }
|
284
|
+
@interpreter.to_pig_latin.should match(/ORDER \w+ BY a ASC, b DESC/)
|
285
|
+
end
|
286
|
+
end
|
287
|
+
|
288
|
+
describe 'JOIN' do
|
289
|
+
it 'outputs a JOIN statement' do
|
290
|
+
@interpreter.interpret do
|
291
|
+
a = load('in1')
|
292
|
+
b = load('in2')
|
293
|
+
c = a.join(a => :x, b => :y)
|
294
|
+
dump(c)
|
295
|
+
end
|
296
|
+
@interpreter.to_pig_latin.should match(/JOIN \w+ BY \w+, \w+ BY \w+/)
|
297
|
+
end
|
298
|
+
|
299
|
+
it 'outputs a JOIN statement with a PARALLEL clause' do
|
300
|
+
@interpreter.interpret do
|
301
|
+
a = load('in1')
|
302
|
+
b = load('in2')
|
303
|
+
c = a.join(a => :x, b => :y, :parallel => 5)
|
304
|
+
dump(c)
|
305
|
+
end
|
306
|
+
@interpreter.to_pig_latin.should match(/JOIN \w+ BY \w+, \w+ BY \w+ PARALLEL 5/)
|
307
|
+
end
|
308
|
+
|
309
|
+
it 'outputs a JOIN statement with a USING clause' do
|
310
|
+
@interpreter.interpret do
|
311
|
+
a = load('in1')
|
312
|
+
b = load('in2')
|
313
|
+
c = a.join(a => :x, b => :y, :using => :replicated)
|
314
|
+
dump(c)
|
315
|
+
end
|
316
|
+
@interpreter.to_pig_latin.should match(/JOIN \w+ BY \w+, \w+ BY \w+ USING "replicated"/)
|
317
|
+
end
|
318
|
+
end
|
319
|
+
|
320
|
+
describe 'COGROUP' do
|
321
|
+
it 'outputs a COGROUP statement' do
|
322
|
+
@interpreter.interpret do
|
323
|
+
a = load('in1')
|
324
|
+
b = load('in2')
|
325
|
+
c = a.cogroup(a => :x, b => :y)
|
326
|
+
dump(c)
|
327
|
+
end
|
328
|
+
@interpreter.to_pig_latin.should match(/COGROUP \w+ BY \w+, \w+ BY \w+/)
|
329
|
+
end
|
330
|
+
|
331
|
+
it 'outputs a COGROUP statement with multiple join fields' do
|
332
|
+
@interpreter.interpret do
|
333
|
+
a = load('in1')
|
334
|
+
b = load('in2')
|
335
|
+
c = a.cogroup(a => :x, b => [:y, :z, :w])
|
336
|
+
dump(c)
|
337
|
+
end
|
338
|
+
@interpreter.to_pig_latin.should match(/\w+ BY \(y, z, w\)/)
|
339
|
+
end
|
340
|
+
|
341
|
+
it 'outputs a COGROUP statement with a PARALLEL clause' do
|
342
|
+
@interpreter.interpret do
|
343
|
+
a = load('in1')
|
344
|
+
b = load('in2')
|
345
|
+
c = a.cogroup(a => :x, b => :y, :parallel => 5)
|
346
|
+
dump(c)
|
347
|
+
end
|
348
|
+
@interpreter.to_pig_latin.should match(/COGROUP \w+ BY \w+, \w+ BY \w+ PARALLEL 5/)
|
349
|
+
end
|
350
|
+
|
351
|
+
it 'outputs a COGROUP statement with INNER and OUTER' do
|
352
|
+
@interpreter.interpret do
|
353
|
+
a = load('in1')
|
354
|
+
b = load('in2')
|
355
|
+
c = a.cogroup(a => [:x, :inner], b => [:y, :outer])
|
356
|
+
dump(c)
|
357
|
+
end
|
358
|
+
@interpreter.to_pig_latin.should match(/\w+ BY x INNER/)
|
359
|
+
@interpreter.to_pig_latin.should match(/\w+ BY y OUTER/)
|
360
|
+
end
|
361
|
+
end
|
362
|
+
end
|
363
|
+
|
364
|
+
context 'aliasing & multiple statements' do
|
365
|
+
it 'aliases the loaded relation and uses the same alias in the STORE statement' do
|
366
|
+
@interpreter.interpret { store(load('in'), 'out') }
|
367
|
+
@interpreter.to_pig_latin.should match(/(\w+) = LOAD 'in';\nSTORE \1 INTO 'out';/)
|
368
|
+
end
|
369
|
+
|
370
|
+
it 'aliases both a loaded relation and a grouped relation and uses the latter in the STORE statement' do
|
371
|
+
@interpreter.interpret { store(load('in', :schema => [:a]).group(:a), 'out') }
|
372
|
+
@interpreter.to_pig_latin.should match(/(\w+) = LOAD 'in' AS \(a\);\n(\w+) = GROUP \1 BY a;\nSTORE \2 INTO 'out';/)
|
373
|
+
end
|
374
|
+
|
375
|
+
it 'aliases a whole row of statements' do
|
376
|
+
@interpreter.interpret do
|
377
|
+
a = load('in', :schema => [:a])
|
378
|
+
b = a.group(:a)
|
379
|
+
c = b.group(:a)
|
380
|
+
d = c.group(:a)
|
381
|
+
store(d, 'out')
|
382
|
+
end
|
383
|
+
@interpreter.to_pig_latin.should match(/(\w+) = LOAD 'in' AS \(a\);\n(\w+) = GROUP \1 BY a;\n(\w+) = GROUP \2 BY a;\n(\w+) = GROUP \3 BY a;\nSTORE \4 INTO 'out';/)
|
384
|
+
end
|
385
|
+
|
386
|
+
it 'outputs the statements for an alias only once, regardless of home many times it is stored' do
|
387
|
+
@interpreter.interpret do
|
388
|
+
a = load('in')
|
389
|
+
b = a.distinct
|
390
|
+
store(b, 'out1')
|
391
|
+
store(b, 'out2')
|
392
|
+
end
|
393
|
+
@interpreter.to_pig_latin.should match(/(\w+) = LOAD 'in';\n(\w+) = DISTINCT \1;\nSTORE \2 INTO 'out1';\nSTORE \2 INTO 'out2';/)
|
394
|
+
end
|
395
|
+
end
|
396
|
+
|
397
|
+
context 'long and complex scripts' do
|
398
|
+
before do
|
399
|
+
@interpreter.interpret do
|
400
|
+
sessions = load('sessions', :schema => [
|
401
|
+
[:ad_id, :chararray],
|
402
|
+
[:site, :chararray],
|
403
|
+
[:size, :chararray],
|
404
|
+
[:name, :chararray],
|
405
|
+
[:impression, :int],
|
406
|
+
[:engagement, :int],
|
407
|
+
[:click_thru, :int]
|
408
|
+
])
|
409
|
+
%w(site size name).each do |dimension|
|
410
|
+
result = sessions.group(:ad_id, dimension).foreach do |r|
|
411
|
+
[
|
412
|
+
r[0].ad_id.as(:ad_id),
|
413
|
+
literal(dimension).as(:dimension),
|
414
|
+
r[0].field(dimension).as(:value),
|
415
|
+
r[1].exposure.sum.as(:exposures),
|
416
|
+
r[1].impression.sum.as(:impressions),
|
417
|
+
r[1].engagement.sum.as(:engagements),
|
418
|
+
r[1].click_thru.sum.as(:click_thrus)
|
419
|
+
]
|
420
|
+
end
|
421
|
+
store(result, "report_metrics-#{dimension}")
|
422
|
+
end
|
423
|
+
end
|
424
|
+
@output = @interpreter.to_pig_latin
|
425
|
+
end
|
426
|
+
|
427
|
+
it 'outputs the correct number of LOAD statements' do
|
428
|
+
@output.scan(/LOAD/).size.should eql(1)
|
429
|
+
end
|
430
|
+
|
431
|
+
it 'outputs the correct number of STORE statements' do
|
432
|
+
@output.scan(/STORE/).size.should eql(3)
|
433
|
+
end
|
434
|
+
|
435
|
+
it 'doesn\'t assign to the same relation twice' do
|
436
|
+
@assignments = @output.scan(/^(\w+)(?=\s*=)/).flatten
|
437
|
+
@assignments.uniq.should eql(@assignments)
|
438
|
+
end
|
439
|
+
end
|
440
|
+
|
441
|
+
context 'schemas' do
|
442
|
+
it 'knows the schema of a relation returned by #load, with types' do
|
443
|
+
schema = catch(:schema) do
|
444
|
+
@interpreter.interpret do
|
445
|
+
schema = load('in', :schema => [[:a, :chararray], [:b, :chararray]]).schema
|
446
|
+
throw :schema, schema
|
447
|
+
end
|
448
|
+
end
|
449
|
+
schema.field_names.should eql([:a, :b])
|
450
|
+
schema.field_type(:a).should eql(:chararray)
|
451
|
+
end
|
452
|
+
|
453
|
+
it 'knows the schema of a relation returned by #load, without types' do
|
454
|
+
schema = catch(:schema) do
|
455
|
+
@interpreter.interpret do
|
456
|
+
schema = load('in', :schema => [:a, :b]).schema
|
457
|
+
throw :schema, schema
|
458
|
+
end
|
459
|
+
end
|
460
|
+
schema.field_names.should eql([:a, :b])
|
461
|
+
schema.field_type(:a).should eql(:bytearray)
|
462
|
+
end
|
463
|
+
|
464
|
+
it 'knows the schema of a relation returned by #load, with and without types' do
|
465
|
+
schema = catch(:schema) do
|
466
|
+
@interpreter.interpret do
|
467
|
+
schema = load('in', :schema => [[:a, :float], :b]).schema
|
468
|
+
throw :schema, schema
|
469
|
+
end
|
470
|
+
end
|
471
|
+
schema.field_names.should eql([:a, :b])
|
472
|
+
schema.field_type(:a).should eql(:float)
|
473
|
+
end
|
474
|
+
|
475
|
+
it 'does not know anything about the schema of a relation returned by #load if no schema was given' do
|
476
|
+
relation = catch(:relation) do
|
477
|
+
@interpreter.interpret do
|
478
|
+
throw :relation, load('in')
|
479
|
+
end
|
480
|
+
end
|
481
|
+
relation.schema.should be_nil
|
482
|
+
end
|
483
|
+
|
484
|
+
it 'knows the schema of a relation derived through non-schema-changing operations' do
|
485
|
+
schema = catch(:schema) do
|
486
|
+
@interpreter.interpret do
|
487
|
+
relation = load('in', :schema => [[:a, :float], [:b, :int]]).limit(3).sample(0.1).distinct.order(:a)
|
488
|
+
throw :schema, relation.schema
|
489
|
+
end
|
490
|
+
end
|
491
|
+
schema.field_names.should eql([:a, :b])
|
492
|
+
schema.field_type(:a).should eql(:float)
|
493
|
+
schema.field_type(:b).should eql(:int)
|
494
|
+
end
|
495
|
+
|
496
|
+
it 'knows the schema of a relation grouped on one field' do
|
497
|
+
relation = catch(:relation) do
|
498
|
+
@interpreter.interpret do
|
499
|
+
relation = load('in', :schema => [[:a, :float], [:b, :int]]).group(:a)
|
500
|
+
throw :relation, relation
|
501
|
+
end
|
502
|
+
end
|
503
|
+
source_relation_name = relation.sources.first.alias.to_sym
|
504
|
+
relation.schema.field_names.should eql([:group, source_relation_name])
|
505
|
+
relation.schema.field_type(:group).should eql(:float)
|
506
|
+
relation.schema.field_type(source_relation_name).should be_a(Piglet::Schema::Bag)
|
507
|
+
relation.schema.field_type(source_relation_name).field_names.should eql([:a, :b])
|
508
|
+
relation.schema.field_type(source_relation_name).field_type(:a).should eql(:float)
|
509
|
+
end
|
510
|
+
|
511
|
+
it 'knows the schema of a relation grouped on more than one field' do
|
512
|
+
relation = catch(:relation) do
|
513
|
+
@interpreter.interpret do
|
514
|
+
relation = load('in', :schema => [[:a, :float], [:b, :int]]).group(:a, :b)
|
515
|
+
throw :relation, relation
|
516
|
+
end
|
517
|
+
end
|
518
|
+
source_relation_name = relation.sources.first.alias.to_sym
|
519
|
+
relation.schema.field_names.should eql([:group, source_relation_name])
|
520
|
+
relation.schema.field_type(:group).should be_a(Piglet::Schema::Tuple)
|
521
|
+
relation.schema.field_type(:group).field_names.should eql([:a, :b])
|
522
|
+
relation.schema.field_type(:group).field_type(:a).should eql(:float)
|
523
|
+
relation.schema.field_type(source_relation_name).should be_a(Piglet::Schema::Bag)
|
524
|
+
relation.schema.field_type(source_relation_name).field_names.should eql([:a, :b])
|
525
|
+
relation.schema.field_type(source_relation_name).field_type(:b).should eql(:int)
|
526
|
+
end
|
527
|
+
|
528
|
+
it 'knows the schema of a relation cross joined with itself' do
|
529
|
+
schema = catch(:schema) do
|
530
|
+
@interpreter.interpret do
|
531
|
+
relation = load('in', :schema => [[:a, :float], [:b, :int]])
|
532
|
+
relation = relation.cross(relation)
|
533
|
+
throw :schema, relation.schema
|
534
|
+
end
|
535
|
+
end
|
536
|
+
schema.field_names.should eql([:a, :b, :a, :b])
|
537
|
+
schema.field_type(:a).should eql(:float)
|
538
|
+
schema.field_type(:b).should eql(:int)
|
539
|
+
end
|
540
|
+
|
541
|
+
it 'knows the schema of a relation cross joined with another' do
|
542
|
+
schema = catch(:schema) do
|
543
|
+
@interpreter.interpret do
|
544
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
545
|
+
relation2 = load('in2', :schema => [[:c, :chararray], [:d, :double]])
|
546
|
+
relation3 = relation1.cross(relation2)
|
547
|
+
throw :schema, relation3.schema
|
548
|
+
end
|
549
|
+
end
|
550
|
+
schema.field_names.should eql([:a, :b, :c, :d])
|
551
|
+
schema.field_type(:a).should eql(:float)
|
552
|
+
schema.field_type(:b).should eql(:int)
|
553
|
+
schema.field_type(:c).should eql(:chararray)
|
554
|
+
schema.field_type(:d).should eql(:double)
|
555
|
+
end
|
556
|
+
|
557
|
+
it 'knows the schema of a relation joined with another' do
|
558
|
+
schema = catch(:schema) do
|
559
|
+
@interpreter.interpret do
|
560
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
561
|
+
relation2 = load('in2', :schema => [[:c, :int], [:d, :double]])
|
562
|
+
relation3 = relation1.join(relation1 => :b, relation2 => :c)
|
563
|
+
throw :schema, relation3.schema
|
564
|
+
end
|
565
|
+
end
|
566
|
+
schema.field_names.should eql([:a, :b, :c, :d])
|
567
|
+
schema.field_type(:a).should eql(:float)
|
568
|
+
schema.field_type(:b).should eql(:int)
|
569
|
+
schema.field_type(:c).should eql(:int)
|
570
|
+
schema.field_type(:d).should eql(:double)
|
571
|
+
end
|
572
|
+
|
573
|
+
it 'knows the schema of a relation cogrouped with another' do
|
574
|
+
relation1, relation2, relation3 = catch(:relations) do
|
575
|
+
@interpreter.interpret do
|
576
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
577
|
+
relation2 = load('in2', :schema => [[:c, :int], [:d, :double]])
|
578
|
+
relation3 = relation1.cogroup(relation1 => :b, relation2 => :c)
|
579
|
+
throw :relations, [relation1, relation2, relation3]
|
580
|
+
end
|
581
|
+
end
|
582
|
+
relation3.schema.field_names.should eql([:group, relation1.alias.to_sym, relation2.alias.to_sym])
|
583
|
+
relation3.schema.field_type(relation1.alias.to_sym).should be_a(Piglet::Schema::Bag)
|
584
|
+
relation3.schema.field_type(relation2.alias.to_sym).should be_a(Piglet::Schema::Bag)
|
585
|
+
relation3.schema.field_type(relation1.alias.to_sym).field_names.should eql([:a, :b])
|
586
|
+
relation3.schema.field_type(relation2.alias.to_sym).field_names.should eql([:c, :d])
|
587
|
+
end
|
588
|
+
|
589
|
+
it 'knows the schema of a relation projection' do
|
590
|
+
schema = catch(:schema) do
|
591
|
+
@interpreter.interpret do
|
592
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
593
|
+
relation2 = relation1.foreach { |r| [r.a] }
|
594
|
+
throw :schema, relation2.schema
|
595
|
+
end
|
596
|
+
end
|
597
|
+
schema.field_names.should eql([:a])
|
598
|
+
schema.field_type(:a).should eql(:float)
|
599
|
+
end
|
600
|
+
|
601
|
+
it 'knows the schema of a relation projection containing a call to MAX' do
|
602
|
+
schema = catch(:schema) do
|
603
|
+
@interpreter.interpret do
|
604
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
605
|
+
relation2 = relation1.foreach { |r| [r.a.max] }
|
606
|
+
throw :schema, relation2.schema
|
607
|
+
end
|
608
|
+
end
|
609
|
+
schema.field_names.should eql([nil])
|
610
|
+
schema.field_type(0).should eql(:float)
|
611
|
+
end
|
612
|
+
|
613
|
+
it 'knows the schema of a relation projection containing a call to COUNT' do
|
614
|
+
schema = catch(:schema) do
|
615
|
+
@interpreter.interpret do
|
616
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
617
|
+
relation2 = relation1.foreach { |r| [r.a.count] }
|
618
|
+
throw :schema, relation2.schema
|
619
|
+
end
|
620
|
+
end
|
621
|
+
schema.field_names.should eql([nil])
|
622
|
+
schema.field_type(0).should eql(:long)
|
623
|
+
end
|
624
|
+
|
625
|
+
it 'knows the schema of a relation projection containing a field rename' do
|
626
|
+
schema = catch(:schema) do
|
627
|
+
@interpreter.interpret do
|
628
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
629
|
+
relation2 = relation1.foreach { |r| [r.a.count.as(:x)] }
|
630
|
+
throw :schema, relation2.schema
|
631
|
+
end
|
632
|
+
end
|
633
|
+
schema.field_names.should eql([:x])
|
634
|
+
end
|
635
|
+
|
636
|
+
it 'knows the schema of a relation projection containing a literal string' do
|
637
|
+
schema = catch(:schema) do
|
638
|
+
@interpreter.interpret do
|
639
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
640
|
+
relation2 = relation1.foreach { |r| [literal('blipp')] }
|
641
|
+
throw :schema, relation2.schema
|
642
|
+
end
|
643
|
+
end
|
644
|
+
schema.field_type(0).should eql(:chararray)
|
645
|
+
end
|
646
|
+
|
647
|
+
it 'knows the schema of a relation projection containing a literal integer' do
|
648
|
+
schema = catch(:schema) do
|
649
|
+
@interpreter.interpret do
|
650
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
651
|
+
relation2 = relation1.foreach { |r| [literal(4)] }
|
652
|
+
throw :schema, relation2.schema
|
653
|
+
end
|
654
|
+
end
|
655
|
+
schema.field_type(0).should eql(:int)
|
656
|
+
end
|
657
|
+
|
658
|
+
it 'knows the schema of a relation projection containing a literal float' do
|
659
|
+
schema = catch(:schema) do
|
660
|
+
@interpreter.interpret do
|
661
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
662
|
+
relation2 = relation1.foreach { |r| [literal(3.14)] }
|
663
|
+
throw :schema, relation2.schema
|
664
|
+
end
|
665
|
+
end
|
666
|
+
schema.field_type(0).should eql(:double)
|
667
|
+
end
|
668
|
+
|
669
|
+
end
|
6
670
|
|
7
671
|
end
|