piglet 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/README.rdoc +15 -0
- data/lib/piglet.rb +12 -4
- data/lib/piglet/field/binary_conditional.rb +4 -3
- data/lib/piglet/field/call_expression.rb +6 -6
- data/lib/piglet/field/field.rb +134 -0
- data/lib/piglet/field/infix_expression.rb +25 -2
- data/lib/piglet/field/literal.rb +19 -2
- data/lib/piglet/field/prefix_expression.rb +4 -2
- data/lib/piglet/field/reference.rb +8 -3
- data/lib/piglet/field/rename.rb +5 -3
- data/lib/piglet/field/suffix_expression.rb +4 -2
- data/lib/piglet/inout/load.rb +5 -0
- data/lib/piglet/interpreter.rb +2 -4
- data/lib/piglet/relation/cogroup.rb +15 -0
- data/lib/piglet/relation/cross.rb +5 -0
- data/lib/piglet/relation/foreach.rb +5 -0
- data/lib/piglet/relation/group.rb +16 -0
- data/lib/piglet/relation/join.rb +5 -0
- data/lib/piglet/relation/relation.rb +17 -2
- data/lib/piglet/relation/union.rb +1 -1
- data/lib/piglet/schema/bag.rb +21 -0
- data/lib/piglet/schema/tuple.rb +111 -0
- data/spec/piglet/field/binary_conditional_spec.rb +47 -0
- data/spec/piglet/field/field_spec.rb +103 -0
- data/spec/piglet/field/infix_expression_spec.rb +69 -0
- data/spec/piglet/field/literal_spec.rb +27 -0
- data/spec/piglet/field/reference_spec.rb +15 -1
- data/spec/piglet/interpreter_spec.rb +8 -395
- data/spec/piglet/relation/relation_spec.rb +4 -0
- data/spec/piglet/relation/union_spec.rb +37 -0
- data/spec/piglet/schema/tuple_spec.rb +121 -0
- data/spec/piglet_spec.rb +664 -0
- metadata +17 -3
- data/lib/piglet/field/operators.rb +0 -80
@@ -74,6 +74,10 @@ describe Piglet::Relation::Relation do
|
|
74
74
|
it 'returns fields with positional notation' do
|
75
75
|
@relation[1].to_s.should eql('$1')
|
76
76
|
end
|
77
|
+
|
78
|
+
it 'returns fields through a direct call to #field' do
|
79
|
+
@relation.field(:a).to_s.should eql('a')
|
80
|
+
end
|
77
81
|
end
|
78
82
|
|
79
83
|
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../spec_helper')
|
2
|
+
|
3
|
+
|
4
|
+
describe Piglet::Relation::Union do
|
5
|
+
|
6
|
+
before do
|
7
|
+
@relation1 = Object.new
|
8
|
+
@relation1.extend Piglet::Relation::Relation
|
9
|
+
@relation2 = mock('relation2')
|
10
|
+
@relation3 = mock('relation3')
|
11
|
+
@relation1.stub!(:alias).and_return('relation1')
|
12
|
+
@relation2.stub!(:alias).and_return('relation2')
|
13
|
+
@relation3.stub!(:alias).and_return('relation3')
|
14
|
+
end
|
15
|
+
|
16
|
+
describe '#to_s' do
|
17
|
+
it 'outputs the names of all the relations (given as separate arguments)' do
|
18
|
+
pig_latin = @relation1.union(@relation2, @relation3).to_s
|
19
|
+
pig_latin.should include('relation1')
|
20
|
+
pig_latin.should include('relation2')
|
21
|
+
pig_latin.should include('relation3')
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'outputs the names of all the relations (given as an array)' do
|
25
|
+
pig_latin = @relation1.union([@relation2, @relation3]).to_s
|
26
|
+
pig_latin.should include('relation1')
|
27
|
+
pig_latin.should include('relation2')
|
28
|
+
pig_latin.should include('relation3')
|
29
|
+
end
|
30
|
+
|
31
|
+
it 'outputs a UNION statement with the right number of relations' do
|
32
|
+
pig_latin = @relation1.union(@relation2, @relation3).to_s
|
33
|
+
pig_latin.should match(/UNION \w+, \w+, \w+/)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../spec_helper'
|
2
|
+
|
3
|
+
|
4
|
+
include Piglet::Schema
|
5
|
+
|
6
|
+
|
7
|
+
describe Tuple do
|
8
|
+
|
9
|
+
describe '.parse' do
|
10
|
+
it 'can parse a non-typed, single field description' do
|
11
|
+
tuple = Tuple.parse([:a])
|
12
|
+
tuple.field_names.should eql([:a])
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'can parse a non-typed, multiple field description' do
|
16
|
+
tuple = Tuple.parse([:a, :b, :c])
|
17
|
+
tuple.field_names.should eql([:a, :b, :c])
|
18
|
+
end
|
19
|
+
|
20
|
+
it 'can parse a typed, single field description' do
|
21
|
+
tuple = Tuple.parse([[:a, :chararray]])
|
22
|
+
tuple.field_names.should eql([:a])
|
23
|
+
tuple.field_type(:a).should eql(:chararray)
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'can parse a typed, multiple field description' do
|
27
|
+
tuple = Tuple.parse([[:a, :chararray], [:b, :double]])
|
28
|
+
tuple.field_names.should eql([:a, :b])
|
29
|
+
tuple.field_type(:a).should eql(:chararray)
|
30
|
+
tuple.field_type(:b).should eql(:double)
|
31
|
+
end
|
32
|
+
|
33
|
+
it 'can parse a mixed typed and non-typed field description' do
|
34
|
+
tuple = Tuple.parse([:a, [:b, :double]])
|
35
|
+
tuple.field_names.should eql([:a, :b])
|
36
|
+
tuple.field_type(:b).should eql(:double)
|
37
|
+
end
|
38
|
+
|
39
|
+
it 'defaults to :bytearray for untyped fields' do
|
40
|
+
tuple = Tuple.parse([:a])
|
41
|
+
tuple.field_type(:a).should eql(:bytearray)
|
42
|
+
end
|
43
|
+
|
44
|
+
it 'accepts a Tuple object as the type of a field' do
|
45
|
+
tuple = Tuple.parse([[:a, Tuple.parse([:c, :d])]])
|
46
|
+
tuple.field_type(:a).should be_a(Tuple)
|
47
|
+
tuple.field_type(:a).field_names.should eql([:c, :d])
|
48
|
+
end
|
49
|
+
|
50
|
+
it 'can parse a Tuple from a field typed as :tuple' do
|
51
|
+
tuple = Tuple.parse([[:a, :tuple, [:c, :d]]])
|
52
|
+
tuple.field_type(:a).should be_a(Tuple)
|
53
|
+
tuple.field_type(:a).field_names.should eql([:c, :d])
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'accepts a Bag object as the type of a field' do
|
57
|
+
tuple = Tuple.parse([[:a, Bag.new(Tuple.parse([:c, :d]))]])
|
58
|
+
tuple.field_type(:a).should be_a(Bag)
|
59
|
+
tuple.field_type(:a).field_names.should eql([:c, :d])
|
60
|
+
end
|
61
|
+
|
62
|
+
it 'can parse a Bag from a field typed as :bag' do
|
63
|
+
tuple = Tuple.parse([[:a, :bag, [:c, :d]]])
|
64
|
+
tuple.field_type(:a).should be_a(Bag)
|
65
|
+
tuple.field_type(:a).field_names.should eql([:c, :d])
|
66
|
+
end
|
67
|
+
|
68
|
+
it 'can parse a description that lacks field names (and fall back to making the fields accessible by index)' do
|
69
|
+
tuple = Tuple.parse([[nil, :chararray], [nil, :int]])
|
70
|
+
tuple.field_type(1).should eql(:int)
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
describe '#union' do
|
75
|
+
it 'creates a new tuple with the fields from two tuples' do
|
76
|
+
t1 = Tuple.parse([:a, :b, :c])
|
77
|
+
t2 = Tuple.parse([:d, :e, :f])
|
78
|
+
t3 = t1.union(t2)
|
79
|
+
t3.field_names.should eql([:a, :b, :c, :d, :e, :f])
|
80
|
+
end
|
81
|
+
|
82
|
+
it 'creates a new tuple with the fields from three tuples' do
|
83
|
+
t1 = Tuple.parse([:a, :b, :c])
|
84
|
+
t2 = Tuple.parse([:d, :e, :f])
|
85
|
+
t3 = Tuple.parse([:g, :h, :i])
|
86
|
+
t4 = t1.union(t2, t3)
|
87
|
+
t4.field_names.should eql([:a, :b, :c, :d, :e, :f, :g, :h, :i])
|
88
|
+
end
|
89
|
+
|
90
|
+
it 'creates a new tuple with the fields from three tuples (arguments as an array)' do
|
91
|
+
t1 = Tuple.parse([:a, :b, :c])
|
92
|
+
t2 = Tuple.parse([:d, :e, :f])
|
93
|
+
t3 = Tuple.parse([:g, :h, :i])
|
94
|
+
t4 = t1.union([t2, t3])
|
95
|
+
t4.field_names.should eql([:a, :b, :c, :d, :e, :f, :g, :h, :i])
|
96
|
+
end
|
97
|
+
|
98
|
+
it 'retains all the fields even if some have the same name' do
|
99
|
+
t1 = Tuple.parse([:a, :b, :c])
|
100
|
+
t2 = Tuple.parse([:b, :c, :d])
|
101
|
+
t3 = t1.union(t2)
|
102
|
+
t3.field_names.should eql([:a, :b, :c, :b, :c, :d])
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
describe '#to_s' do
|
107
|
+
it 'returns the schema string for a simple untyped schema' do
|
108
|
+
Tuple.parse([:a, :b]).to_s.should eql('(a:bytearray, b:bytearray)')
|
109
|
+
end
|
110
|
+
|
111
|
+
it 'returns the schema string for a simple typed schema' do
|
112
|
+
Tuple.parse([[:a, :chararray], [:b, :int]]).to_s.should eql('(a:chararray, b:int)')
|
113
|
+
end
|
114
|
+
|
115
|
+
it 'returns the schema string for a nested schema' do
|
116
|
+
description = [[:a, :tuple, [[:x, :int], [:y, :float]]], [:b, :bag, [[:w, :bytearray]]]]
|
117
|
+
Tuple.parse(description).to_s.should eql('(a:tuple (x:int, y:float), b:bag {w:bytearray})')
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
end
|
data/spec/piglet_spec.rb
CHANGED
@@ -3,5 +3,669 @@ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
|
3
3
|
|
4
4
|
describe Piglet do
|
5
5
|
|
6
|
+
before do
|
7
|
+
@interpreter = Piglet::Interpreter.new
|
8
|
+
end
|
9
|
+
|
10
|
+
context 'load & store operators:' do
|
11
|
+
describe 'LOAD' do
|
12
|
+
it 'outputs a LOAD statement' do
|
13
|
+
@interpreter.interpret { store(load('some/path'), 'out') }
|
14
|
+
@interpreter.to_pig_latin.should include("LOAD 'some/path'")
|
15
|
+
end
|
16
|
+
|
17
|
+
it 'outputs a LOAD statement without a USING clause if none specified' do
|
18
|
+
@interpreter.interpret { store(load('some/path'), 'out') }
|
19
|
+
@interpreter.to_pig_latin.should_not include('USING')
|
20
|
+
end
|
21
|
+
|
22
|
+
it 'outputs a LOAD statement with a USING clause with a specified function' do
|
23
|
+
@interpreter.interpret { store(load('some/path', :using => 'XYZ'), 'out') }
|
24
|
+
@interpreter.to_pig_latin.should include("LOAD 'some/path' USING XYZ;")
|
25
|
+
end
|
26
|
+
|
27
|
+
Piglet::Inout::StorageTypes::LOAD_STORE_FUNCTIONS.each do |symbolic_name, function|
|
28
|
+
it "knows that the load method :#{symbolic_name} means #{function}" do
|
29
|
+
@interpreter.interpret { store(load('some/path', :using => symbolic_name), 'out') }
|
30
|
+
@interpreter.to_pig_latin.should include("LOAD 'some/path' USING #{function};")
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
it 'outputs a LOAD statement with an AS clause' do
|
35
|
+
@interpreter.interpret { store(load('some/path', :schema => %w(a b c)), 'out') }
|
36
|
+
@interpreter.to_pig_latin.should include("LOAD 'some/path' AS (a, b, c);")
|
37
|
+
end
|
38
|
+
|
39
|
+
it 'outputs a LOAD statement with an AS clause with types' do
|
40
|
+
@interpreter.interpret { store(load('some/path', :schema => [:a, [:b, :chararray], :c]), 'out') }
|
41
|
+
@interpreter.to_pig_latin.should include("LOAD 'some/path' AS (a, b:chararray, c);")
|
42
|
+
end
|
43
|
+
|
44
|
+
it 'outputs a LOAD statement with an AS clause with types specified as both strings and symbols' do
|
45
|
+
@interpreter.interpret { store(load('some/path', :schema => [:a, %w(b chararray), :c]), 'out') }
|
46
|
+
@interpreter.to_pig_latin.should include("LOAD 'some/path' AS (a, b:chararray, c);")
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
describe 'STORE' do
|
51
|
+
it 'outputs a STORE statement' do
|
52
|
+
@interpreter.interpret { store(load('some/path'), 'out') }
|
53
|
+
@interpreter.to_pig_latin.should match(/STORE \w+ INTO 'out'/)
|
54
|
+
end
|
55
|
+
|
56
|
+
it 'outputs a STORE statement without a USING clause if none specified' do
|
57
|
+
@interpreter.interpret { store(load('some/path'), 'out') }
|
58
|
+
@interpreter.to_pig_latin.should_not include("USING")
|
59
|
+
end
|
60
|
+
|
61
|
+
it 'outputs a STORE statement with a USING clause with a specified function' do
|
62
|
+
@interpreter.interpret { store(load('some/path'), 'out', :using => 'XYZ') }
|
63
|
+
@interpreter.to_pig_latin.should match(/STORE \w+ INTO 'out' USING XYZ/)
|
64
|
+
end
|
65
|
+
|
66
|
+
it 'knows that the store method :pig_storage means PigStorage' do
|
67
|
+
@interpreter.interpret { store(load('some/path'), 'out', :using => :pig_storage) }
|
68
|
+
@interpreter.to_pig_latin.should match(/STORE \w+ INTO 'out' USING PigStorage/)
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
describe 'DUMP' do
|
73
|
+
it 'outputs a DUMP statement' do
|
74
|
+
@interpreter.interpret { dump(load('some/path')) }
|
75
|
+
@interpreter.to_pig_latin.should match(/DUMP \w+/)
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
context 'diagnostic operators:' do
|
81
|
+
describe 'ILLUSTRATE' do
|
82
|
+
it 'outputs an ILLUSTRATE statement' do
|
83
|
+
@interpreter.interpret { illustrate(load('some/path')) }
|
84
|
+
@interpreter.to_pig_latin.should match(/ILLUSTRATE \w+/)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
describe 'DESCRIBE' do
|
89
|
+
it 'outputs a DESCRIBE statement' do
|
90
|
+
@interpreter.interpret { describe(load('some/path')) }
|
91
|
+
@interpreter.to_pig_latin.should match(/DESCRIBE \w+/)
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
describe 'EXPLAIN' do
|
96
|
+
it 'outputs an EXPLAIN statement' do
|
97
|
+
@interpreter.interpret { explain(load('some/path')) }
|
98
|
+
@interpreter.to_pig_latin.should match(/EXPLAIN \w+/)
|
99
|
+
end
|
100
|
+
|
101
|
+
it 'outputs an EXPLAIN statement without an alias' do
|
102
|
+
@interpreter.interpret { explain }
|
103
|
+
@interpreter.to_pig_latin.should match(/EXPLAIN;/)
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
context 'relation operators:' do
|
109
|
+
describe 'GROUP' do
|
110
|
+
it 'outputs a GROUP statement with one grouping field' do
|
111
|
+
@interpreter.interpret { store(load('in').group(:a), 'out') }
|
112
|
+
@interpreter.to_pig_latin.should match(/GROUP \w+ BY a/)
|
113
|
+
end
|
114
|
+
|
115
|
+
it 'outputs a GROUP statement with more than one grouping field' do
|
116
|
+
@interpreter.interpret { store(load('in').group(:a, :b, :c), 'out') }
|
117
|
+
@interpreter.to_pig_latin.should match(/GROUP \w+ BY \(a, b, c\)/)
|
118
|
+
end
|
119
|
+
|
120
|
+
it 'outputs a GROUP statement with a PARALLEL clause' do
|
121
|
+
@interpreter.interpret { store(load('in').group([:a, :b, :c], :parallel => 3), 'out') }
|
122
|
+
@interpreter.to_pig_latin.should match(/GROUP \w+ BY \(a, b, c\) PARALLEL 3/)
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
describe 'DISTINCT' do
|
127
|
+
it 'outputs a DISTINCT statement' do
|
128
|
+
@interpreter.interpret { store(load('in').distinct, 'out') }
|
129
|
+
@interpreter.to_pig_latin.should match(/DISTINCT \w+/)
|
130
|
+
end
|
131
|
+
|
132
|
+
it 'outputs a DISTINCT statement with a PARALLEL clause' do
|
133
|
+
@interpreter.interpret { store(load('in').distinct(:parallel => 4), 'out') }
|
134
|
+
@interpreter.to_pig_latin.should match(/DISTINCT \w+ PARALLEL 4/)
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
describe 'CROSS' do
|
139
|
+
it 'outputs a CROSS statement with two relations' do
|
140
|
+
@interpreter.interpret do
|
141
|
+
a = load('in1')
|
142
|
+
b = load('in2')
|
143
|
+
c = a.cross(b)
|
144
|
+
dump(c)
|
145
|
+
end
|
146
|
+
@interpreter.to_pig_latin.should match(/CROSS \w+, \w+/)
|
147
|
+
end
|
148
|
+
|
149
|
+
it 'outputs a CROSS statement with many relations' do
|
150
|
+
@interpreter.interpret do
|
151
|
+
a = load('in1')
|
152
|
+
b = load('in2')
|
153
|
+
c = load('in3')
|
154
|
+
d = load('in4')
|
155
|
+
e = a.cross(b, c, d)
|
156
|
+
dump(e)
|
157
|
+
end
|
158
|
+
@interpreter.to_pig_latin.should match(/CROSS \w+, \w+, \w+, \w+/)
|
159
|
+
end
|
160
|
+
|
161
|
+
it 'outputs a CROSS statement with a PARALLEL clause' do
|
162
|
+
@interpreter.interpret do
|
163
|
+
a = load('in1')
|
164
|
+
b = load('in2')
|
165
|
+
c = load('in3')
|
166
|
+
d = a.cross([b, c], :parallel => 4)
|
167
|
+
dump(d)
|
168
|
+
end
|
169
|
+
@interpreter.to_pig_latin.should match(/CROSS \w+, \w+, \w+ PARALLEL 4/)
|
170
|
+
end
|
171
|
+
end
|
172
|
+
|
173
|
+
describe 'UNION' do
|
174
|
+
it 'outputs a UNION statement with two relations' do
|
175
|
+
@interpreter.interpret do
|
176
|
+
a = load('in1')
|
177
|
+
b = load('in2')
|
178
|
+
c = a.union(b)
|
179
|
+
dump(c)
|
180
|
+
end
|
181
|
+
@interpreter.to_pig_latin.should match(/UNION \w+, \w+/)
|
182
|
+
end
|
183
|
+
|
184
|
+
it 'outputs a UNION statement with many relations' do
|
185
|
+
@interpreter.interpret do
|
186
|
+
a = load('in1')
|
187
|
+
b = load('in2')
|
188
|
+
c = load('in3')
|
189
|
+
d = load('in4')
|
190
|
+
e = a.union(b, c, d)
|
191
|
+
dump(e)
|
192
|
+
end
|
193
|
+
@interpreter.to_pig_latin.should match(/UNION \w+, \w+, \w+, \w+/)
|
194
|
+
end
|
195
|
+
end
|
196
|
+
|
197
|
+
describe 'SAMPLE' do
|
198
|
+
it 'outputs a SAMPLE statement' do
|
199
|
+
@interpreter.interpret { dump(load('in').sample(10)) }
|
200
|
+
@interpreter.to_pig_latin.should match(/SAMPLE \w+ 10/)
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
204
|
+
describe 'LIMIT' do
|
205
|
+
it 'outputs a LIMIT statement' do
|
206
|
+
@interpreter.interpret { dump(load('in').limit(42)) }
|
207
|
+
@interpreter.to_pig_latin.should match(/LIMIT \w+ 42/)
|
208
|
+
end
|
209
|
+
end
|
210
|
+
|
211
|
+
describe 'FOREACH … GENERATE' do
|
212
|
+
it 'outputs a FOREACH … GENERATE statement' do
|
213
|
+
@interpreter.interpret { dump(load('in').foreach { |r| :a }) }
|
214
|
+
@interpreter.to_pig_latin.should match(/FOREACH \w+ GENERATE a/)
|
215
|
+
end
|
216
|
+
|
217
|
+
it 'outputs a FOREACH … GENERATE statement with a list of fields' do
|
218
|
+
@interpreter.interpret { dump(load('in').foreach { |r| [:a, :b, :c] }) }
|
219
|
+
@interpreter.to_pig_latin.should match(/FOREACH \w+ GENERATE a, b, c/)
|
220
|
+
end
|
221
|
+
|
222
|
+
it 'outputs a FOREACH … GENERATE statement with fields resolved from the relation' do
|
223
|
+
@interpreter.interpret { dump(load('in').foreach { |r| [r.a, r.b, r.c] }) }
|
224
|
+
@interpreter.to_pig_latin.should match(/FOREACH (\w+) GENERATE a, b, c/)
|
225
|
+
end
|
226
|
+
|
227
|
+
it 'outputs a FOREACH … GENERATE statement with fields resolved from the relation with positional syntax' do
|
228
|
+
@interpreter.interpret { dump(load('in').foreach { |r| [r[0], r[1], r[2]] }) }
|
229
|
+
@interpreter.to_pig_latin.should match(/FOREACH (\w+) GENERATE \$0, \$1, \$2/)
|
230
|
+
end
|
231
|
+
|
232
|
+
it 'outputs a FOREACH … GENERATE statement with aggregate functions applied to the fields' do
|
233
|
+
@interpreter.interpret { dump(load('in').foreach { |r| [r.a.max, r.b.min, r.c.avg] }) }
|
234
|
+
@interpreter.to_pig_latin.should match(/FOREACH (\w+) GENERATE MAX\(a\), MIN\(b\), AVG\(c\)/)
|
235
|
+
end
|
236
|
+
|
237
|
+
it 'outputs a FOREACH … GENERATE statement with fields that access inner fields' do
|
238
|
+
@interpreter.interpret { dump(load('in').foreach { |r| [r.a.b, r.b.c, r.c.d] }) }
|
239
|
+
@interpreter.to_pig_latin.should match(/FOREACH (\w+) GENERATE a.b, b.c, c.d/)
|
240
|
+
end
|
241
|
+
|
242
|
+
it 'outputs a FOREACH … GENERATE statement that includes field aliasing' do
|
243
|
+
@interpreter.interpret { dump(load('in').foreach { |r| [r.a.b.as(:c), r.a.b.as(:d)] }) }
|
244
|
+
@interpreter.to_pig_latin.should match(/FOREACH (\w+) GENERATE a.b AS c, a.b AS d/)
|
245
|
+
end
|
246
|
+
end
|
247
|
+
|
248
|
+
describe 'FILTER' do
|
249
|
+
it 'outputs a FILTER statement' do
|
250
|
+
@interpreter.interpret { dump(load('in').filter { |r| r.a == 3 }) }
|
251
|
+
@interpreter.to_pig_latin.should match(/FILTER \w+ BY a == 3/)
|
252
|
+
end
|
253
|
+
|
254
|
+
it 'outputs a FILTER statement with a complex test' do
|
255
|
+
@interpreter.interpret { dump(load('in').filter { |r| (r.a > r.b).and(r.c.ne(3)) }) }
|
256
|
+
@interpreter.to_pig_latin.should match(/FILTER \w+ BY \(a > b\) AND \(c != 3\)/)
|
257
|
+
end
|
258
|
+
end
|
259
|
+
|
260
|
+
describe 'SPLIT' do
|
261
|
+
it 'outputs a SPLIT statement' do
|
262
|
+
@interpreter.interpret do
|
263
|
+
a, b = load('in').split { |r| [r.a >= 0, r.a < 0]}
|
264
|
+
dump(a)
|
265
|
+
dump(b)
|
266
|
+
end
|
267
|
+
@interpreter.to_pig_latin.should match(/SPLIT \w+ INTO \w+ IF a >= 0, \w+ IF a < 0/)
|
268
|
+
end
|
269
|
+
end
|
270
|
+
|
271
|
+
describe 'ORDER' do
|
272
|
+
it 'outputs an ORDER statement' do
|
273
|
+
@interpreter.interpret { dump(load('in').order(:a)) }
|
274
|
+
@interpreter.to_pig_latin.should match(/ORDER \w+ BY a/)
|
275
|
+
end
|
276
|
+
|
277
|
+
it 'outputs an ORDER statement with multiple fields' do
|
278
|
+
@interpreter.interpret { dump(load('in').order(:a, :b)) }
|
279
|
+
@interpreter.to_pig_latin.should match(/ORDER \w+ BY a, b/)
|
280
|
+
end
|
281
|
+
|
282
|
+
it 'outputs an ORDER statement with ASC and DESC' do
|
283
|
+
@interpreter.interpret { dump(load('in').order([:a, :asc], [:b, :desc])) }
|
284
|
+
@interpreter.to_pig_latin.should match(/ORDER \w+ BY a ASC, b DESC/)
|
285
|
+
end
|
286
|
+
end
|
287
|
+
|
288
|
+
describe 'JOIN' do
|
289
|
+
it 'outputs a JOIN statement' do
|
290
|
+
@interpreter.interpret do
|
291
|
+
a = load('in1')
|
292
|
+
b = load('in2')
|
293
|
+
c = a.join(a => :x, b => :y)
|
294
|
+
dump(c)
|
295
|
+
end
|
296
|
+
@interpreter.to_pig_latin.should match(/JOIN \w+ BY \w+, \w+ BY \w+/)
|
297
|
+
end
|
298
|
+
|
299
|
+
it 'outputs a JOIN statement with a PARALLEL clause' do
|
300
|
+
@interpreter.interpret do
|
301
|
+
a = load('in1')
|
302
|
+
b = load('in2')
|
303
|
+
c = a.join(a => :x, b => :y, :parallel => 5)
|
304
|
+
dump(c)
|
305
|
+
end
|
306
|
+
@interpreter.to_pig_latin.should match(/JOIN \w+ BY \w+, \w+ BY \w+ PARALLEL 5/)
|
307
|
+
end
|
308
|
+
|
309
|
+
it 'outputs a JOIN statement with a USING clause' do
|
310
|
+
@interpreter.interpret do
|
311
|
+
a = load('in1')
|
312
|
+
b = load('in2')
|
313
|
+
c = a.join(a => :x, b => :y, :using => :replicated)
|
314
|
+
dump(c)
|
315
|
+
end
|
316
|
+
@interpreter.to_pig_latin.should match(/JOIN \w+ BY \w+, \w+ BY \w+ USING "replicated"/)
|
317
|
+
end
|
318
|
+
end
|
319
|
+
|
320
|
+
describe 'COGROUP' do
|
321
|
+
it 'outputs a COGROUP statement' do
|
322
|
+
@interpreter.interpret do
|
323
|
+
a = load('in1')
|
324
|
+
b = load('in2')
|
325
|
+
c = a.cogroup(a => :x, b => :y)
|
326
|
+
dump(c)
|
327
|
+
end
|
328
|
+
@interpreter.to_pig_latin.should match(/COGROUP \w+ BY \w+, \w+ BY \w+/)
|
329
|
+
end
|
330
|
+
|
331
|
+
it 'outputs a COGROUP statement with multiple join fields' do
|
332
|
+
@interpreter.interpret do
|
333
|
+
a = load('in1')
|
334
|
+
b = load('in2')
|
335
|
+
c = a.cogroup(a => :x, b => [:y, :z, :w])
|
336
|
+
dump(c)
|
337
|
+
end
|
338
|
+
@interpreter.to_pig_latin.should match(/\w+ BY \(y, z, w\)/)
|
339
|
+
end
|
340
|
+
|
341
|
+
it 'outputs a COGROUP statement with a PARALLEL clause' do
|
342
|
+
@interpreter.interpret do
|
343
|
+
a = load('in1')
|
344
|
+
b = load('in2')
|
345
|
+
c = a.cogroup(a => :x, b => :y, :parallel => 5)
|
346
|
+
dump(c)
|
347
|
+
end
|
348
|
+
@interpreter.to_pig_latin.should match(/COGROUP \w+ BY \w+, \w+ BY \w+ PARALLEL 5/)
|
349
|
+
end
|
350
|
+
|
351
|
+
it 'outputs a COGROUP statement with INNER and OUTER' do
|
352
|
+
@interpreter.interpret do
|
353
|
+
a = load('in1')
|
354
|
+
b = load('in2')
|
355
|
+
c = a.cogroup(a => [:x, :inner], b => [:y, :outer])
|
356
|
+
dump(c)
|
357
|
+
end
|
358
|
+
@interpreter.to_pig_latin.should match(/\w+ BY x INNER/)
|
359
|
+
@interpreter.to_pig_latin.should match(/\w+ BY y OUTER/)
|
360
|
+
end
|
361
|
+
end
|
362
|
+
end
|
363
|
+
|
364
|
+
context 'aliasing & multiple statements' do
|
365
|
+
it 'aliases the loaded relation and uses the same alias in the STORE statement' do
|
366
|
+
@interpreter.interpret { store(load('in'), 'out') }
|
367
|
+
@interpreter.to_pig_latin.should match(/(\w+) = LOAD 'in';\nSTORE \1 INTO 'out';/)
|
368
|
+
end
|
369
|
+
|
370
|
+
it 'aliases both a loaded relation and a grouped relation and uses the latter in the STORE statement' do
|
371
|
+
@interpreter.interpret { store(load('in', :schema => [:a]).group(:a), 'out') }
|
372
|
+
@interpreter.to_pig_latin.should match(/(\w+) = LOAD 'in' AS \(a\);\n(\w+) = GROUP \1 BY a;\nSTORE \2 INTO 'out';/)
|
373
|
+
end
|
374
|
+
|
375
|
+
it 'aliases a whole row of statements' do
|
376
|
+
@interpreter.interpret do
|
377
|
+
a = load('in', :schema => [:a])
|
378
|
+
b = a.group(:a)
|
379
|
+
c = b.group(:a)
|
380
|
+
d = c.group(:a)
|
381
|
+
store(d, 'out')
|
382
|
+
end
|
383
|
+
@interpreter.to_pig_latin.should match(/(\w+) = LOAD 'in' AS \(a\);\n(\w+) = GROUP \1 BY a;\n(\w+) = GROUP \2 BY a;\n(\w+) = GROUP \3 BY a;\nSTORE \4 INTO 'out';/)
|
384
|
+
end
|
385
|
+
|
386
|
+
it 'outputs the statements for an alias only once, regardless of how many times it is stored' do
|
387
|
+
@interpreter.interpret do
|
388
|
+
a = load('in')
|
389
|
+
b = a.distinct
|
390
|
+
store(b, 'out1')
|
391
|
+
store(b, 'out2')
|
392
|
+
end
|
393
|
+
@interpreter.to_pig_latin.should match(/(\w+) = LOAD 'in';\n(\w+) = DISTINCT \1;\nSTORE \2 INTO 'out1';\nSTORE \2 INTO 'out2';/)
|
394
|
+
end
|
395
|
+
end
|
396
|
+
|
397
|
+
context 'long and complex scripts' do
|
398
|
+
before do
|
399
|
+
@interpreter.interpret do
|
400
|
+
sessions = load('sessions', :schema => [
|
401
|
+
[:ad_id, :chararray],
|
402
|
+
[:site, :chararray],
|
403
|
+
[:size, :chararray],
|
404
|
+
[:name, :chararray],
|
405
|
+
[:impression, :int],
|
406
|
+
[:engagement, :int],
|
407
|
+
[:click_thru, :int]
|
408
|
+
])
|
409
|
+
%w(site size name).each do |dimension|
|
410
|
+
result = sessions.group(:ad_id, dimension).foreach do |r|
|
411
|
+
[
|
412
|
+
r[0].ad_id.as(:ad_id),
|
413
|
+
literal(dimension).as(:dimension),
|
414
|
+
r[0].field(dimension).as(:value),
|
415
|
+
r[1].exposure.sum.as(:exposures),
|
416
|
+
r[1].impression.sum.as(:impressions),
|
417
|
+
r[1].engagement.sum.as(:engagements),
|
418
|
+
r[1].click_thru.sum.as(:click_thrus)
|
419
|
+
]
|
420
|
+
end
|
421
|
+
store(result, "report_metrics-#{dimension}")
|
422
|
+
end
|
423
|
+
end
|
424
|
+
@output = @interpreter.to_pig_latin
|
425
|
+
end
|
426
|
+
|
427
|
+
it 'outputs the correct number of LOAD statements' do
|
428
|
+
@output.scan(/LOAD/).size.should eql(1)
|
429
|
+
end
|
430
|
+
|
431
|
+
it 'outputs the correct number of STORE statements' do
|
432
|
+
@output.scan(/STORE/).size.should eql(3)
|
433
|
+
end
|
434
|
+
|
435
|
+
it 'doesn\'t assign to the same relation twice' do
|
436
|
+
@assignments = @output.scan(/^(\w+)(?=\s*=)/).flatten
|
437
|
+
@assignments.uniq.should eql(@assignments)
|
438
|
+
end
|
439
|
+
end
|
440
|
+
|
441
|
+
context 'schemas' do
|
442
|
+
it 'knows the schema of a relation returned by #load, with types' do
|
443
|
+
schema = catch(:schema) do
|
444
|
+
@interpreter.interpret do
|
445
|
+
schema = load('in', :schema => [[:a, :chararray], [:b, :chararray]]).schema
|
446
|
+
throw :schema, schema
|
447
|
+
end
|
448
|
+
end
|
449
|
+
schema.field_names.should eql([:a, :b])
|
450
|
+
schema.field_type(:a).should eql(:chararray)
|
451
|
+
end
|
452
|
+
|
453
|
+
it 'knows the schema of a relation returned by #load, without types' do
|
454
|
+
schema = catch(:schema) do
|
455
|
+
@interpreter.interpret do
|
456
|
+
schema = load('in', :schema => [:a, :b]).schema
|
457
|
+
throw :schema, schema
|
458
|
+
end
|
459
|
+
end
|
460
|
+
schema.field_names.should eql([:a, :b])
|
461
|
+
schema.field_type(:a).should eql(:bytearray)
|
462
|
+
end
|
463
|
+
|
464
|
+
it 'knows the schema of a relation returned by #load, with and without types' do
|
465
|
+
schema = catch(:schema) do
|
466
|
+
@interpreter.interpret do
|
467
|
+
schema = load('in', :schema => [[:a, :float], :b]).schema
|
468
|
+
throw :schema, schema
|
469
|
+
end
|
470
|
+
end
|
471
|
+
schema.field_names.should eql([:a, :b])
|
472
|
+
schema.field_type(:a).should eql(:float)
|
473
|
+
end
|
474
|
+
|
475
|
+
it 'does not know anything about the schema of a relation returned by #load if no schema was given' do
|
476
|
+
relation = catch(:relation) do
|
477
|
+
@interpreter.interpret do
|
478
|
+
throw :relation, load('in')
|
479
|
+
end
|
480
|
+
end
|
481
|
+
relation.schema.should be_nil
|
482
|
+
end
|
483
|
+
|
484
|
+
it 'knows the schema of a relation derived through non-schema-changing operations' do
|
485
|
+
schema = catch(:schema) do
|
486
|
+
@interpreter.interpret do
|
487
|
+
relation = load('in', :schema => [[:a, :float], [:b, :int]]).limit(3).sample(0.1).distinct.order(:a)
|
488
|
+
throw :schema, relation.schema
|
489
|
+
end
|
490
|
+
end
|
491
|
+
schema.field_names.should eql([:a, :b])
|
492
|
+
schema.field_type(:a).should eql(:float)
|
493
|
+
schema.field_type(:b).should eql(:int)
|
494
|
+
end
|
495
|
+
|
496
|
+
it 'knows the schema of a relation grouped on one field' do
|
497
|
+
relation = catch(:relation) do
|
498
|
+
@interpreter.interpret do
|
499
|
+
relation = load('in', :schema => [[:a, :float], [:b, :int]]).group(:a)
|
500
|
+
throw :relation, relation
|
501
|
+
end
|
502
|
+
end
|
503
|
+
source_relation_name = relation.sources.first.alias.to_sym
|
504
|
+
relation.schema.field_names.should eql([:group, source_relation_name])
|
505
|
+
relation.schema.field_type(:group).should eql(:float)
|
506
|
+
relation.schema.field_type(source_relation_name).should be_a(Piglet::Schema::Bag)
|
507
|
+
relation.schema.field_type(source_relation_name).field_names.should eql([:a, :b])
|
508
|
+
relation.schema.field_type(source_relation_name).field_type(:a).should eql(:float)
|
509
|
+
end
|
510
|
+
|
511
|
+
it 'knows the schema of a relation grouped on more than one field' do
|
512
|
+
relation = catch(:relation) do
|
513
|
+
@interpreter.interpret do
|
514
|
+
relation = load('in', :schema => [[:a, :float], [:b, :int]]).group(:a, :b)
|
515
|
+
throw :relation, relation
|
516
|
+
end
|
517
|
+
end
|
518
|
+
source_relation_name = relation.sources.first.alias.to_sym
|
519
|
+
relation.schema.field_names.should eql([:group, source_relation_name])
|
520
|
+
relation.schema.field_type(:group).should be_a(Piglet::Schema::Tuple)
|
521
|
+
relation.schema.field_type(:group).field_names.should eql([:a, :b])
|
522
|
+
relation.schema.field_type(:group).field_type(:a).should eql(:float)
|
523
|
+
relation.schema.field_type(source_relation_name).should be_a(Piglet::Schema::Bag)
|
524
|
+
relation.schema.field_type(source_relation_name).field_names.should eql([:a, :b])
|
525
|
+
relation.schema.field_type(source_relation_name).field_type(:b).should eql(:int)
|
526
|
+
end
|
527
|
+
|
528
|
+
it 'knows the schema of a relation cross joined with itself' do
|
529
|
+
schema = catch(:schema) do
|
530
|
+
@interpreter.interpret do
|
531
|
+
relation = load('in', :schema => [[:a, :float], [:b, :int]])
|
532
|
+
relation = relation.cross(relation)
|
533
|
+
throw :schema, relation.schema
|
534
|
+
end
|
535
|
+
end
|
536
|
+
schema.field_names.should eql([:a, :b, :a, :b])
|
537
|
+
schema.field_type(:a).should eql(:float)
|
538
|
+
schema.field_type(:b).should eql(:int)
|
539
|
+
end
|
540
|
+
|
541
|
+
it 'knows the schema of a relation cross joined with another' do
|
542
|
+
schema = catch(:schema) do
|
543
|
+
@interpreter.interpret do
|
544
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
545
|
+
relation2 = load('in2', :schema => [[:c, :chararray], [:d, :double]])
|
546
|
+
relation3 = relation1.cross(relation2)
|
547
|
+
throw :schema, relation3.schema
|
548
|
+
end
|
549
|
+
end
|
550
|
+
schema.field_names.should eql([:a, :b, :c, :d])
|
551
|
+
schema.field_type(:a).should eql(:float)
|
552
|
+
schema.field_type(:b).should eql(:int)
|
553
|
+
schema.field_type(:c).should eql(:chararray)
|
554
|
+
schema.field_type(:d).should eql(:double)
|
555
|
+
end
|
556
|
+
|
557
|
+
it 'knows the schema of a relation joined with another' do
|
558
|
+
schema = catch(:schema) do
|
559
|
+
@interpreter.interpret do
|
560
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
561
|
+
relation2 = load('in2', :schema => [[:c, :int], [:d, :double]])
|
562
|
+
relation3 = relation1.join(relation1 => :b, relation2 => :c)
|
563
|
+
throw :schema, relation3.schema
|
564
|
+
end
|
565
|
+
end
|
566
|
+
schema.field_names.should eql([:a, :b, :c, :d])
|
567
|
+
schema.field_type(:a).should eql(:float)
|
568
|
+
schema.field_type(:b).should eql(:int)
|
569
|
+
schema.field_type(:c).should eql(:int)
|
570
|
+
schema.field_type(:d).should eql(:double)
|
571
|
+
end
|
572
|
+
|
573
|
+
it 'knows the schema of a relation cogrouped with another' do
|
574
|
+
relation1, relation2, relation3 = catch(:relations) do
|
575
|
+
@interpreter.interpret do
|
576
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
577
|
+
relation2 = load('in2', :schema => [[:c, :int], [:d, :double]])
|
578
|
+
relation3 = relation1.cogroup(relation1 => :b, relation2 => :c)
|
579
|
+
throw :relations, [relation1, relation2, relation3]
|
580
|
+
end
|
581
|
+
end
|
582
|
+
relation3.schema.field_names.should eql([:group, relation1.alias.to_sym, relation2.alias.to_sym])
|
583
|
+
relation3.schema.field_type(relation1.alias.to_sym).should be_a(Piglet::Schema::Bag)
|
584
|
+
relation3.schema.field_type(relation2.alias.to_sym).should be_a(Piglet::Schema::Bag)
|
585
|
+
relation3.schema.field_type(relation1.alias.to_sym).field_names.should eql([:a, :b])
|
586
|
+
relation3.schema.field_type(relation2.alias.to_sym).field_names.should eql([:c, :d])
|
587
|
+
end
|
588
|
+
|
589
|
+
it 'knows the schema of a relation projection' do
|
590
|
+
schema = catch(:schema) do
|
591
|
+
@interpreter.interpret do
|
592
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
593
|
+
relation2 = relation1.foreach { |r| [r.a] }
|
594
|
+
throw :schema, relation2.schema
|
595
|
+
end
|
596
|
+
end
|
597
|
+
schema.field_names.should eql([:a])
|
598
|
+
schema.field_type(:a).should eql(:float)
|
599
|
+
end
|
600
|
+
|
601
|
+
it 'knows the schema of a relation projection containing a call to MAX' do
|
602
|
+
schema = catch(:schema) do
|
603
|
+
@interpreter.interpret do
|
604
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
605
|
+
relation2 = relation1.foreach { |r| [r.a.max] }
|
606
|
+
throw :schema, relation2.schema
|
607
|
+
end
|
608
|
+
end
|
609
|
+
schema.field_names.should eql([nil])
|
610
|
+
schema.field_type(0).should eql(:float)
|
611
|
+
end
|
612
|
+
|
613
|
+
it 'knows the schema of a relation projection containing a call to COUNT' do
|
614
|
+
schema = catch(:schema) do
|
615
|
+
@interpreter.interpret do
|
616
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
617
|
+
relation2 = relation1.foreach { |r| [r.a.count] }
|
618
|
+
throw :schema, relation2.schema
|
619
|
+
end
|
620
|
+
end
|
621
|
+
schema.field_names.should eql([nil])
|
622
|
+
schema.field_type(0).should eql(:long)
|
623
|
+
end
|
624
|
+
|
625
|
+
it 'knows the schema of a relation projection containing a field rename' do
|
626
|
+
schema = catch(:schema) do
|
627
|
+
@interpreter.interpret do
|
628
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
629
|
+
relation2 = relation1.foreach { |r| [r.a.count.as(:x)] }
|
630
|
+
throw :schema, relation2.schema
|
631
|
+
end
|
632
|
+
end
|
633
|
+
schema.field_names.should eql([:x])
|
634
|
+
end
|
635
|
+
|
636
|
+
it 'knows the schema of a relation projection containing a literal string' do
|
637
|
+
schema = catch(:schema) do
|
638
|
+
@interpreter.interpret do
|
639
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
640
|
+
relation2 = relation1.foreach { |r| [literal('blipp')] }
|
641
|
+
throw :schema, relation2.schema
|
642
|
+
end
|
643
|
+
end
|
644
|
+
schema.field_type(0).should eql(:chararray)
|
645
|
+
end
|
646
|
+
|
647
|
+
it 'knows the schema of a relation projection containing a literal integer' do
|
648
|
+
schema = catch(:schema) do
|
649
|
+
@interpreter.interpret do
|
650
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
651
|
+
relation2 = relation1.foreach { |r| [literal(4)] }
|
652
|
+
throw :schema, relation2.schema
|
653
|
+
end
|
654
|
+
end
|
655
|
+
schema.field_type(0).should eql(:int)
|
656
|
+
end
|
657
|
+
|
658
|
+
it 'knows the schema of a relation projection containing a literal float' do
|
659
|
+
schema = catch(:schema) do
|
660
|
+
@interpreter.interpret do
|
661
|
+
relation1 = load('in1', :schema => [[:a, :float], [:b, :int]])
|
662
|
+
relation2 = relation1.foreach { |r| [literal(3.14)] }
|
663
|
+
throw :schema, relation2.schema
|
664
|
+
end
|
665
|
+
end
|
666
|
+
schema.field_type(0).should eql(:double)
|
667
|
+
end
|
668
|
+
|
669
|
+
end
|
6
670
|
|
7
671
|
end
|