piglet 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +22 -0
- data/LICENSE +20 -0
- data/README.rdoc +293 -0
- data/Rakefile +50 -0
- data/bin/piglet +9 -0
- data/examples/analysis.rb +311 -0
- data/examples/scratch.rb +11 -0
- data/examples/spike1.rb +43 -0
- data/examples/spike2.rb +40 -0
- data/examples/test1.rb +3 -0
- data/examples/test2.rb +5 -0
- data/examples/test3.rb +4 -0
- data/lib/piglet/assignment.rb +13 -0
- data/lib/piglet/cogroup.rb +31 -0
- data/lib/piglet/cross.rb +22 -0
- data/lib/piglet/describe.rb +5 -0
- data/lib/piglet/distinct.rb +16 -0
- data/lib/piglet/dump.rb +5 -0
- data/lib/piglet/explain.rb +13 -0
- data/lib/piglet/field.rb +40 -0
- data/lib/piglet/field_expression_functions.rb +62 -0
- data/lib/piglet/field_function_expression.rb +19 -0
- data/lib/piglet/field_infix_expression.rb +17 -0
- data/lib/piglet/field_prefix_expression.rb +21 -0
- data/lib/piglet/field_rename.rb +11 -0
- data/lib/piglet/field_suffix_expression.rb +17 -0
- data/lib/piglet/filter.rb +13 -0
- data/lib/piglet/foreach.rb +19 -0
- data/lib/piglet/group.rb +21 -0
- data/lib/piglet/illustrate.rb +5 -0
- data/lib/piglet/interpreter.rb +108 -0
- data/lib/piglet/join.rb +20 -0
- data/lib/piglet/limit.rb +13 -0
- data/lib/piglet/load.rb +31 -0
- data/lib/piglet/load_and_store.rb +16 -0
- data/lib/piglet/order.rb +29 -0
- data/lib/piglet/relation.rb +177 -0
- data/lib/piglet/sample.rb +13 -0
- data/lib/piglet/split.rb +41 -0
- data/lib/piglet/store.rb +17 -0
- data/lib/piglet/storing.rb +13 -0
- data/lib/piglet/stream.rb +5 -0
- data/lib/piglet/union.rb +19 -0
- data/lib/piglet.rb +45 -0
- data/spec/piglet/field_spec.rb +130 -0
- data/spec/piglet/interpreter_spec.rb +413 -0
- data/spec/piglet/relation_spec.rb +79 -0
- data/spec/piglet/split_spec.rb +34 -0
- data/spec/piglet_spec.rb +7 -0
- data/spec/spec.opts +3 -0
- data/spec/spec_helper.rb +14 -0
- metadata +123 -0
@@ -0,0 +1,311 @@
|
|
1
|
+
# raw_sessions =
|
2
|
+
# LOAD '$INPUT/sessions*'
|
3
|
+
# USING PigStorage AS (
|
4
|
+
# date:chararray,
|
5
|
+
# api_key:chararray,
|
6
|
+
# ad_id:chararray,
|
7
|
+
# user_id:chararray,
|
8
|
+
# site:chararray,
|
9
|
+
# size:chararray,
|
10
|
+
# name:chararray,
|
11
|
+
# destination:chararray,
|
12
|
+
# indeterminate_visibility:int,
|
13
|
+
# impression:int,
|
14
|
+
# engagement:int,
|
15
|
+
# click_thru:int,
|
16
|
+
# extra:int,
|
17
|
+
# session_time:int,
|
18
|
+
# visible_time:int,
|
19
|
+
# engagement_time:int
|
20
|
+
# );
|
21
|
+
# Load the raw session log. Every schema entry is written as a two-element
# [field_name, pig_type] array, one per column of the Pig Latin AS (...)
# clause this statement ports. (The original entries lacked the comma
# between name and type, which is a Ruby syntax error.)
raw_sessions = load('$INPUT/sessions*', :schema => [
  [:date, :chararray],
  [:api_key, :chararray],
  [:ad_id, :chararray],
  [:user_id, :chararray],
  [:site, :chararray],
  [:size, :chararray],
  [:name, :chararray],
  [:destination, :chararray],
  [:indeterminate_visibility, :int],
  [:impression, :int],
  [:engagement, :int],
  [:click_thru, :int],
  [:extra, :int],
  [:session_time, :int],
  [:visible_time, :int],
  [:engagement_time, :int]
])
|
39
|
+
|
40
|
+
# raw_actions =
|
41
|
+
# LOAD '$INPUT/actions*'
|
42
|
+
# USING PigStorage AS (
|
43
|
+
# date:chararray,
|
44
|
+
# api_key:chararray,
|
45
|
+
# ad_id:chararray,
|
46
|
+
# user_id:chararray,
|
47
|
+
# action:chararray,
|
48
|
+
# site:chararray,
|
49
|
+
# size:chararray,
|
50
|
+
# name:chararray,
|
51
|
+
# destination:chararray,
|
52
|
+
# extra:int
|
53
|
+
# );
|
54
|
+
# Load the raw action log. The :schema value is a single array of
# [field_name, pig_type] pairs, in the same form as the raw_sessions load.
# (The original omitted both the enclosing array and the commas inside each
# pair, so the statement did not parse.)
raw_actions = load('$INPUT/actions*', :schema => [
  [:date, :chararray],
  [:api_key, :chararray],
  [:ad_id, :chararray],
  [:user_id, :chararray],
  [:action, :chararray],
  [:site, :chararray],
  [:size, :chararray],
  [:name, :chararray],
  [:destination, :chararray],
  [:extra, :int]
])
|
66
|
+
|
67
|
+
#sessions = FILTER raw_sessions BY date is not null;
|
68
|
+
sessions = raw_sessions.filter { |r| r.date.not_null? }
|
69
|
+
|
70
|
+
#actions = FILTER raw_actions BY date is not null;
|
71
|
+
actions = raw_actions.filter { |r| r.date.not_null? }
|
72
|
+
|
73
|
+
# /*
|
74
|
+
# * Modify each session and action based on whether or not it's an extra session
|
75
|
+
# * (a session that was logged only because it was a click thru). Extra sessions
|
76
|
+
# * should affect only the total number of click thrus, not the number of
|
77
|
+
# * exposures, impressions, etc. nor the durations. By setting these values to
|
78
|
+
# * zero and introducing a field for whether or not the session was an exposure
|
79
|
+
# * (zero for extra sessions, one for all other), the calculations below can
|
80
|
+
# * filter out extra sessions without too much work.
|
81
|
+
# */
|
82
|
+
# sessions =
|
83
|
+
# FOREACH
|
84
|
+
# sessions
|
85
|
+
# GENERATE
|
86
|
+
# date,
|
87
|
+
# api_key,
|
88
|
+
# ad_id,
|
89
|
+
# user_id,
|
90
|
+
# site,
|
91
|
+
# size,
|
92
|
+
# name,
|
93
|
+
# destination,
|
94
|
+
# (extra == 1 ? 0 : indeterminate_visibility) AS indeterminate_visibility,
|
95
|
+
# (extra == 1 ? 0 : 1) AS exposure,
|
96
|
+
# (extra == 1 ? 0 : impression) AS impression,
|
97
|
+
# (extra == 1 ? 0 : engagement) AS engagement,
|
98
|
+
# click_thru,
|
99
|
+
# (extra == 1 ? 0 : session_time) AS session_time,
|
100
|
+
# (extra == 1 ? 0 : visible_time) AS visible_time,
|
101
|
+
# (extra == 1 ? 0 : engagement_time) AS engagement_time;
|
102
|
+
sessions = sessions.foreach do |r|
|
103
|
+
[
|
104
|
+
r.date,
|
105
|
+
r.api_key,
|
106
|
+
r.ad_id,
|
107
|
+
r.user_id,
|
108
|
+
r.site,
|
109
|
+
r.size,
|
110
|
+
r.name,
|
111
|
+
r.destination,
|
112
|
+
r.test(r.extra == 1, 0, r.indeterminate_visibility).as(:indeterminate_visibility),
|
113
|
+
r.test(r.extra == 1, 0, 1).as(:exposure),
|
114
|
+
r.test(r.extra == 1, 0, r.impression).as(:impression),
|
115
|
+
r.test(r.extra == 1, 0, r.engagement).as(:engagement),
|
116
|
+
r.click_thru,
|
117
|
+
r.test(r.extra == 1, 0, r.session_time).as(:session_time),
|
118
|
+
r.test(r.extra == 1, 0, r.visible_time).as(:visible_time),
|
119
|
+
r.test(r.extra == 1, 0, r.engagement_time).as(:engagement_time)
|
120
|
+
]
|
121
|
+
end
|
122
|
+
|
123
|
+
# actions =
|
124
|
+
# FOREACH
|
125
|
+
# actions
|
126
|
+
# GENERATE
|
127
|
+
# date,
|
128
|
+
# api_key,
|
129
|
+
# ad_id,
|
130
|
+
# user_id,
|
131
|
+
# action,
|
132
|
+
# site,
|
133
|
+
# size,
|
134
|
+
# name,
|
135
|
+
# destination,
|
136
|
+
# (extra == 1 ? 0 : 1) AS exposure;
|
137
|
+
actions = actions.foreach do |r|
|
138
|
+
[
|
139
|
+
r.date,
|
140
|
+
r.api_key,
|
141
|
+
r.ad_id,
|
142
|
+
r.user_id,
|
143
|
+
r.action,
|
144
|
+
r.site,
|
145
|
+
r.size,
|
146
|
+
r.name,
|
147
|
+
r.destination,
|
148
|
+
r.test(r.extra == 1, 0, 1).as(:exposure)
|
149
|
+
]
|
150
|
+
end
|
151
|
+
|
152
|
+
%w(all site size name).each do |name|
|
153
|
+
# session_category_<%= name %> =
|
154
|
+
# FOREACH
|
155
|
+
# (GROUP sessions BY (date, ad_id, api_key, <%= name == 'all' ? "'all'" : name %>) PARALLEL $PARALLELISM)
|
156
|
+
# GENERATE
|
157
|
+
# $0.date AS date,
|
158
|
+
# $0.ad_id AS ad_id,
|
159
|
+
# $0.api_key AS api_key,
|
160
|
+
# '<%= name %>' AS category,
|
161
|
+
# <%= name == 'all' ? "'all'" : "$0.#{name}" %> AS segment,
|
162
|
+
# SUM($1.exposure) AS exposures,
|
163
|
+
# SUM($1.impression) AS impressions,
|
164
|
+
# SUM($1.engagement) AS engagements,
|
165
|
+
# SUM($1.click_thru) AS click_thrus,
|
166
|
+
# SUM($1.indeterminate_visibility) AS indeterminate_visibility,
|
167
|
+
# SUM($1.session_time) AS session_time,
|
168
|
+
# SUM($1.visible_time) AS visible_time,
|
169
|
+
# SUM($1.engagement_time) AS engagement_time;
|
170
|
+
# Group sessions by (date, ad_id, api_key, category key) and total every
# metric per group. Each aggregate sums one named field of the grouped bag
# (r[1]) and takes the alias used by the Pig Latin being ported, e.g.
# SUM($1.exposure) AS exposures. The original summed r[1] itself eight times.
#
# NOTE(review): `name` is a plain String from the enclosing %w(...) loop and
# "'all'" is a String literal; `as` is only visibly defined on field
# expressions, so confirm the DSL supports aliasing literals.
# NOTE(review): `r[0].name` reads the field literally called "name" rather
# than the field named by the loop variable -- confirm against the Pig
# Latin's `$0.#{name}`.
session_category = sessions.group(:date, :ad_id, :api_key, (name == 'all' ? 'all' : name), :parallel => '$PARALLELISM')
session_category = session_category.foreach do |r|
  [
    r[0].date.as(:date),
    r[0].ad_id.as(:ad_id),
    r[0].api_key.as(:api_key),
    name.as(:category),
    (name == 'all' ? "'all'" : r[0].name).as(:segment),
    r[1].exposure.sum.as(:exposures),
    r[1].impression.sum.as(:impressions),
    r[1].engagement.sum.as(:engagements),
    r[1].click_thru.sum.as(:click_thrus),
    r[1].indeterminate_visibility.sum.as(:indeterminate_visibility),
    r[1].session_time.sum.as(:session_time),
    r[1].visible_time.sum.as(:visible_time),
    r[1].engagement_time.sum.as(:engagement_time)
  ]
end
|
188
|
+
|
189
|
+
# session_category_<%= name %>_by_user_id =
|
190
|
+
# FOREACH
|
191
|
+
# (GROUP sessions BY (date, ad_id, api_key, user_id, <%= name == 'all' ? "'all'" : name %>) PARALLEL $PARALLELISM)
|
192
|
+
# GENERATE
|
193
|
+
# $0.date AS date,
|
194
|
+
# $0.ad_id AS ad_id,
|
195
|
+
# $0.api_key AS api_key,
|
196
|
+
# '<%= name %>' AS category,
|
197
|
+
# <%= name == 'all' ? "'all'" : "$0.#{name}" %> AS segment,
|
198
|
+
# 1 AS exposures,
|
199
|
+
# MAX($1.impression) AS impressions,
|
200
|
+
# MAX($1.engagement) AS engagements,
|
201
|
+
# MAX($1.click_thru) AS click_thrus;
|
202
|
+
# Group sessions per user within each (date, ad_id, api_key, category key)
# and keep the per-user maximum of each flag. Compared with the original:
#   * :user_id is part of the group key, as in the Pig Latin being ported
#     (GROUP sessions BY (date, ad_id, api_key, user_id, ...));
#   * the generated fields are returned as one array, like the other
#     foreach blocks in this file;
#   * the missing comma after the :segment entry is restored.
#
# NOTE(review): `name.as`, `"'all'".as` and `1.as` call `as` on plain Ruby
# String/Integer values; `as` is only visibly defined on field expressions,
# so confirm the DSL supports aliasing literals.
session_category_by_user_id = sessions.group(:date, :ad_id, :api_key, :user_id, (name == 'all' ? 'all' : name), :parallel => '$PARALLELISM')
session_category_by_user_id = session_category_by_user_id.foreach do |r|
  [
    r[0].date.as(:date),
    r[0].ad_id.as(:ad_id),
    r[0].api_key.as(:api_key),
    name.as(:category),
    (name == 'all' ? "'all'" : r[0].name).as(:segment),
    1.as(:exposures),
    r[1].impression.max.as(:impressions),
    r[1].engagement.max.as(:engagements),
    r[1].click_thru.max.as(:click_thrus)
  ]
end
|
214
|
+
|
215
|
+
# unique_session_category_<%= name %> =
|
216
|
+
# FOREACH
|
217
|
+
# (GROUP session_category_<%= name %>_by_user_id BY (date, ad_id, api_key, category, segment) PARALLEL $PARALLELISM)
|
218
|
+
# GENERATE
|
219
|
+
# $0.date AS date,
|
220
|
+
# $0.ad_id AS ad_id,
|
221
|
+
# $0.api_key AS api_key,
|
222
|
+
# $0.category,
|
223
|
+
# $0.segment,
|
224
|
+
# COUNT($1.ad_id) AS unique_exposures,
|
225
|
+
# SUM($1.impressions) AS unique_impressions,
|
226
|
+
# SUM($1.engagements) AS unique_engagements,
|
227
|
+
# SUM($1.click_thrus) AS unique_click_thrus;
|
228
|
+
#
|
229
|
+
# action_category_<%= name %> =
|
230
|
+
# FOREACH
|
231
|
+
# (GROUP actions BY (date, ad_id, api_key, action, <%= name == 'all' ? "'all'" : name %>) PARALLEL $PARALLELISM)
|
232
|
+
# GENERATE
|
233
|
+
# $0.date AS date,
|
234
|
+
# $0.ad_id AS ad_id,
|
235
|
+
# $0.api_key AS api_key,
|
236
|
+
# $0.action AS action,
|
237
|
+
# '<%= name %>' AS category,
|
238
|
+
# <%= name == 'all' ? "'all'" : "$0.#{name}" %> AS segment,
|
239
|
+
# SUM($1.exposure) AS engagements;
|
240
|
+
#
|
241
|
+
# action_category_<%= name %>_by_user_id =
|
242
|
+
# FOREACH
|
243
|
+
# (GROUP actions BY (date, ad_id, api_key, action, user_id, <%= name == 'all' ? "'all'" : name %>) PARALLEL $PARALLELISM)
|
244
|
+
# GENERATE
|
245
|
+
# $0.date AS date,
|
246
|
+
# $0.ad_id AS ad_id,
|
247
|
+
# $0.api_key AS api_key,
|
248
|
+
# $0.action AS action,
|
249
|
+
# '<%= name %>' AS category,
|
250
|
+
# <%= name == 'all' ? "'all'" : "$0.#{name}" %> AS segment,
|
251
|
+
# 1 AS exposures,
|
252
|
+
# 1 AS engagements;
|
253
|
+
#
|
254
|
+
# unique_action_category_<%= name %> =
|
255
|
+
# FOREACH
|
256
|
+
# (GROUP action_category_<%= name %>_by_user_id BY (date, ad_id, api_key, action, category, segment) PARALLEL $PARALLELISM)
|
257
|
+
# GENERATE
|
258
|
+
# $0.date AS date,
|
259
|
+
# $0.ad_id AS ad_id,
|
260
|
+
# $0.api_key AS api_key,
|
261
|
+
# $0.action AS action,
|
262
|
+
# $0.category,
|
263
|
+
# $0.segment,
|
264
|
+
# SUM($1.engagements) AS unique_engagements;
|
265
|
+
end
|
266
|
+
|
267
|
+
# The remainder of the original Pig Latin/ERB template has not been ported to
# the Piglet DSL yet. It is kept verbatim, but commented out like the other
# unported statements in this file, so that the file parses as Ruby.
#
# -- unions ----------------------------------------------------------------------
# -- -----------------------------------------------------------------------------
#
# <% if @categories.size > 1 -%>
# report_metrics =
# UNION
# <%= @categories.map { |name| "session_category_#{name}" }.join(",\n ") %>;
# <% else -%>
# report_metrics = FILTER session_category_<%= @categories.first %> BY 1 == 1;
# <% end -%>
#
# <% if @categories.size > 1 -%>
# unique_report_metrics =
# UNION
# <%= @categories.map { |name| "unique_session_category_#{name}" }.join(",\n ") %>;
# <% else -%>
# unique_report_metrics = FILTER unique_session_category_<%= @categories.first %> BY 1 == 1;
# <% end -%>
# <% if @categories.size > 1 -%>
# report_action_metrics =
# UNION
# <%= @categories.map { |name| "action_category_#{name}" }.join(",\n ") %>;
# <% else -%>
# report_action_metrics = FILTER action_category_<%= @categories.first %> BY 1 == 1;
# <% end -%>
# <% if @categories.size > 1 -%>
# unique_report_action_metrics =
# UNION
# <%= @categories.map { |name| "unique_action_category_#{name}" }.join(",\n ") %>;
# <% else -%>
# unique_report_action_metrics = FILTER unique_action_category_<%= @categories.first %> BY 1 == 1;
# <% end %>
#
# -- complete output -------------------------------------------------------------
# -- -----------------------------------------------------------------------------
#
#
# <% %w(report_metrics unique_report_metrics report_action_metrics unique_report_action_metrics).each do |relation| -%>
# <%= relation %> = FILTER <%= relation %> BY date is not null AND date != '' AND api_key is not null AND api_key != '';
# <% end -%>
#
# STORE report_metrics INTO '$OUTPUT/report_metrics' USING PigStorage;
# STORE unique_report_metrics INTO '$OUTPUT/unique_report_metrics' USING PigStorage;
# STORE report_action_metrics INTO '$OUTPUT/report_action_metrics' USING PigStorage;
# STORE unique_report_action_metrics INTO '$OUTPUT/unique_report_action_metrics' USING PigStorage;
|
data/examples/scratch.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
module Piglet::Relation
|
2
|
+
def samples(*sizes)
|
3
|
+
sizes.map { |s| sample(s) }
|
4
|
+
end
|
5
|
+
end
|
6
|
+
|
7
|
+
input = load('input', :schema => %w(country browser site visit_duration))
|
8
|
+
a, b, c = input.samples(0.1, 0.2, 0.3)
|
9
|
+
store(a, 'output1')
|
10
|
+
store(b, 'output2')
|
11
|
+
store(c, 'output3')
|
data/examples/spike1.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# raw_ads =
|
2
|
+
# LOAD '$INPUT/ads*'
|
3
|
+
# USING PigStorage AS (
|
4
|
+
# ad_id:chararray,
|
5
|
+
# api_key:chararray,
|
6
|
+
# name:chararray,
|
7
|
+
# dimensions:chararray,
|
8
|
+
# destination:chararray,
|
9
|
+
# agent_version:chararray
|
10
|
+
# );
|
11
|
+
raw_ads << load('$INPUT/ads*').using(:pig_storage).as(
|
12
|
+
[:ad_id, :chararray],
|
13
|
+
[:api_key, :chararray],
|
14
|
+
[:name, :chararray],
|
15
|
+
[:dimensions, :chararray],
|
16
|
+
[:destination, :chararray],
|
17
|
+
[:agent_version, :chararray]
|
18
|
+
)
|
19
|
+
|
20
|
+
# ads =
|
21
|
+
# FOREACH
|
22
|
+
# (GROUP raw_ads BY ad_id PARALLEL $PARALLELISM)
|
23
|
+
# GENERATE
|
24
|
+
# $0 AS ad_id,
|
25
|
+
# MAX($1.api_key) AS api_key,
|
26
|
+
# MAX($1.name) AS name,
|
27
|
+
# MAX($1.dimensions) AS dimensions,
|
28
|
+
# MAX($1.destination) AS destination,
|
29
|
+
# MAX($1.agent_version) AS agent_version
|
30
|
+
# ;
|
31
|
+
ads << (raw_ads.group(:ad_id)).foreach do |relation|
|
32
|
+
[
|
33
|
+
relation[0].as(:ad_id),
|
34
|
+
relation[1].api_key.max.as(:api_key),
|
35
|
+
relation[1].name.max.as(:name),
|
36
|
+
relation[1].dimensions.max.as(:dimensions),
|
37
|
+
relation[1].destination.max.as(:destination),
|
38
|
+
relation[1].agent_version.max.as(:agent_version)
|
39
|
+
]
|
40
|
+
end
|
41
|
+
|
42
|
+
# STORE ads INTO '$OUTPUT/ads' USING PigStorage;
|
43
|
+
ads.store('$OUTPUT/ads').using(:pig_storage)
|
data/examples/spike2.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
# raw_ads =
|
2
|
+
# LOAD '$INPUT/ads*'
|
3
|
+
# USING PigStorage AS (
|
4
|
+
# ad_id:chararray,
|
5
|
+
# api_key:chararray,
|
6
|
+
# name:chararray,
|
7
|
+
# dimensions:chararray,
|
8
|
+
# destination:chararray,
|
9
|
+
# agent_version:chararray
|
10
|
+
# );
|
11
|
+
raw_ads = load(
|
12
|
+
'$INPUT/ads*',
|
13
|
+
:using => :pig_storage,
|
14
|
+
:schema => %w(ad_id api_key name dimensions destination agent_version)
|
15
|
+
)
|
16
|
+
|
17
|
+
# ads =
|
18
|
+
# FOREACH
|
19
|
+
# (GROUP raw_ads BY ad_id PARALLEL $PARALLELISM)
|
20
|
+
# GENERATE
|
21
|
+
# $0 AS ad_id,
|
22
|
+
# MAX($1.api_key) AS api_key,
|
23
|
+
# MAX($1.name) AS name,
|
24
|
+
# MAX($1.dimensions) AS dimensions,
|
25
|
+
# MAX($1.destination) AS destination,
|
26
|
+
# MAX($1.agent_version) AS agent_version
|
27
|
+
# ;
|
28
|
+
# Collapse raw_ads to one row per ad_id: relation[0] is the group key and
# relation[1] the grouped rows, from which the MAX of every other column is
# taken (as in the Pig Latin being ported). The original array was missing
# the commas between its elements and the .max on api_key.
ads = raw_ads.group(:ad_id, :parallel => 2).foreach do |relation|
  [
    relation[0].as(:ad_id),
    relation[1].api_key.max.as(:api_key),
    relation[1].name.max.as(:name),
    relation[1].dimensions.max.as(:dimensions),
    relation[1].destination.max.as(:destination),
    relation[1].agent_version.max.as(:agent_version)
  ]
end
|
38
|
+
|
39
|
+
# STORE ads INTO '$OUTPUT/ads' USING PigStorage;
|
40
|
+
store(ads, '$OUTPUT/ads', :using => :pig_storage)
|
data/examples/test1.rb
ADDED
data/examples/test2.rb
ADDED
data/examples/test3.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
module Piglet
  # Builds the text of a Pig Latin COGROUP statement over several relations.
  class Cogroup # :nodoc:
    include Relation

    # Trailing markers that may follow the grouping fields of a source.
    JOIN_MODIFIERS = [:inner, :outer].freeze

    # relation    - accepted for interface compatibility; not read here.
    # description - Hash whose Relation keys map to that relation's grouping
    #               field(s): a single field, or a list of fields optionally
    #               ending in :inner or :outer. Non-Relation keys are
    #               ignored, except :parallel, which supplies the PARALLEL
    #               clause.
    def initialize(relation, description)
      @join_fields = description.reject { |key, _| !key.is_a?(Relation) }
      @sources = @join_fields.keys
      @parallel = description[:parallel]
    end

    # Returns "COGROUP <alias> BY <fields> [INNER|OUTER], ..." followed by
    # " PARALLEL <n>" when a parallelism was given.
    def to_s
      clauses = @sources.map { |source| cogroup_clause(source, @join_fields[source]) }
      str = "COGROUP #{clauses.join(', ')}"
      str << " PARALLEL #{@parallel}" if @parallel
      str
    end

    private

    # Renders one "<alias> BY <fields> [INNER|OUTER]" clause.
    def cogroup_clause(source, fields)
      modifier = nil
      if fields.is_a?(Enumerable)
        fields = fields.to_a
        # A trailing :inner/:outer is a modifier, not a field. A lone element
        # is always treated as a field, as in the original implementation.
        if fields.size > 1 && JOIN_MODIFIERS.include?(fields.last)
          modifier = fields.last.to_s.upcase
          fields = fields[0..-2]
        end
        # Unwrap single-element lists explicitly: interpolating the array
        # itself relies on Array#to_s, which on Ruby 1.9+ is inspect-style
        # output ("[:x]") rather than the bare field name.
        key = fields.size > 1 ? "(#{fields.join(', ')})" : fields.first.to_s
      else
        key = fields.to_s
      end
      clause = "#{source.alias} BY #{key}"
      clause << " #{modifier}" if modifier
      clause
    end
  end
end
|
data/lib/piglet/cross.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
module Piglet
  # Builds the text of a Pig Latin CROSS statement over a list of relations.
  class Cross # :nodoc:
    include Relation

    # relations - the relations to cross; each must respond to #alias.
    # options   - optional Hash; :parallel supplies the PARALLEL clause.
    #             nil is treated like an empty Hash.
    def initialize(relations, options={})
      @sources = relations
      @parallel = (options || {})[:parallel]
    end

    # Returns "CROSS a, b, ..." plus " PARALLEL <n>" when one was given.
    def to_s
      statement = "CROSS #{source_aliases.join(', ')}"
      return statement unless @parallel
      statement << " PARALLEL #{@parallel}"
    end

    private

    # Aliases of all source relations, in the order they were given.
    def source_aliases
      @sources.map { |source| source.alias }
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Piglet
  # Builds the text of a Pig Latin DISTINCT statement for one relation.
  class Distinct # :nodoc:
    include Relation

    # relation - the source relation; must respond to #alias.
    # options  - optional Hash; :parallel supplies the PARALLEL clause.
    #            nil is treated like an empty Hash.
    def initialize(relation, options={})
      @sources = [relation]
      @parallel = (options || {})[:parallel]
    end

    # Returns "DISTINCT <alias>" plus " PARALLEL <n>" when one was given.
    def to_s
      parts = ["DISTINCT #{@sources.first.alias}"]
      parts << "PARALLEL #{@parallel}" if @parallel
      parts.join(' ')
    end
  end
end
|
data/lib/piglet/dump.rb
ADDED
data/lib/piglet/field.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
module Piglet
  # A named field, optionally qualified by the relation or field it was
  # reached through.
  class Field # :nodoc:
    include FieldExpressionFunctions

    # name     - the field name (anything that responds to #to_s).
    # relation - the parent this field belongs to, if any.
    # options  - optional Hash; :explicit_ancestry => true makes #to_s
    #            prefix the name with the parent.
    def initialize(name, relation=nil, options=nil)
      @name = name
      @parent = relation
      @explicit_ancestry = (options || {})[:explicit_ancestry] || false
    end

    # A bare field never needs parentheses when embedded in an expression.
    def simple?
      true
    end

    # Any argument-less call with a word-like name yields a nested field
    # qualified by this one; everything else falls through to the default.
    #
    # No respond_to_missing? is defined here: #to_s below (and
    # FieldExpressionFunctions#parenthesise) use respond_to? to tell real
    # methods apart from names this hook would accept.
    def method_missing(name, *args)
      return super unless args.empty? && name.to_s =~ /^\w+$/
      Field.new(name, self, :explicit_ancestry => true)
    end

    # Positional access: field[n] is the nested field "$n" of this field.
    def [](n)
      Field.new("\$#{n}", self, :explicit_ancestry => true)
    end

    # Returns the bare name, or "<parent>.<name>" when ancestry is explicit.
    # A parent that really responds to #alias is rendered by its alias,
    # any other parent by its own #to_s.
    def to_s
      return @name.to_s unless @explicit_ancestry
      prefix = @parent.respond_to?(:alias) ? @parent.alias : @parent
      "#{prefix}.#{@name}"
    end
  end

end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module Piglet
  # Mixin providing the expression builders shared by fields and field
  # expressions. Every builder returns a new expression object wrapping the
  # receiver; nothing is evaluated in Ruby.
  module FieldExpressionFunctions # :nodoc:
    # Binary operators that are redefined to build an infix expression.
    # Note that this includes ==, so comparing two includers with == yields
    # an expression object rather than true/false.
    SYMBOLIC_OPERATORS = [:==, :>, :<, :>=, :<=, :%, :+, :-, :*, :/].freeze

    # Functions exposed as argument-less methods; the function name passed
    # on is the upper-cased method name.
    FUNCTIONS = [:avg, :count, :diff, :max, :min, :size, :sum, :tokenize].freeze

    FUNCTIONS.each do |fun|
      define_method(fun) { FieldFunctionExpression.new(fun.to_s.upcase, self) }
    end

    # Wraps the receiver in the IsEmpty function.
    def empty?
      FieldFunctionExpression.new('IsEmpty', self)
    end

    # Gives the receiver a new name.
    def as(new_name)
      FieldRename.new(new_name, self)
    end

    # Prefixes the receiver with NOT.
    def not
      FieldPrefixExpression.new('NOT', self)
    end

    # Suffixes the receiver with "is null".
    def null?
      FieldSuffixExpression.new('is null', self)
    end

    # Suffixes the receiver with "is not null".
    def not_null?
      FieldSuffixExpression.new('is not null', self)
    end

    # Prefixes the receiver with a "(type)" cast.
    def cast(type)
      FieldPrefixExpression.new("(#{type.to_s})", self)
    end

    # Builds "<receiver> matches '<pattern>'".
    #
    # pattern - a Regexp or anything responding to #to_s. For a Regexp only
    #           its source is used; Ruby's "(?flags:...)" wrapper and the
    #           flags themselves are dropped.
    #
    # Regexp#source is used instead of pattern-matching Regexp#to_s, because
    # a line-anchored match on the to_s form fails for sources containing a
    # newline and would leak the "(?-mix:...)" wrapper into the output.
    def matches(pattern)
      pattern = pattern.source if pattern.is_a?(Regexp)
      FieldInfixExpression.new('matches', self, "'#{pattern}'")
    end

    # Arithmetic negation of the receiver.
    def neg
      FieldPrefixExpression.new('-', self, false)
    end

    # Inequality; a named method because != cannot be given an independent
    # meaning from the overridden ==.
    def ne(other)
      FieldInfixExpression.new('!=', self, other)
    end

    SYMBOLIC_OPERATORS.each do |op|
      define_method(op) { |other| FieldInfixExpression.new(op.to_s, self, other) }
    end

  protected

    # Returns expr as a String, wrapped in parentheses when expr declares
    # itself non-simple via #simple?. Objects without #simple? (plain Ruby
    # values) are rendered with #to_s unchanged.
    def parenthesise(expr)
      if expr.respond_to?(:simple?) && ! expr.simple?
        "(#{expr})"
      else
        expr.to_s
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Piglet
  # A function applied to an inner expression, rendered as "NAME(inner)".
  class FieldFunctionExpression # :nodoc:
    include FieldExpressionFunctions

    # name             - the function name as it should appear in the output.
    # inner_expression - the expression the function is applied to.
    # options          - optional Hash; :as is stored as the new name but is
    #                    not used by #to_s.
    def initialize(name, inner_expression, options=nil)
      @name = name
      @inner_expression = inner_expression
      @new_name = options ? options[:as] : nil
    end

    # A function call is a compound expression.
    def simple?
      false
    end

    # Returns "<name>(<inner expression>)".
    def to_s
      format('%s(%s)', @name, @inner_expression)
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Piglet
  # A binary expression, rendered as "<left> <operator> <right>".
  class FieldInfixExpression # :nodoc:
    include FieldExpressionFunctions

    # operator         - the operator text placed between the operands.
    # left_expression  - the left operand.
    # right_expression - the right operand.
    def initialize(operator, left_expression, right_expression)
      @operator = operator
      @left_expression = left_expression
      @right_expression = right_expression
    end

    # An infix expression is compound, so enclosing expressions wrap it in
    # parentheses.
    def simple?
      false
    end

    # Renders both operands through parenthesise, so compound operands are
    # bracketed, and joins them around the operator with single spaces.
    def to_s
      left = parenthesise(@left_expression)
      right = parenthesise(@right_expression)
      "#{left} #{@operator} #{right}"
    end
  end
end
|