piglet 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +22 -0
- data/LICENSE +20 -0
- data/README.rdoc +293 -0
- data/Rakefile +50 -0
- data/bin/piglet +9 -0
- data/examples/analysis.rb +311 -0
- data/examples/scratch.rb +11 -0
- data/examples/spike1.rb +43 -0
- data/examples/spike2.rb +40 -0
- data/examples/test1.rb +3 -0
- data/examples/test2.rb +5 -0
- data/examples/test3.rb +4 -0
- data/lib/piglet/assignment.rb +13 -0
- data/lib/piglet/cogroup.rb +31 -0
- data/lib/piglet/cross.rb +22 -0
- data/lib/piglet/describe.rb +5 -0
- data/lib/piglet/distinct.rb +16 -0
- data/lib/piglet/dump.rb +5 -0
- data/lib/piglet/explain.rb +13 -0
- data/lib/piglet/field.rb +40 -0
- data/lib/piglet/field_expression_functions.rb +62 -0
- data/lib/piglet/field_function_expression.rb +19 -0
- data/lib/piglet/field_infix_expression.rb +17 -0
- data/lib/piglet/field_prefix_expression.rb +21 -0
- data/lib/piglet/field_rename.rb +11 -0
- data/lib/piglet/field_suffix_expression.rb +17 -0
- data/lib/piglet/filter.rb +13 -0
- data/lib/piglet/foreach.rb +19 -0
- data/lib/piglet/group.rb +21 -0
- data/lib/piglet/illustrate.rb +5 -0
- data/lib/piglet/interpreter.rb +108 -0
- data/lib/piglet/join.rb +20 -0
- data/lib/piglet/limit.rb +13 -0
- data/lib/piglet/load.rb +31 -0
- data/lib/piglet/load_and_store.rb +16 -0
- data/lib/piglet/order.rb +29 -0
- data/lib/piglet/relation.rb +177 -0
- data/lib/piglet/sample.rb +13 -0
- data/lib/piglet/split.rb +41 -0
- data/lib/piglet/store.rb +17 -0
- data/lib/piglet/storing.rb +13 -0
- data/lib/piglet/stream.rb +5 -0
- data/lib/piglet/union.rb +19 -0
- data/lib/piglet.rb +45 -0
- data/spec/piglet/field_spec.rb +130 -0
- data/spec/piglet/interpreter_spec.rb +413 -0
- data/spec/piglet/relation_spec.rb +79 -0
- data/spec/piglet/split_spec.rb +34 -0
- data/spec/piglet_spec.rb +7 -0
- data/spec/spec.opts +3 -0
- data/spec/spec_helper.rb +14 -0
- metadata +123 -0
@@ -0,0 +1,311 @@
|
|
1
|
+
# raw_sessions =
|
2
|
+
# LOAD '$INPUT/sessions*'
|
3
|
+
# USING PigStorage AS (
|
4
|
+
# date:chararray,
|
5
|
+
# api_key:chararray,
|
6
|
+
# ad_id:chararray,
|
7
|
+
# user_id:chararray,
|
8
|
+
# site:chararray,
|
9
|
+
# size:chararray,
|
10
|
+
# name:chararray,
|
11
|
+
# destination:chararray,
|
12
|
+
# indeterminate_visibility:int,
|
13
|
+
# impression:int,
|
14
|
+
# engagement:int,
|
15
|
+
# click_thru:int,
|
16
|
+
# extra:int,
|
17
|
+
# session_time:int,
|
18
|
+
# visible_time:int,
|
19
|
+
# engagement_time:int
|
20
|
+
# );
|
21
|
+
# Ruby port of the LOAD statement in the comment above. Each schema entry is a
# [field_name, pig_type] pair; name and type must be separated by a comma to
# form a two-element array.
raw_sessions = load('$INPUT/sessions*', :schema => [
  [:date, :chararray],
  [:api_key, :chararray],
  [:ad_id, :chararray],
  [:user_id, :chararray],
  [:site, :chararray],
  [:size, :chararray],
  [:name, :chararray],
  [:destination, :chararray],
  [:indeterminate_visibility, :int],
  [:impression, :int],
  [:engagement, :int],
  [:click_thru, :int],
  [:extra, :int],
  [:session_time, :int],
  [:visible_time, :int],
  [:engagement_time, :int]
])
|
39
|
+
|
40
|
+
# raw_actions =
|
41
|
+
# LOAD '$INPUT/actions*'
|
42
|
+
# USING PigStorage AS (
|
43
|
+
# date:chararray,
|
44
|
+
# api_key:chararray,
|
45
|
+
# ad_id:chararray,
|
46
|
+
# user_id:chararray,
|
47
|
+
# action:chararray,
|
48
|
+
# site:chararray,
|
49
|
+
# size:chararray,
|
50
|
+
# name:chararray,
|
51
|
+
# destination:chararray,
|
52
|
+
# extra:int
|
53
|
+
# );
|
54
|
+
# Ruby port of the LOAD statement in the comment above. The schema is a single
# array of [field_name, pig_type] pairs, the same shape used for raw_sessions.
raw_actions = load('$INPUT/actions*', :schema => [
  [:date, :chararray],
  [:api_key, :chararray],
  [:ad_id, :chararray],
  [:user_id, :chararray],
  [:action, :chararray],
  [:site, :chararray],
  [:size, :chararray],
  [:name, :chararray],
  [:destination, :chararray],
  [:extra, :int]
])
|
66
|
+
|
67
|
+
#sessions = FILTER raw_sessions BY date is not null;
# Keep only the session rows whose date field is present.
sessions = raw_sessions.filter do |row|
  row.date.not_null?
end

#actions = FILTER raw_actions BY date is not null;
# Keep only the action rows whose date field is present.
actions = raw_actions.filter do |row|
  row.date.not_null?
end
|
72
|
+
|
73
|
+
# /*
|
74
|
+
# * Modify each session and action based on whether or not it's an extra session
|
75
|
+
# * (a session that was logged only because it was a click thru). Extra sessions
|
76
|
+
# * should affect only the total number of click thrus, not the number of
|
77
|
+
# * exposures, impressions, etc. nor the durations. By setting these values to
|
78
|
+
# * zero and introducing a field for whether or not the session was an exposure
|
79
|
+
# * (zero for extra sessions, one for all other), the calculations below can
|
80
|
+
# * filter out extra sessions without too much work.
|
81
|
+
# */
|
82
|
+
# sessions =
|
83
|
+
# FOREACH
|
84
|
+
# sessions
|
85
|
+
# GENERATE
|
86
|
+
# date,
|
87
|
+
# api_key,
|
88
|
+
# ad_id,
|
89
|
+
# user_id,
|
90
|
+
# site,
|
91
|
+
# size,
|
92
|
+
# name,
|
93
|
+
# destination,
|
94
|
+
# (extra == 1 ? 0 : indeterminate_visibility) AS indeterminate_visibility,
|
95
|
+
# (extra == 1 ? 0 : 1) AS exposure,
|
96
|
+
# (extra == 1 ? 0 : impression) AS impression,
|
97
|
+
# (extra == 1 ? 0 : engagement) AS engagement,
|
98
|
+
# click_thru,
|
99
|
+
# (extra == 1 ? 0 : session_time) AS session_time,
|
100
|
+
# (extra == 1 ? 0 : visible_time) AS visible_time,
|
101
|
+
# (extra == 1 ? 0 : engagement_time) AS engagement_time;
|
102
|
+
# Ruby port of the FOREACH ... GENERATE in the comment above: identifying
# fields and click_thru pass through unchanged, every other measure is
# replaced by 0 for extra sessions, and a new exposure column is 0 for extra
# sessions and 1 otherwise.
sessions = sessions.foreach do |row|
  # Builds the Ruby counterpart of the Pig `extra == 1 ? 0 : value`.
  unless_extra = lambda { |value| row.test(row.extra == 1, 0, value) }

  [
    row.date,
    row.api_key,
    row.ad_id,
    row.user_id,
    row.site,
    row.size,
    row.name,
    row.destination,
    unless_extra.call(row.indeterminate_visibility).as(:indeterminate_visibility),
    unless_extra.call(1).as(:exposure),
    unless_extra.call(row.impression).as(:impression),
    unless_extra.call(row.engagement).as(:engagement),
    row.click_thru,
    unless_extra.call(row.session_time).as(:session_time),
    unless_extra.call(row.visible_time).as(:visible_time),
    unless_extra.call(row.engagement_time).as(:engagement_time)
  ]
end
|
122
|
+
|
123
|
+
# actions =
|
124
|
+
# FOREACH
|
125
|
+
# actions
|
126
|
+
# GENERATE
|
127
|
+
# date,
|
128
|
+
# api_key,
|
129
|
+
# ad_id,
|
130
|
+
# user_id,
|
131
|
+
# action,
|
132
|
+
# site,
|
133
|
+
# size,
|
134
|
+
# name,
|
135
|
+
# destination,
|
136
|
+
# (extra == 1 ? 0 : 1) AS exposure;
|
137
|
+
# Ruby port of the FOREACH ... GENERATE in the comment above: the identifying
# fields pass through and an exposure column is added (0 for extra, else 1).
actions = actions.foreach do |row|
  exposure = row.test(row.extra == 1, 0, 1)

  [
    row.date,
    row.api_key,
    row.ad_id,
    row.user_id,
    row.action,
    row.site,
    row.size,
    row.name,
    row.destination,
    exposure.as(:exposure)
  ]
end
|
151
|
+
|
152
|
+
%w(all site size name).each do |name|
|
153
|
+
# session_category_<%= name %> =
|
154
|
+
# FOREACH
|
155
|
+
# (GROUP sessions BY (date, ad_id, api_key, <%= name == 'all' ? "'all'" : name %>) PARALLEL $PARALLELISM)
|
156
|
+
# GENERATE
|
157
|
+
# $0.date AS date,
|
158
|
+
# $0.ad_id AS ad_id,
|
159
|
+
# $0.api_key AS api_key,
|
160
|
+
# '<%= name %>' AS category,
|
161
|
+
# <%= name == 'all' ? "'all'" : "$0.#{name}" %> AS segment,
|
162
|
+
# SUM($1.exposure) AS exposures,
|
163
|
+
# SUM($1.impression) AS impressions,
|
164
|
+
# SUM($1.engagement) AS engagements,
|
165
|
+
# SUM($1.click_thru) AS click_thrus,
|
166
|
+
# SUM($1.indeterminate_visibility) AS indeterminate_visibility,
|
167
|
+
# SUM($1.session_time) AS session_time,
|
168
|
+
# SUM($1.visible_time) AS visible_time,
|
169
|
+
# SUM($1.engagement_time) AS engagement_time;
|
170
|
+
# Ruby port of the session_category_<name> statement in the comment above.
session_category = sessions.group(:date, :ad_id, :api_key, (name == 'all' ? 'all' : name), :parallel => '$PARALLELISM')
session_category = session_category.foreach do |r|
  [
    r[0].date.as(:date),
    r[0].ad_id.as(:ad_id),
    r[0].api_key.as(:api_key),
    # NOTE(review): `name` is a plain String here; confirm that Piglet offers
    # a way to emit a literal column before relying on `String#as`.
    name.as(:category),
    # NOTE(review): `r[0].name` reads the field literally called "name"; the
    # Pig original uses the field named by the loop variable. Confirm intent.
    (name == 'all' ? "'all'" : r[0].name).as(:segment),
    # Each aggregate sums one field of the grouped bag, matching
    # SUM($1.<field>) AS <alias> in the Pig original.
    r[1].exposure.sum.as(:exposures),
    r[1].impression.sum.as(:impressions),
    r[1].engagement.sum.as(:engagements),
    r[1].click_thru.sum.as(:click_thrus),
    r[1].indeterminate_visibility.sum.as(:indeterminate_visibility),
    r[1].session_time.sum.as(:session_time),
    r[1].visible_time.sum.as(:visible_time),
    r[1].engagement_time.sum.as(:engagement_time)
  ]
end
|
188
|
+
|
189
|
+
# session_category_<%= name %>_by_user_id =
|
190
|
+
# FOREACH
|
191
|
+
# (GROUP sessions BY (date, ad_id, api_key, user_id, <%= name == 'all' ? "'all'" : name %>) PARALLEL $PARALLELISM)
|
192
|
+
# GENERATE
|
193
|
+
# $0.date AS date,
|
194
|
+
# $0.ad_id AS ad_id,
|
195
|
+
# $0.api_key AS api_key,
|
196
|
+
# '<%= name %>' AS category,
|
197
|
+
# <%= name == 'all' ? "'all'" : "$0.#{name}" %> AS segment,
|
198
|
+
# 1 AS exposures,
|
199
|
+
# MAX($1.impression) AS impressions,
|
200
|
+
# MAX($1.engagement) AS engagements,
|
201
|
+
# MAX($1.click_thru) AS click_thrus;
|
202
|
+
# Ruby port of the session_category_<name>_by_user_id statement in the comment
# above. The group key includes user_id, as in the Pig original, so that each
# user contributes one row per (date, ad, api key, segment).
session_category_by_user_id = sessions.group(:date, :ad_id, :api_key, :user_id, (name == 'all' ? 'all' : name), :parallel => '$PARALLELISM')
session_category_by_user_id = session_category_by_user_id.foreach do |r|
  # The generated columns are returned as one array, like the other foreach
  # blocks in this file.
  [
    r[0].date.as(:date),
    r[0].ad_id.as(:ad_id),
    r[0].api_key.as(:api_key),
    # NOTE(review): `name` is a String and `1` an Integer; confirm that Piglet
    # offers a way to emit literal columns before relying on `#as` here.
    name.as(:category),
    (name == 'all' ? "'all'" : r[0].name).as(:segment),
    1.as(:exposures),
    r[1].impression.max.as(:impressions),
    r[1].engagement.max.as(:engagements),
    r[1].click_thru.max.as(:click_thrus)
  ]
end
|
214
|
+
|
215
|
+
# unique_session_category_<%= name %> =
|
216
|
+
# FOREACH
|
217
|
+
# (GROUP session_category_<%= name %>_by_user_id BY (date, ad_id, api_key, category, segment) PARALLEL $PARALLELISM)
|
218
|
+
# GENERATE
|
219
|
+
# $0.date AS date,
|
220
|
+
# $0.ad_id AS ad_id,
|
221
|
+
# $0.api_key AS api_key,
|
222
|
+
# $0.category,
|
223
|
+
# $0.segment,
|
224
|
+
# COUNT($1.ad_id) AS unique_exposures,
|
225
|
+
# SUM($1.impressions) AS unique_impressions,
|
226
|
+
# SUM($1.engagements) AS unique_engagements,
|
227
|
+
# SUM($1.click_thrus) AS unique_click_thrus;
|
228
|
+
#
|
229
|
+
# action_category_<%= name %> =
|
230
|
+
# FOREACH
|
231
|
+
# (GROUP actions BY (date, ad_id, api_key, action, <%= name == 'all' ? "'all'" : name %>) PARALLEL $PARALLELISM)
|
232
|
+
# GENERATE
|
233
|
+
# $0.date AS date,
|
234
|
+
# $0.ad_id AS ad_id,
|
235
|
+
# $0.api_key AS api_key,
|
236
|
+
# $0.action AS action,
|
237
|
+
# '<%= name %>' AS category,
|
238
|
+
# <%= name == 'all' ? "'all'" : "$0.#{name}" %> AS segment,
|
239
|
+
# SUM($1.exposure) AS engagements;
|
240
|
+
#
|
241
|
+
# action_category_<%= name %>_by_user_id =
|
242
|
+
# FOREACH
|
243
|
+
# (GROUP actions BY (date, ad_id, api_key, action, user_id, <%= name == 'all' ? "'all'" : name %>) PARALLEL $PARALLELISM)
|
244
|
+
# GENERATE
|
245
|
+
# $0.date AS date,
|
246
|
+
# $0.ad_id AS ad_id,
|
247
|
+
# $0.api_key AS api_key,
|
248
|
+
# $0.action AS action,
|
249
|
+
# '<%= name %>' AS category,
|
250
|
+
# <%= name == 'all' ? "'all'" : "$0.#{name}" %> AS segment,
|
251
|
+
# 1 AS exposures,
|
252
|
+
# 1 AS engagements;
|
253
|
+
#
|
254
|
+
# unique_action_category_<%= name %> =
|
255
|
+
# FOREACH
|
256
|
+
# (GROUP action_category_<%= name %>_by_user_id BY (date, ad_id, api_key, action, category, segment) PARALLEL $PARALLELISM)
|
257
|
+
# GENERATE
|
258
|
+
# $0.date AS date,
|
259
|
+
# $0.ad_id AS ad_id,
|
260
|
+
# $0.api_key AS api_key,
|
261
|
+
# $0.action AS action,
|
262
|
+
# $0.category,
|
263
|
+
# $0.segment,
|
264
|
+
# SUM($1.engagements) AS unique_engagements;
|
265
|
+
end
|
266
|
+
|
267
|
+
# TODO: port the remainder of the Pig Latin/ERB template to Piglet. It is kept
# verbatim below as comments, like the other Pig originals in this file, so
# that it is not parsed as Ruby.
#
# -- unions ----------------------------------------------------------------------
# -- -----------------------------------------------------------------------------
#
# <% if @categories.size > 1 -%>
# report_metrics =
# UNION
# <%= @categories.map { |name| "session_category_#{name}" }.join(",\n ") %>;
# <% else -%>
# report_metrics = FILTER session_category_<%= @categories.first %> BY 1 == 1;
# <% end -%>
#
# <% if @categories.size > 1 -%>
# unique_report_metrics =
# UNION
# <%= @categories.map { |name| "unique_session_category_#{name}" }.join(",\n ") %>;
# <% else -%>
# unique_report_metrics = FILTER unique_session_category_<%= @categories.first %> BY 1 == 1;
# <% end -%>
# <% if @categories.size > 1 -%>
# report_action_metrics =
# UNION
# <%= @categories.map { |name| "action_category_#{name}" }.join(",\n ") %>;
# <% else -%>
# report_action_metrics = FILTER action_category_<%= @categories.first %> BY 1 == 1;
# <% end -%>
# <% if @categories.size > 1 -%>
# unique_report_action_metrics =
# UNION
# <%= @categories.map { |name| "unique_action_category_#{name}" }.join(",\n ") %>;
# <% else -%>
# unique_report_action_metrics = FILTER unique_action_category_<%= @categories.first %> BY 1 == 1;
# <% end %>
#
# -- complete output -------------------------------------------------------------
# -- -----------------------------------------------------------------------------
#
#
# <% %w(report_metrics unique_report_metrics report_action_metrics unique_report_action_metrics).each do |relation| -%>
# <%= relation %> = FILTER <%= relation %> BY date is not null AND date != '' AND api_key is not null AND api_key != '';
# <% end -%>
#
# STORE report_metrics INTO '$OUTPUT/report_metrics' USING PigStorage;
# STORE unique_report_metrics INTO '$OUTPUT/unique_report_metrics' USING PigStorage;
# STORE report_action_metrics INTO '$OUTPUT/report_action_metrics' USING PigStorage;
# STORE unique_report_action_metrics INTO '$OUTPUT/unique_report_action_metrics' USING PigStorage;
|
data/examples/scratch.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
module Piglet::Relation
  # Calls #sample once per given size, in argument order, and returns the
  # results as an array.
  def samples(*sizes)
    sizes.collect do |fraction|
      sample(fraction)
    end
  end
end
|
6
|
+
|
7
|
+
# Load one relation, take three samples of it and store each sample under its
# own output name.
input = load('input', :schema => %w(country browser site visit_duration))
sampled = input.samples(0.1, 0.2, 0.3)
store(sampled[0], 'output1')
store(sampled[1], 'output2')
store(sampled[2], 'output3')
|
data/examples/spike1.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
# raw_ads =
|
2
|
+
# LOAD '$INPUT/ads*'
|
3
|
+
# USING PigStorage AS (
|
4
|
+
# ad_id:chararray,
|
5
|
+
# api_key:chararray,
|
6
|
+
# name:chararray,
|
7
|
+
# dimensions:chararray,
|
8
|
+
# destination:chararray,
|
9
|
+
# agent_version:chararray
|
10
|
+
# );
|
11
|
+
# Spike of a chained load syntax for the LOAD statement in the comment above.
# The schema is a list of [field_name, pig_type] pairs.
ad_schema = [
  [:ad_id, :chararray],
  [:api_key, :chararray],
  [:name, :chararray],
  [:dimensions, :chararray],
  [:destination, :chararray],
  [:agent_version, :chararray]
]
raw_ads << load('$INPUT/ads*').using(:pig_storage).as(*ad_schema)
|
19
|
+
|
20
|
+
# ads =
|
21
|
+
# FOREACH
|
22
|
+
# (GROUP raw_ads BY ad_id PARALLEL $PARALLELISM)
|
23
|
+
# GENERATE
|
24
|
+
# $0 AS ad_id,
|
25
|
+
# MAX($1.api_key) AS api_key,
|
26
|
+
# MAX($1.name) AS name,
|
27
|
+
# MAX($1.dimensions) AS dimensions,
|
28
|
+
# MAX($1.destination) AS destination,
|
29
|
+
# MAX($1.agent_version) AS agent_version
|
30
|
+
# ;
|
31
|
+
# Spike of the FOREACH (GROUP ...) GENERATE statement in the comment above:
# one row per ad_id, taking MAX of every other field.
ads << raw_ads.group(:ad_id).foreach do |grouped|
  [
    grouped[0].as(:ad_id),
    grouped[1].api_key.max.as(:api_key),
    grouped[1].name.max.as(:name),
    grouped[1].dimensions.max.as(:dimensions),
    grouped[1].destination.max.as(:destination),
    grouped[1].agent_version.max.as(:agent_version)
  ]
end

# STORE ads INTO '$OUTPUT/ads' USING PigStorage;
ads.store('$OUTPUT/ads').using(:pig_storage)
|
data/examples/spike2.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
# raw_ads =
|
2
|
+
# LOAD '$INPUT/ads*'
|
3
|
+
# USING PigStorage AS (
|
4
|
+
# ad_id:chararray,
|
5
|
+
# api_key:chararray,
|
6
|
+
# name:chararray,
|
7
|
+
# dimensions:chararray,
|
8
|
+
# destination:chararray,
|
9
|
+
# agent_version:chararray
|
10
|
+
# );
|
11
|
+
# Ruby counterpart of the LOAD statement in the comment above; the schema
# lists field names only.
ad_fields = %w(ad_id api_key name dimensions destination agent_version)
raw_ads = load('$INPUT/ads*', :using => :pig_storage, :schema => ad_fields)
|
16
|
+
|
17
|
+
# ads =
|
18
|
+
# FOREACH
|
19
|
+
# (GROUP raw_ads BY ad_id PARALLEL $PARALLELISM)
|
20
|
+
# GENERATE
|
21
|
+
# $0 AS ad_id,
|
22
|
+
# MAX($1.api_key) AS api_key,
|
23
|
+
# MAX($1.name) AS name,
|
24
|
+
# MAX($1.dimensions) AS dimensions,
|
25
|
+
# MAX($1.destination) AS destination,
|
26
|
+
# MAX($1.agent_version) AS agent_version
|
27
|
+
# ;
|
28
|
+
# Ruby counterpart of the FOREACH (GROUP ...) GENERATE statement in the
# comment above: one row per ad_id, taking MAX of every other field.
ads = raw_ads.group(:ad_id, :parallel => 2).foreach do |relation|
  [
    relation[0].as(:ad_id),
    relation[1].api_key.max.as(:api_key),
    relation[1].name.max.as(:name),
    relation[1].dimensions.max.as(:dimensions),
    relation[1].destination.max.as(:destination),
    relation[1].agent_version.max.as(:agent_version)
  ]
end

# STORE ads INTO '$OUTPUT/ads' USING PigStorage;
store(ads, '$OUTPUT/ads', :using => :pig_storage)
|
data/examples/test1.rb
ADDED
data/examples/test2.rb
ADDED
data/examples/test3.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
module Piglet
  # Renders a Pig Latin COGROUP statement.
  class Cogroup # :nodoc:
    include Relation

    # relation    - accepted for interface compatibility; not used here.
    # description - Hash. Entries keyed by a Relation map that relation to the
    #               field(s) it is grouped by; an Array of fields may end with
    #               :inner or :outer. The :parallel entry, if present, becomes
    #               the PARALLEL clause.
    def initialize(relation, description)
      @join_fields = description.reject { |k, v| ! (k.is_a?(Relation)) }
      @sources = @join_fields.keys
      @parallel = description[:parallel]
    end

    # Returns the statement as "COGROUP <alias> BY <fields>[ INNER|OUTER], ..."
    # followed by " PARALLEL <n>" when a parallelism was given.
    def to_s
      joins = @sources.map { |source| join_clause(source, @join_fields[source]) }
      str = "COGROUP #{joins.join(', ')}"
      str << " PARALLEL #{@parallel}" if @parallel
      str
    end

    private

    # Builds the "<alias> BY ..." clause for one source relation.
    def join_clause(source, fields)
      # Work on an Array in every case, so that a lone field inside an Array
      # is rendered by name rather than via Array#to_s (which is #inspect on
      # Ruby 1.9 and later).
      fields = [fields] unless fields.is_a?(Array)

      inout = nil
      if fields.size > 1 && (fields.last == :inner || fields.last == :outer)
        inout = fields.last.to_s.upcase
        fields = fields[0..-2]
      end

      by = fields.size > 1 ? "(#{fields.join(', ')})" : fields.first.to_s
      str = "#{source.alias} BY #{by}"
      str << " #{inout}" if inout
      str
    end
  end
end
|
data/lib/piglet/cross.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
module Piglet
  # Renders a Pig Latin CROSS statement over a list of relations.
  class Cross # :nodoc:
    include Relation

    # relations - the relations to cross; each must respond to #alias.
    # options   - Hash or nil; :parallel, if set, becomes the PARALLEL clause.
    def initialize(relations, options={})
      @sources = relations
      @parallel = (options || {})[:parallel]
    end

    # Returns "CROSS a, b, ..." with " PARALLEL <n>" appended when set.
    def to_s
      statement = "CROSS #{source_aliases.join(', ')}"
      return statement unless @parallel
      statement << " PARALLEL #{@parallel}"
    end

    private

    # Aliases of all source relations, in order.
    def source_aliases
      @sources.map do |source|
        source.alias
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Piglet
  # Renders a Pig Latin DISTINCT statement for a single relation.
  class Distinct # :nodoc:
    include Relation

    # relation - the source relation; must respond to #alias.
    # options  - Hash or nil; :parallel, if set, becomes the PARALLEL clause.
    def initialize(relation, options={})
      @sources = [relation]
      @parallel = (options || {})[:parallel]
    end

    # Returns "DISTINCT <alias>" with " PARALLEL <n>" appended when set.
    def to_s
      statement = "DISTINCT #{@sources.first.alias}"
      @parallel ? "#{statement} PARALLEL #{@parallel}" : statement
    end
  end
end
|
data/lib/piglet/dump.rb
ADDED
data/lib/piglet/field.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
module Piglet
  # A named field reference, optionally qualified by its parent.
  class Field # :nodoc:
    include FieldExpressionFunctions

    # name     - the field name.
    # relation - the parent (a relation or another Field), or nil.
    # options  - Hash or nil; :explicit_ancestry makes #to_s prefix the name
    #            with the parent.
    def initialize(name, relation=nil, options=nil)
      @name = name
      @parent = relation
      @explicit_ancestry = (options || {})[:explicit_ancestry] || false
    end

    # A plain field never needs parentheses when embedded in an expression.
    def simple?
      true
    end

    # Any argument-less call with a word-like name yields a nested field
    # qualified by this one. Note that #to_s below depends on
    # respond_to?(:alias) being false for Field parents, so these dynamic
    # names are intentionally not advertised through respond_to?.
    def method_missing(name, *args)
      return super unless args.empty? && name.to_s =~ /^\w+$/
      Field.new(name, self, :explicit_ancestry => true)
    end

    # Positional access: field[n] yields the nested field "$n".
    def [](n)
      Field.new("\$#{n}", self, :explicit_ancestry => true)
    end

    # Returns the bare name, or "<parent>.<name>" with explicit ancestry. The
    # parent is rendered through #alias when it has one, otherwise via #to_s.
    def to_s
      return @name.to_s unless @explicit_ancestry
      prefix = @parent.respond_to?(:alias) ? @parent.alias : @parent
      "#{prefix}.#{@name.to_s}"
    end
  end

end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module Piglet
  # Expression-building methods shared by fields and field expressions. Each
  # method wraps the receiver in a new expression object.
  module FieldExpressionFunctions # :nodoc:
    SYMBOLIC_OPERATORS = [:==, :>, :<, :>=, :<=, :%, :+, :-, :*, :/].freeze
    FUNCTIONS = [:avg, :count, :diff, :max, :min, :size, :sum, :tokenize].freeze

    # Matches the "(?flags:source)" form produced by Regexp#to_s and captures
    # the source. Anchored to the whole string and allowed to span newlines so
    # that patterns containing line breaks are unwrapped too.
    REGEXP_WRAPPER_PATTERN = /\A\(\?[^:]+:(.*)\)\z/m

    # One method per function name; each wraps the receiver in a call to the
    # upper-cased function (e.g. #max => "MAX").
    FUNCTIONS.each do |fun|
      define_method(fun) { FieldFunctionExpression.new(fun.to_s.upcase, self) }
    end

    # Wraps the receiver in an IsEmpty function call.
    def empty?
      FieldFunctionExpression.new('IsEmpty', self)
    end

    # Renames the receiver to new_name.
    def as(new_name)
      FieldRename.new(new_name, self)
    end

    # Prefixes the receiver with NOT.
    def not
      FieldPrefixExpression.new('NOT', self)
    end

    # Suffixes the receiver with "is null".
    def null?
      FieldSuffixExpression.new('is null', self)
    end

    # Suffixes the receiver with "is not null".
    def not_null?
      FieldSuffixExpression.new('is not null', self)
    end

    # Prefixes the receiver with a "(type)" cast.
    def cast(type)
      FieldPrefixExpression.new("(#{type.to_s})", self)
    end

    # Builds a "matches" infix expression with the pattern as a single-quoted
    # string. A Regexp is reduced to its source by stripping the
    # "(?flags:...)" wrapper that Regexp#to_s adds; anything else is used as
    # given via #to_s.
    def matches(pattern)
      pattern = pattern.to_s.sub(REGEXP_WRAPPER_PATTERN, '\1') if pattern.is_a?(Regexp)
      FieldInfixExpression.new('matches', self, "'#{pattern.to_s}'")
    end

    # Prefixes the receiver with unary minus.
    def neg
      FieldPrefixExpression.new('-', self, false)
    end

    # Builds a "!=" infix expression against other.
    def ne(other)
      FieldInfixExpression.new('!=', self, other)
    end

    # One method per symbolic operator; each builds the corresponding infix
    # expression against its argument.
    SYMBOLIC_OPERATORS.each do |op|
      define_method(op) { |other| FieldInfixExpression.new(op.to_s, self, other) }
    end

  protected

    # Renders expr, wrapped in parentheses when it reports itself as not
    # simple; objects without #simple? are rendered as they are.
    def parenthesise(expr)
      if expr.respond_to?(:simple?) && ! expr.simple?
        "(#{expr})"
      else
        expr.to_s
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Piglet
  # A function call applied to an inner expression, e.g. MAX(x).
  class FieldFunctionExpression # :nodoc:
    include FieldExpressionFunctions

    # name             - the function name as it should appear in the output.
    # inner_expression - the expression passed to the function.
    # options          - Hash or nil; the :as entry is stored but not used by
    #                    this class.
    def initialize(name, inner_expression, options=nil)
      @name = name
      @inner_expression = inner_expression
      @new_name = (options || {})[:as]
    end

    # A function call is not a simple expression.
    def simple?
      false
    end

    # Returns "<name>(<inner expression>)".
    def to_s
      rendered_argument = "#{@inner_expression}"
      "#{@name}(" + rendered_argument + ")"
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Piglet
  # A binary expression: two operands joined by an operator.
  class FieldInfixExpression # :nodoc:
    include FieldExpressionFunctions

    # operator         - the operator text placed between the operands.
    # left_expression  - the left operand.
    # right_expression - the right operand.
    def initialize(operator, left_expression, right_expression)
      @operator = operator
      @left_expression = left_expression
      @right_expression = right_expression
    end

    # An infix expression is not a simple expression.
    def simple?
      false
    end

    # Returns "<left> <operator> <right>", parenthesising each operand that
    # is itself not simple.
    def to_s
      left = parenthesise(@left_expression)
      right = parenthesise(@right_expression)
      "#{left} #{@operator} #{right}"
    end
  end
end
|