piglet 0.1.0 → 0.1.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -1,50 +1,5 @@
1
- require 'rubygems'
2
- require 'rake'
3
1
  require 'lib/piglet'
4
2
 
5
- begin
6
- require 'jeweler'
7
- Jeweler::Tasks.new do |gem|
8
- gem.name = "piglet"
9
- gem.summary = %Q{Piglet is a DSL for Pig scripts}
10
- gem.description = %Q{Piglet aims to look like Pig Latin while allowing for things like loops and control of flow that are missing from Pig.}
11
- gem.email = "theo@iconara.net"
12
- gem.homepage = "http://github.com/iconara/piglet"
13
- gem.authors = ["Theo Hultberg"]
14
- gem.add_development_dependency "rspec", ">= 1.2.9"
15
- gem.version = Piglet::VERSION
16
- # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
17
- end
18
- Jeweler::GemcutterTasks.new
19
- rescue LoadError
20
- puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
21
- end
22
-
23
- require 'spec/rake/spectask'
24
-
25
- Spec::Rake::SpecTask.new(:spec) do |spec|
26
- spec.libs << 'lib' << 'spec'
27
- spec.spec_files = FileList['spec/**/*_spec.rb']
28
- end
29
-
30
- Spec::Rake::SpecTask.new(:rcov) do |spec|
31
- spec.libs << 'lib' << 'spec'
32
- spec.pattern = 'spec/**/*_spec.rb'
33
- spec.rcov = true
34
- end
35
-
36
- task :spec => :check_dependencies
37
-
38
3
  task :default => :spec
39
4
 
40
- require 'rake/rdoctask'
41
-
42
- Rake::RDocTask.new do |rdoc|
43
- version = File.exist?('VERSION') ? File.read('VERSION') : ""
44
-
45
- rdoc.rdoc_dir = 'rdoc'
46
- rdoc.title = "piglet #{version}"
47
- rdoc.rdoc_files.include('README*')
48
- rdoc.rdoc_files.include('lib/**/*.rb')
49
- rdoc.options << '--charset' << 'utf-8'
50
- end
5
+ Dir['tasks/*.rake'].each { |t| load t }
data/lib/piglet.rb CHANGED
@@ -1,5 +1,6 @@
1
- module Piglet
2
- VERSION = '0.1.0'
1
+ # :main: README.rdoc
2
+ module Piglet # :nodoc:
3
+ VERSION = '0.1.1'
3
4
 
4
5
  autoload_files = %w(
5
6
  assignment
data/tasks/gem.rake ADDED
@@ -0,0 +1,19 @@
1
+ begin
2
+ require 'jeweler'
3
+
4
+ Jeweler::Tasks.new do |gem|
5
+ gem.name = "piglet"
6
+ gem.summary = %Q{Piglet is a DSL for Pig scripts}
7
+ gem.description = %Q{Piglet aims to look like Pig Latin while allowing for things like loops and control of flow that are missing from Pig.}
8
+ gem.email = "theo@iconara.net"
9
+ gem.homepage = "http://github.com/iconara/piglet"
10
+ gem.authors = ["Theo Hultberg"]
11
+ gem.add_development_dependency "rspec", ">= 1.2.9"
12
+ gem.version = Piglet::VERSION
13
+ gem.test_files = FileList['spec/**/*.rb']
14
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
15
+ end
16
+
17
+ Jeweler::GemcutterTasks.new
18
+ rescue LoadError
19
+ end
data/tasks/rdoc.rake ADDED
@@ -0,0 +1,13 @@
1
+ gem 'rdoc', '>= 2.4.0'
2
+
3
+ require 'rake/rdoctask'
4
+ require 'sdoc' rescue LoadError
5
+
6
+
7
+ Rake::RDocTask.new do |rdoc|
8
+ rdoc.rdoc_dir = 'rdoc'
9
+ rdoc.title = "piglet #{Piglet::VERSION}"
10
+ rdoc.rdoc_files.include('lib/**/*.rb')
11
+ rdoc.rdoc_files.include('README.rdoc')
12
+ rdoc.options << '--charset' << 'utf-8'
13
+ end
data/tasks/spec.rake ADDED
@@ -0,0 +1,15 @@
1
+ require 'spec/rake/spectask'
2
+
3
+ Spec::Rake::SpecTask.new(:spec) do |spec|
4
+ spec.libs << 'lib' << 'spec'
5
+ spec.spec_files = FileList['spec/**/*_spec.rb']
6
+ end
7
+
8
+ Spec::Rake::SpecTask.new(:rcov) do |spec|
9
+ spec.libs << 'lib' << 'spec'
10
+ spec.pattern = 'spec/**/*_spec.rb'
11
+ spec.rcov = true
12
+ end
13
+
14
+ task :spec => :check_dependencies
15
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: piglet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Theo Hultberg
@@ -79,6 +79,9 @@ files:
79
79
  - spec/piglet_spec.rb
80
80
  - spec/spec.opts
81
81
  - spec/spec_helper.rb
82
+ - tasks/gem.rake
83
+ - tasks/rdoc.rake
84
+ - tasks/spec.rake
82
85
  has_rdoc: true
83
86
  homepage: http://github.com/iconara/piglet
84
87
  licenses: []
@@ -114,10 +117,3 @@ test_files:
114
117
  - spec/piglet/split_spec.rb
115
118
  - spec/piglet_spec.rb
116
119
  - spec/spec_helper.rb
117
- - examples/analysis.rb
118
- - examples/scratch.rb
119
- - examples/spike1.rb
120
- - examples/spike2.rb
121
- - examples/test1.rb
122
- - examples/test2.rb
123
- - examples/test3.rb
data/examples/analysis.rb DELETED
@@ -1,311 +0,0 @@
1
- # raw_sessions =
2
- # LOAD '$INPUT/sessions*'
3
- # USING PigStorage AS (
4
- # date:chararray,
5
- # api_key:chararray,
6
- # ad_id:chararray,
7
- # user_id:chararray,
8
- # site:chararray,
9
- # size:chararray,
10
- # name:chararray,
11
- # destination:chararray,
12
- # indeterminate_visibility:int,
13
- # impression:int,
14
- # engagement:int,
15
- # click_thru:int,
16
- # extra:int,
17
- # session_time:int,
18
- # visible_time:int,
19
- # engagement_time:int
20
- # );
21
- raw_sessions = load('$INPUT/sessions*', :schema => [
22
- [:date :chararray],
23
- [:api_key :chararray],
24
- [:ad_id :chararray],
25
- [:user_id :chararray],
26
- [:site :chararray],
27
- [:size :chararray],
28
- [:name :chararray],
29
- [:destination :chararray],
30
- [:indeterminate_visibility :int],
31
- [:impression :int],
32
- [:engagement :int],
33
- [:click_thru :int],
34
- [:extra :int],
35
- [:session_time :int],
36
- [:visible_time :int],
37
- [:engagement_time :int]
38
- ])
39
-
40
- # raw_actions =
41
- # LOAD '$INPUT/actions*'
42
- # USING PigStorage AS (
43
- # date:chararray,
44
- # api_key:chararray,
45
- # ad_id:chararray,
46
- # user_id:chararray,
47
- # action:chararray,
48
- # site:chararray,
49
- # size:chararray,
50
- # name:chararray,
51
- # destination:chararray,
52
- # extra:int
53
- # );
54
- raw_actions = load('$INPUT/actions*', :schema =>
55
- [:date :chararray],
56
- [:api_key :chararray],
57
- [:ad_id :chararray],
58
- [:user_id :chararray],
59
- [:action :chararray],
60
- [:site :chararray],
61
- [:size :chararray],
62
- [:name :chararray],
63
- [:destination :chararray],
64
- [:extra :int]
65
- )
66
-
67
- #sessions = FILTER raw_sessions BY date is not null;
68
- sessions = raw_sessions.filter { |r| r.date.not_null? }
69
-
70
- #actions = FILTER raw_actions BY date is not null;
71
- actions = raw_actions.filter { |r| r.date.not_null? }
72
-
73
- # /*
74
- # * Modify each session and action based on whether or not it's an extra session
75
- # * (a session that was logged only because it was a click thru). Extra sessions
76
- # * should affect only the total number of click thrus, not the number of
77
- # * exposures, impressions, etc. nor the durations. By setting these values to
78
- # * zero and introducing a field for whether or not the session was an exposure
79
- # * (zero for extra sessions, one for all other), the calculations below can
80
- # * filter out extra sessions without too much work.
81
- # */
82
- # sessions =
83
- # FOREACH
84
- # sessions
85
- # GENERATE
86
- # date,
87
- # api_key,
88
- # ad_id,
89
- # user_id,
90
- # site,
91
- # size,
92
- # name,
93
- # destination,
94
- # (extra == 1 ? 0 : indeterminate_visibility) AS indeterminate_visibility,
95
- # (extra == 1 ? 0 : 1) AS exposure,
96
- # (extra == 1 ? 0 : impression) AS impression,
97
- # (extra == 1 ? 0 : engagement) AS engagement,
98
- # click_thru,
99
- # (extra == 1 ? 0 : session_time) AS session_time,
100
- # (extra == 1 ? 0 : visible_time) AS visible_time,
101
- # (extra == 1 ? 0 : engagement_time) AS engagement_time;
102
- sessions = sessions.foreach do |r|
103
- [
104
- r.date,
105
- r.api_key,
106
- r.ad_id,
107
- r.user_id,
108
- r.site,
109
- r.size,
110
- r.name,
111
- r.destination,
112
- r.test(r.extra == 1, 0, r.indeterminate_visibility).as(:indeterminate_visibility),
113
- r.test(r.extra == 1, 0, 1).as(:exposure),
114
- r.test(r.extra == 1, 0, r.impression).as(:impression),
115
- r.test(r.extra == 1, 0, r.engagement).as(:engagement),
116
- r.click_thru,
117
- r.test(r.extra == 1, 0, r.session_time).as(:session_time),
118
- r.test(r.extra == 1, 0, r.visible_time).as(:visible_time),
119
- r.test(r.extra == 1, 0, r.engagement_time).as(:engagement_time)
120
- ]
121
- end
122
-
123
- # actions =
124
- # FOREACH
125
- # actions
126
- # GENERATE
127
- # date,
128
- # api_key,
129
- # ad_id,
130
- # user_id,
131
- # action,
132
- # site,
133
- # size,
134
- # name,
135
- # destination,
136
- # (extra == 1 ? 0 : 1) AS exposure;
137
- actions = actions.foreach do |r|
138
- [
139
- r.date,
140
- r.api_key,
141
- r.ad_id,
142
- r.user_id,
143
- r.action,
144
- r.site,
145
- r.size,
146
- r.name,
147
- r.destination,
148
- r.test(r.extra == 1, 0, 1).as(:exposure)
149
- ]
150
- end
151
-
152
- %w(all site size name).each do |name|
153
- # session_category_<%= name %> =
154
- # FOREACH
155
- # (GROUP sessions BY (date, ad_id, api_key, <%= name == 'all' ? "'all'" : name %>) PARALLEL $PARALLELISM)
156
- # GENERATE
157
- # $0.date AS date,
158
- # $0.ad_id AS ad_id,
159
- # $0.api_key AS api_key,
160
- # '<%= name %>' AS category,
161
- # <%= name == 'all' ? "'all'" : "$0.#{name}" %> AS segment,
162
- # SUM($1.exposure) AS exposures,
163
- # SUM($1.impression) AS impressions,
164
- # SUM($1.engagement) AS engagements,
165
- # SUM($1.click_thru) AS click_thrus,
166
- # SUM($1.indeterminate_visibility) AS indeterminate_visibility,
167
- # SUM($1.session_time) AS session_time,
168
- # SUM($1.visible_time) AS visible_time,
169
- # SUM($1.engagement_time) AS engagement_time;
170
- session_category = sessions.group(:date, :ad_id, :api_key, (name == 'all' ? 'all' : name), :parallel => '$PARALLELISM')
171
- session_category = session_category.foreach do |r|
172
- [
173
- r[0].date.as(:date),
174
- r[0].ad_id.as(:ad_id),
175
- r[0].api_key.as(:api_key),
176
- name.as(:category),
177
- (name == 'all' ? "'all'" : r[0].name).as(:segment),
178
- r[1].sum.as(:exposure),
179
- r[1].sum.as(:impression),
180
- r[1].sum.as(:engagement),
181
- r[1].sum.as(:click_thru),
182
- r[1].sum.as(:indeterminate_visibility),
183
- r[1].sum.as(:session_time),
184
- r[1].sum.as(:visible_time),
185
- r[1].sum.as(:engagement_time)
186
- ]
187
- end
188
-
189
- # session_category_<%= name %>_by_user_id =
190
- # FOREACH
191
- # (GROUP sessions BY (date, ad_id, api_key, user_id, <%= name == 'all' ? "'all'" : name %>) PARALLEL $PARALLELISM)
192
- # GENERATE
193
- # $0.date AS date,
194
- # $0.ad_id AS ad_id,
195
- # $0.api_key AS api_key,
196
- # '<%= name %>' AS category,
197
- # <%= name == 'all' ? "'all'" : "$0.#{name}" %> AS segment,
198
- # 1 AS exposures,
199
- # MAX($1.impression) AS impressions,
200
- # MAX($1.engagement) AS engagements,
201
- # MAX($1.click_thru) AS click_thrus;
202
- session_category_by_user_id = sessions.group(:date, :ad_id, :api_key, (name == 'all' ? 'all' : name), :parallel => '$PARALLELISM')
203
- session_category_by_user_id = session_category_by_user_id.foreach do |r|
204
- r[0].date.as(:date),
205
- r[0].ad_id.as(:ad_id),
206
- r[0].api_key.as(:api_key),
207
- name.as(:category),
208
- (name == 'all' ? "'all'" : r[0].name).as(:segment)
209
- 1.as(:exposures),
210
- r[1].impression.max.as(:impressions),
211
- r[1].engagement.max.as(:engagements),
212
- r[1].click_thru.max.as(:click_thrus)
213
- end
214
-
215
- # unique_session_category_<%= name %> =
216
- # FOREACH
217
- # (GROUP session_category_<%= name %>_by_user_id BY (date, ad_id, api_key, category, segment) PARALLEL $PARALLELISM)
218
- # GENERATE
219
- # $0.date AS date,
220
- # $0.ad_id AS ad_id,
221
- # $0.api_key AS api_key,
222
- # $0.category,
223
- # $0.segment,
224
- # COUNT($1.ad_id) AS unique_exposures,
225
- # SUM($1.impressions) AS unique_impressions,
226
- # SUM($1.engagements) AS unique_engagements,
227
- # SUM($1.click_thrus) AS unique_click_thrus;
228
- #
229
- # action_category_<%= name %> =
230
- # FOREACH
231
- # (GROUP actions BY (date, ad_id, api_key, action, <%= name == 'all' ? "'all'" : name %>) PARALLEL $PARALLELISM)
232
- # GENERATE
233
- # $0.date AS date,
234
- # $0.ad_id AS ad_id,
235
- # $0.api_key AS api_key,
236
- # $0.action AS action,
237
- # '<%= name %>' AS category,
238
- # <%= name == 'all' ? "'all'" : "$0.#{name}" %> AS segment,
239
- # SUM($1.exposure) AS engagements;
240
- #
241
- # action_category_<%= name %>_by_user_id =
242
- # FOREACH
243
- # (GROUP actions BY (date, ad_id, api_key, action, user_id, <%= name == 'all' ? "'all'" : name %>) PARALLEL $PARALLELISM)
244
- # GENERATE
245
- # $0.date AS date,
246
- # $0.ad_id AS ad_id,
247
- # $0.api_key AS api_key,
248
- # $0.action AS action,
249
- # '<%= name %>' AS category,
250
- # <%= name == 'all' ? "'all'" : "$0.#{name}" %> AS segment,
251
- # 1 AS exposures,
252
- # 1 AS engagements;
253
- #
254
- # unique_action_category_<%= name %> =
255
- # FOREACH
256
- # (GROUP action_category_<%= name %>_by_user_id BY (date, ad_id, api_key, action, category, segment) PARALLEL $PARALLELISM)
257
- # GENERATE
258
- # $0.date AS date,
259
- # $0.ad_id AS ad_id,
260
- # $0.api_key AS api_key,
261
- # $0.action AS action,
262
- # $0.category,
263
- # $0.segment,
264
- # SUM($1.engagements) AS unique_engagements;
265
- end
266
-
267
- -- unions ----------------------------------------------------------------------
268
- -- -----------------------------------------------------------------------------
269
-
270
- <% if @categories.size > 1 -%>
271
- report_metrics =
272
- UNION
273
- <%= @categories.map { |name| "session_category_#{name}" }.join(",\n ") %>;
274
- <% else -%>
275
- report_metrics = FILTER session_category_<%= @categories.first %> BY 1 == 1;
276
- <% end -%>
277
-
278
- <% if @categories.size > 1 -%>
279
- unique_report_metrics =
280
- UNION
281
- <%= @categories.map { |name| "unique_session_category_#{name}" }.join(",\n ") %>;
282
- <% else -%>
283
- unique_report_metrics = FILTER unique_session_category_<%= @categories.first %> BY 1 == 1;
284
- <% end -%>
285
- <% if @categories.size > 1 -%>
286
- report_action_metrics =
287
- UNION
288
- <%= @categories.map { |name| "action_category_#{name}" }.join(",\n ") %>;
289
- <% else -%>
290
- report_action_metrics = FILTER action_category_<%= @categories.first %> BY 1 == 1;
291
- <% end -%>
292
- <% if @categories.size > 1 -%>
293
- unique_report_action_metrics =
294
- UNION
295
- <%= @categories.map { |name| "unique_action_category_#{name}" }.join(",\n ") %>;
296
- <% else -%>
297
- unique_report_action_metrics = FILTER unique_action_category_<%= @categories.first %> BY 1 == 1;
298
- <% end %>
299
-
300
- -- complete output -------------------------------------------------------------
301
- -- -----------------------------------------------------------------------------
302
-
303
-
304
- <% %w(report_metrics unique_report_metrics report_action_metrics unique_report_action_metrics).each do |relation| -%>
305
- <%= relation %> = FILTER <%= relation %> BY date is not null AND date != '' AND api_key is not null AND api_key != '';
306
- <% end -%>
307
-
308
- STORE report_metrics INTO '$OUTPUT/report_metrics' USING PigStorage;
309
- STORE unique_report_metrics INTO '$OUTPUT/unique_report_metrics' USING PigStorage;
310
- STORE report_action_metrics INTO '$OUTPUT/report_action_metrics' USING PigStorage;
311
- STORE unique_report_action_metrics INTO '$OUTPUT/unique_report_action_metrics' USING PigStorage;
data/examples/scratch.rb DELETED
@@ -1,11 +0,0 @@
1
- module Piglet::Relation
2
- def samples(*sizes)
3
- sizes.map { |s| sample(s) }
4
- end
5
- end
6
-
7
- input = load('input', :schema => %w(country browser site visit_duration))
8
- a, b, c = input.samples(0.1, 0.2, 0.3)
9
- store(a, 'output1')
10
- store(b, 'output2')
11
- store(c, 'output3')
data/examples/test1.rb DELETED
@@ -1,3 +0,0 @@
1
- raw_data = load 'test1-data.txt', :schema => %w(name city country)
2
- grouped_by_country = raw_data.group :country
3
- dump grouped_by_country
data/examples/test2.rb DELETED
@@ -1,5 +0,0 @@
1
- a = load('in', :schema => %w(x y z w))
2
- %w(x y z w).each do |f|
3
- r = a.group(f)
4
- store(r, 'out-' + f)
5
- end
data/examples/test3.rb DELETED
@@ -1,4 +0,0 @@
1
- a = load 'input', :schema => [:a, :b, :c]
2
- b = a.group :c
3
- c = b.foreach { |r| [r[0], r[1].a.max, r[1].b.max] }
4
- store c, 'output'