piglet 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -1,50 +1,5 @@
1
- require 'rubygems'
2
- require 'rake'
3
1
  require 'lib/piglet'
4
2
 
5
- begin
6
- require 'jeweler'
7
- Jeweler::Tasks.new do |gem|
8
- gem.name = "piglet"
9
- gem.summary = %Q{Piglet is a DSL for Pig scripts}
10
- gem.description = %Q{Piglet aims to look like Pig Latin while allowing for things like loops and control of flow that are missing from Pig.}
11
- gem.email = "theo@iconara.net"
12
- gem.homepage = "http://github.com/iconara/piglet"
13
- gem.authors = ["Theo Hultberg"]
14
- gem.add_development_dependency "rspec", ">= 1.2.9"
15
- gem.version = Piglet::VERSION
16
- # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
17
- end
18
- Jeweler::GemcutterTasks.new
19
- rescue LoadError
20
- puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
21
- end
22
-
23
- require 'spec/rake/spectask'
24
-
25
- Spec::Rake::SpecTask.new(:spec) do |spec|
26
- spec.libs << 'lib' << 'spec'
27
- spec.spec_files = FileList['spec/**/*_spec.rb']
28
- end
29
-
30
- Spec::Rake::SpecTask.new(:rcov) do |spec|
31
- spec.libs << 'lib' << 'spec'
32
- spec.pattern = 'spec/**/*_spec.rb'
33
- spec.rcov = true
34
- end
35
-
36
- task :spec => :check_dependencies
37
-
38
3
  task :default => :spec
39
4
 
40
- require 'rake/rdoctask'
41
-
42
- Rake::RDocTask.new do |rdoc|
43
- version = File.exist?('VERSION') ? File.read('VERSION') : ""
44
-
45
- rdoc.rdoc_dir = 'rdoc'
46
- rdoc.title = "piglet #{version}"
47
- rdoc.rdoc_files.include('README*')
48
- rdoc.rdoc_files.include('lib/**/*.rb')
49
- rdoc.options << '--charset' << 'utf-8'
50
- end
5
+ Dir['tasks/*.rake'].each { |t| load t }
data/lib/piglet.rb CHANGED
@@ -1,5 +1,6 @@
1
- module Piglet
2
- VERSION = '0.1.0'
1
+ # :main: README.rdoc
2
+ module Piglet # :nodoc:
3
+ VERSION = '0.1.1'
3
4
 
4
5
  autoload_files = %w(
5
6
  assignment
data/tasks/gem.rake ADDED
@@ -0,0 +1,19 @@
1
+ begin
2
+ require 'jeweler'
3
+
4
+ Jeweler::Tasks.new do |gem|
5
+ gem.name = "piglet"
6
+ gem.summary = %Q{Piglet is a DSL for Pig scripts}
7
+ gem.description = %Q{Piglet aims to look like Pig Latin while allowing for things like loops and control of flow that are missing from Pig.}
8
+ gem.email = "theo@iconara.net"
9
+ gem.homepage = "http://github.com/iconara/piglet"
10
+ gem.authors = ["Theo Hultberg"]
11
+ gem.add_development_dependency "rspec", ">= 1.2.9"
12
+ gem.version = Piglet::VERSION
13
+ gem.test_files = FileList['spec/**/*.rb']
14
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
15
+ end
16
+
17
+ Jeweler::GemcutterTasks.new
18
+ rescue LoadError
19
+ end
data/tasks/rdoc.rake ADDED
@@ -0,0 +1,13 @@
1
+ gem 'rdoc', '>= 2.4.0'
2
+
3
+ require 'rake/rdoctask'
4
+ require 'sdoc' rescue LoadError
5
+
6
+
7
+ Rake::RDocTask.new do |rdoc|
8
+ rdoc.rdoc_dir = 'rdoc'
9
+ rdoc.title = "piglet #{Piglet::VERSION}"
10
+ rdoc.rdoc_files.include('lib/**/*.rb')
11
+ rdoc.rdoc_files.include('README.rdoc')
12
+ rdoc.options << '--charset' << 'utf-8'
13
+ end
data/tasks/spec.rake ADDED
@@ -0,0 +1,15 @@
1
+ require 'spec/rake/spectask'
2
+
3
+ Spec::Rake::SpecTask.new(:spec) do |spec|
4
+ spec.libs << 'lib' << 'spec'
5
+ spec.spec_files = FileList['spec/**/*_spec.rb']
6
+ end
7
+
8
+ Spec::Rake::SpecTask.new(:rcov) do |spec|
9
+ spec.libs << 'lib' << 'spec'
10
+ spec.pattern = 'spec/**/*_spec.rb'
11
+ spec.rcov = true
12
+ end
13
+
14
+ task :spec => :check_dependencies
15
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: piglet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Theo Hultberg
@@ -79,6 +79,9 @@ files:
79
79
  - spec/piglet_spec.rb
80
80
  - spec/spec.opts
81
81
  - spec/spec_helper.rb
82
+ - tasks/gem.rake
83
+ - tasks/rdoc.rake
84
+ - tasks/spec.rake
82
85
  has_rdoc: true
83
86
  homepage: http://github.com/iconara/piglet
84
87
  licenses: []
@@ -114,10 +117,3 @@ test_files:
114
117
  - spec/piglet/split_spec.rb
115
118
  - spec/piglet_spec.rb
116
119
  - spec/spec_helper.rb
117
- - examples/analysis.rb
118
- - examples/scratch.rb
119
- - examples/spike1.rb
120
- - examples/spike2.rb
121
- - examples/test1.rb
122
- - examples/test2.rb
123
- - examples/test3.rb
data/examples/analysis.rb DELETED
@@ -1,311 +0,0 @@
1
- # raw_sessions =
2
- # LOAD '$INPUT/sessions*'
3
- # USING PigStorage AS (
4
- # date:chararray,
5
- # api_key:chararray,
6
- # ad_id:chararray,
7
- # user_id:chararray,
8
- # site:chararray,
9
- # size:chararray,
10
- # name:chararray,
11
- # destination:chararray,
12
- # indeterminate_visibility:int,
13
- # impression:int,
14
- # engagement:int,
15
- # click_thru:int,
16
- # extra:int,
17
- # session_time:int,
18
- # visible_time:int,
19
- # engagement_time:int
20
- # );
21
- raw_sessions = load('$INPUT/sessions*', :schema => [
22
- [:date :chararray],
23
- [:api_key :chararray],
24
- [:ad_id :chararray],
25
- [:user_id :chararray],
26
- [:site :chararray],
27
- [:size :chararray],
28
- [:name :chararray],
29
- [:destination :chararray],
30
- [:indeterminate_visibility :int],
31
- [:impression :int],
32
- [:engagement :int],
33
- [:click_thru :int],
34
- [:extra :int],
35
- [:session_time :int],
36
- [:visible_time :int],
37
- [:engagement_time :int]
38
- ])
39
-
40
- # raw_actions =
41
- # LOAD '$INPUT/actions*'
42
- # USING PigStorage AS (
43
- # date:chararray,
44
- # api_key:chararray,
45
- # ad_id:chararray,
46
- # user_id:chararray,
47
- # action:chararray,
48
- # site:chararray,
49
- # size:chararray,
50
- # name:chararray,
51
- # destination:chararray,
52
- # extra:int
53
- # );
54
- raw_actions = load('$INPUT/actions*', :schema =>
55
- [:date :chararray],
56
- [:api_key :chararray],
57
- [:ad_id :chararray],
58
- [:user_id :chararray],
59
- [:action :chararray],
60
- [:site :chararray],
61
- [:size :chararray],
62
- [:name :chararray],
63
- [:destination :chararray],
64
- [:extra :int]
65
- )
66
-
67
- #sessions = FILTER raw_sessions BY date is not null;
68
- sessions = raw_sessions.filter { |r| r.date.not_null? }
69
-
70
- #actions = FILTER raw_actions BY date is not null;
71
- actions = raw_actions.filter { |r| r.date.not_null? }
72
-
73
- # /*
74
- # * Modify each session and action based on whether or not it's an extra session
75
- # * (a session that was logged only because it was a click thru). Extra sessions
76
- # * should affect only the total number of click thrus, not the number of
77
- # * exposures, impressions, etc. nor the durations. By setting these values to
78
- # * zero and introducing a field for whether or not the session was an exposure
79
- # * (zero for extra sessions, one for all other), the calculations below can
80
- # * filter out extra sessions without too much work.
81
- # */
82
- # sessions =
83
- # FOREACH
84
- # sessions
85
- # GENERATE
86
- # date,
87
- # api_key,
88
- # ad_id,
89
- # user_id,
90
- # site,
91
- # size,
92
- # name,
93
- # destination,
94
- # (extra == 1 ? 0 : indeterminate_visibility) AS indeterminate_visibility,
95
- # (extra == 1 ? 0 : 1) AS exposure,
96
- # (extra == 1 ? 0 : impression) AS impression,
97
- # (extra == 1 ? 0 : engagement) AS engagement,
98
- # click_thru,
99
- # (extra == 1 ? 0 : session_time) AS session_time,
100
- # (extra == 1 ? 0 : visible_time) AS visible_time,
101
- # (extra == 1 ? 0 : engagement_time) AS engagement_time;
102
- sessions = sessions.foreach do |r|
103
- [
104
- r.date,
105
- r.api_key,
106
- r.ad_id,
107
- r.user_id,
108
- r.site,
109
- r.size,
110
- r.name,
111
- r.destination,
112
- r.test(r.extra == 1, 0, r.indeterminate_visibility).as(:indeterminate_visibility),
113
- r.test(r.extra == 1, 0, 1).as(:exposure),
114
- r.test(r.extra == 1, 0, r.impression).as(:impression),
115
- r.test(r.extra == 1, 0, r.engagement).as(:engagement),
116
- r.click_thru,
117
- r.test(r.extra == 1, 0, r.session_time).as(:session_time),
118
- r.test(r.extra == 1, 0, r.visible_time).as(:visible_time),
119
- r.test(r.extra == 1, 0, r.engagement_time).as(:engagement_time)
120
- ]
121
- end
122
-
123
- # actions =
124
- # FOREACH
125
- # actions
126
- # GENERATE
127
- # date,
128
- # api_key,
129
- # ad_id,
130
- # user_id,
131
- # action,
132
- # site,
133
- # size,
134
- # name,
135
- # destination,
136
- # (extra == 1 ? 0 : 1) AS exposure;
137
- actions = actions.foreach do |r|
138
- [
139
- r.date,
140
- r.api_key,
141
- r.ad_id,
142
- r.user_id,
143
- r.action,
144
- r.site,
145
- r.size,
146
- r.name,
147
- r.destination,
148
- r.test(r.extra == 1, 0, 1).as(:exposure)
149
- ]
150
- end
151
-
152
- %w(all site size name).each do |name|
153
- # session_category_<%= name %> =
154
- # FOREACH
155
- # (GROUP sessions BY (date, ad_id, api_key, <%= name == 'all' ? "'all'" : name %>) PARALLEL $PARALLELISM)
156
- # GENERATE
157
- # $0.date AS date,
158
- # $0.ad_id AS ad_id,
159
- # $0.api_key AS api_key,
160
- # '<%= name %>' AS category,
161
- # <%= name == 'all' ? "'all'" : "$0.#{name}" %> AS segment,
162
- # SUM($1.exposure) AS exposures,
163
- # SUM($1.impression) AS impressions,
164
- # SUM($1.engagement) AS engagements,
165
- # SUM($1.click_thru) AS click_thrus,
166
- # SUM($1.indeterminate_visibility) AS indeterminate_visibility,
167
- # SUM($1.session_time) AS session_time,
168
- # SUM($1.visible_time) AS visible_time,
169
- # SUM($1.engagement_time) AS engagement_time;
170
- session_category = sessions.group(:date, :ad_id, :api_key, (name == 'all' ? 'all' : name), :parallel => '$PARALLELISM')
171
- session_category = session_category.foreach do |r|
172
- [
173
- r[0].date.as(:date),
174
- r[0].ad_id.as(:ad_id),
175
- r[0].api_key.as(:api_key),
176
- name.as(:category),
177
- (name == 'all' ? "'all'" : r[0].name).as(:segment),
178
- r[1].sum.as(:exposure),
179
- r[1].sum.as(:impression),
180
- r[1].sum.as(:engagement),
181
- r[1].sum.as(:click_thru),
182
- r[1].sum.as(:indeterminate_visibility),
183
- r[1].sum.as(:session_time),
184
- r[1].sum.as(:visible_time),
185
- r[1].sum.as(:engagement_time)
186
- ]
187
- end
188
-
189
- # session_category_<%= name %>_by_user_id =
190
- # FOREACH
191
- # (GROUP sessions BY (date, ad_id, api_key, user_id, <%= name == 'all' ? "'all'" : name %>) PARALLEL $PARALLELISM)
192
- # GENERATE
193
- # $0.date AS date,
194
- # $0.ad_id AS ad_id,
195
- # $0.api_key AS api_key,
196
- # '<%= name %>' AS category,
197
- # <%= name == 'all' ? "'all'" : "$0.#{name}" %> AS segment,
198
- # 1 AS exposures,
199
- # MAX($1.impression) AS impressions,
200
- # MAX($1.engagement) AS engagements,
201
- # MAX($1.click_thru) AS click_thrus;
202
- session_category_by_user_id = sessions.group(:date, :ad_id, :api_key, (name == 'all' ? 'all' : name), :parallel => '$PARALLELISM')
203
- session_category_by_user_id = session_category_by_user_id.foreach do |r|
204
- r[0].date.as(:date),
205
- r[0].ad_id.as(:ad_id),
206
- r[0].api_key.as(:api_key),
207
- name.as(:category),
208
- (name == 'all' ? "'all'" : r[0].name).as(:segment)
209
- 1.as(:exposures),
210
- r[1].impression.max.as(:impressions),
211
- r[1].engagement.max.as(:engagements),
212
- r[1].click_thru.max.as(:click_thrus)
213
- end
214
-
215
- # unique_session_category_<%= name %> =
216
- # FOREACH
217
- # (GROUP session_category_<%= name %>_by_user_id BY (date, ad_id, api_key, category, segment) PARALLEL $PARALLELISM)
218
- # GENERATE
219
- # $0.date AS date,
220
- # $0.ad_id AS ad_id,
221
- # $0.api_key AS api_key,
222
- # $0.category,
223
- # $0.segment,
224
- # COUNT($1.ad_id) AS unique_exposures,
225
- # SUM($1.impressions) AS unique_impressions,
226
- # SUM($1.engagements) AS unique_engagements,
227
- # SUM($1.click_thrus) AS unique_click_thrus;
228
- #
229
- # action_category_<%= name %> =
230
- # FOREACH
231
- # (GROUP actions BY (date, ad_id, api_key, action, <%= name == 'all' ? "'all'" : name %>) PARALLEL $PARALLELISM)
232
- # GENERATE
233
- # $0.date AS date,
234
- # $0.ad_id AS ad_id,
235
- # $0.api_key AS api_key,
236
- # $0.action AS action,
237
- # '<%= name %>' AS category,
238
- # <%= name == 'all' ? "'all'" : "$0.#{name}" %> AS segment,
239
- # SUM($1.exposure) AS engagements;
240
- #
241
- # action_category_<%= name %>_by_user_id =
242
- # FOREACH
243
- # (GROUP actions BY (date, ad_id, api_key, action, user_id, <%= name == 'all' ? "'all'" : name %>) PARALLEL $PARALLELISM)
244
- # GENERATE
245
- # $0.date AS date,
246
- # $0.ad_id AS ad_id,
247
- # $0.api_key AS api_key,
248
- # $0.action AS action,
249
- # '<%= name %>' AS category,
250
- # <%= name == 'all' ? "'all'" : "$0.#{name}" %> AS segment,
251
- # 1 AS exposures,
252
- # 1 AS engagements;
253
- #
254
- # unique_action_category_<%= name %> =
255
- # FOREACH
256
- # (GROUP action_category_<%= name %>_by_user_id BY (date, ad_id, api_key, action, category, segment) PARALLEL $PARALLELISM)
257
- # GENERATE
258
- # $0.date AS date,
259
- # $0.ad_id AS ad_id,
260
- # $0.api_key AS api_key,
261
- # $0.action AS action,
262
- # $0.category,
263
- # $0.segment,
264
- # SUM($1.engagements) AS unique_engagements;
265
- end
266
-
267
- -- unions ----------------------------------------------------------------------
268
- -- -----------------------------------------------------------------------------
269
-
270
- <% if @categories.size > 1 -%>
271
- report_metrics =
272
- UNION
273
- <%= @categories.map { |name| "session_category_#{name}" }.join(",\n ") %>;
274
- <% else -%>
275
- report_metrics = FILTER session_category_<%= @categories.first %> BY 1 == 1;
276
- <% end -%>
277
-
278
- <% if @categories.size > 1 -%>
279
- unique_report_metrics =
280
- UNION
281
- <%= @categories.map { |name| "unique_session_category_#{name}" }.join(",\n ") %>;
282
- <% else -%>
283
- unique_report_metrics = FILTER unique_session_category_<%= @categories.first %> BY 1 == 1;
284
- <% end -%>
285
- <% if @categories.size > 1 -%>
286
- report_action_metrics =
287
- UNION
288
- <%= @categories.map { |name| "action_category_#{name}" }.join(",\n ") %>;
289
- <% else -%>
290
- report_action_metrics = FILTER action_category_<%= @categories.first %> BY 1 == 1;
291
- <% end -%>
292
- <% if @categories.size > 1 -%>
293
- unique_report_action_metrics =
294
- UNION
295
- <%= @categories.map { |name| "unique_action_category_#{name}" }.join(",\n ") %>;
296
- <% else -%>
297
- unique_report_action_metrics = FILTER unique_action_category_<%= @categories.first %> BY 1 == 1;
298
- <% end %>
299
-
300
- -- complete output -------------------------------------------------------------
301
- -- -----------------------------------------------------------------------------
302
-
303
-
304
- <% %w(report_metrics unique_report_metrics report_action_metrics unique_report_action_metrics).each do |relation| -%>
305
- <%= relation %> = FILTER <%= relation %> BY date is not null AND date != '' AND api_key is not null AND api_key != '';
306
- <% end -%>
307
-
308
- STORE report_metrics INTO '$OUTPUT/report_metrics' USING PigStorage;
309
- STORE unique_report_metrics INTO '$OUTPUT/unique_report_metrics' USING PigStorage;
310
- STORE report_action_metrics INTO '$OUTPUT/report_action_metrics' USING PigStorage;
311
- STORE unique_report_action_metrics INTO '$OUTPUT/unique_report_action_metrics' USING PigStorage;
data/examples/scratch.rb DELETED
@@ -1,11 +0,0 @@
1
- module Piglet::Relation
2
- def samples(*sizes)
3
- sizes.map { |s| sample(s) }
4
- end
5
- end
6
-
7
- input = load('input', :schema => %w(country browser site visit_duration))
8
- a, b, c = input.samples(0.1, 0.2, 0.3)
9
- store(a, 'output1')
10
- store(b, 'output2')
11
- store(c, 'output3')
data/examples/test1.rb DELETED
@@ -1,3 +0,0 @@
1
- raw_data = load 'test1-data.txt', :schema => %w(name city country)
2
- grouped_by_country = raw_data.group :country
3
- dump grouped_by_country
data/examples/test2.rb DELETED
@@ -1,5 +0,0 @@
1
- a = load('in', :schema => %w(x y z w))
2
- %w(x y z w).each do |f|
3
- r = a.group(f)
4
- store(r, 'out-' + f)
5
- end
data/examples/test3.rb DELETED
@@ -1,4 +0,0 @@
1
- a = load 'input', :schema => [:a, :b, :c]
2
- b = a.group :c
3
- c = b.foreach { |r| [r[0], r[1].a.max, r[1].b.max] }
4
- store c, 'output'