mrtoolkit 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/test/utest.rb ADDED
@@ -0,0 +1,471 @@
1
+ require 'test/unit'
2
+ require 'mrtoolkit'
3
+ require 'regression'
4
+
5
+ ##############################
6
+ # Log example.
7
+ #
8
+ # Reformats the date and time into one field.
9
+ # Reducer adds an extra column
10
+
11
+ JobBase.testing(true)
12
+
13
+ class LogMap < MapBase
14
+ def declare
15
+ field :date
16
+ field :time
17
+ field :url
18
+
19
+ emit :date_time
20
+ emit :url
21
+ end
22
+
23
+ def process(input, output)
24
+ output.date_time = input.date + "T" + input.time
25
+ output.url = input.url
26
+ output
27
+ end
28
+
29
+ end
30
+
31
+ class LogReduce < ReduceBase
32
+ def declare
33
+ field :date_time
34
+ field :url
35
+
36
+ emit :date_time
37
+ emit :url
38
+ emit :junk
39
+ end
40
+
41
+ def process(input, output)
42
+ output = copy_struct input, output
43
+ output.junk = "x"
44
+ output
45
+ end
46
+ end
47
+
48
+ class LogJob < JobBase
49
+ def job
50
+ mapper LogMap
51
+ reducer LogReduce
52
+ infiles "test-in/test1-in"
53
+ outfiles "test-out"
54
+ end
55
+ end
56
+
57
+ class TestMRToolkit < Test::Unit::TestCase
58
+ def test_log
59
+ LogJob.run_command
60
+ out = File.read("test-out")
61
+ expected = "2008-10-01T10:30:00\t1.2.3.4\tx\n" +
62
+ "2008-10-02T11:30:00\t1.2.3.5\tx\n"
63
+ assert_equal(expected, out)
64
+ end
65
+ end
66
+
67
+
68
+ ##########################################
69
+ #
70
+ # Computes count, total, and sum of squares.
71
+
72
+ class SumMap < MapBase
73
+ def declare
74
+ field :value
75
+
76
+ emit :count
77
+ emit :total
78
+ emit :sum_of_squares
79
+ end
80
+
81
+ def process(input, output)
82
+ v = input.value.to_f
83
+ output.count = 1
84
+ output.total = v
85
+ output.sum_of_squares = v * v
86
+ output
87
+ end
88
+
89
+ end
90
+
91
+ # This could be done with a canned reducer.
92
+ class MySumReduce < ReduceBase
93
+ def declare
94
+ field :count
95
+ field :total
96
+ field :sum_of_squares
97
+
98
+ emit :count
99
+ emit :total
100
+ emit :sum_of_squares
101
+ end
102
+
103
+ def process_begin(dummy, output)
104
+ @count = 0
105
+ @total = 0
106
+ @sum_of_squares = 0
107
+ nil
108
+ end
109
+ def process(input, output)
110
+ @count += input.count.to_f
111
+ @total += input.total.to_f
112
+ @sum_of_squares += input.sum_of_squares.to_f
113
+ nil
114
+ end
115
+ def process_end(dummy, output)
116
+ output.count = @count
117
+ output.total = @total
118
+ output.sum_of_squares = @sum_of_squares
119
+ output
120
+ end
121
+ end
122
+
123
+ class SumJob < JobBase
124
+ def job
125
+ mapper SumMap
126
+ reducer MySumReduce
127
+ infiles "test-in/test2-in"
128
+ outfiles "test-out"
129
+ end
130
+ end
131
+
132
+ class TestMRToolkit < Test::Unit::TestCase
133
+ def test_sum
134
+ SumJob.run_command
135
+ out = File.read("test-out")
136
+ expected = "4.0\t43.0\t1005.0\n"
137
+ assert_equal(expected, out)
138
+ end
139
+ end
140
+
141
+
142
+ ######################################
143
+ #
144
+ # Groups times into one-minute buckets
145
+ # Calculates counts for each bucket
146
+
147
+ require 'parsedate'
148
+
149
+ class MinMap < MapBase
150
+ def declare
151
+ field :dt
152
+ field :tm
153
+
154
+ emit :minute
155
+ emit :count
156
+ end
157
+
158
+ def process(input, output)
159
+ res = ParseDate.parsedate(input.dt + " " + input.tm)
160
+ t = Time.local(*res)
161
+ min = t.min + 60 * (t.hour + 24 * t.wday)
162
+ output.count = 1
163
+ output.minute = min
164
+ output
165
+ end
166
+
167
+ end
168
+
169
+ class MyMinReduce < ReduceBase
170
+ def declare
171
+ field :minute
172
+ field :count
173
+
174
+ emit :min
175
+ emit :count
176
+ end
177
+
178
+ def process_init(input, output)
179
+ @count = 0
180
+ nil
181
+ end
182
+ def process_each(input, output)
183
+ @count += 1
184
+ nil
185
+ end
186
+ def process_term(input, output)
187
+ output.min = @last
188
+ output.count = @count
189
+ output
190
+ end
191
+ end
192
+
193
+ class MyMinJob < JobBase
194
+ def job
195
+ mapper MinMap
196
+ reducer MyMinReduce
197
+ infiles "test-in/test3-in"
198
+ outfiles "test-out"
199
+ end
200
+ end
201
+
202
+ class TestMRToolkit < Test::Unit::TestCase
203
+ def test_min
204
+ MyMinJob.run_command
205
+ out = File.read("test-out")
206
+ expected = "8460\t1\n" +
207
+ "8461\t1\n" +
208
+ "8470\t3\n"
209
+ assert_equal(expected, out)
210
+ end
211
+ end
212
+
213
+ #################################
214
+ #
215
+ # This is the previous one, but with a standard reducer.
216
+
217
+ class CollectJob < JobBase
218
+ def job
219
+ mapper MinMap
220
+ reducer CopyReduce, 1
221
+ infiles "test-in/test3-in"
222
+ outfiles "test-out"
223
+ end
224
+ end
225
+
226
+ class TestMRToolkit < Test::Unit::TestCase
227
+ def test_collect
228
+ CollectJob.run_command
229
+ out = File.read("test-out")
230
+ expected = "8460\n" +
231
+ "8461\n" +
232
+ "8470\n" +
233
+ "8470\n" +
234
+ "8470\n"
235
+ assert_equal(expected, out)
236
+ end
237
+ end
238
+
239
+ #################################
240
+ #
241
+ # This is the previous one, but with a different
242
+ # standard reducer. This produces the same output
243
+ # as the custom reducer.
244
+
245
+ class UniqueJob < JobBase
246
+ def job
247
+ mapper MinMap
248
+ reducer UniqueReduce
249
+ infiles "test-in/test3-in"
250
+ outfiles "test-out"
251
+ end
252
+ end
253
+
254
+ class TestMRToolkit < Test::Unit::TestCase
255
+ def test_unique
256
+ UniqueJob.run_command
257
+ out = File.read("test-out")
258
+ expected = "8460\n" +
259
+ "8461\n" +
260
+ "8470\n"
261
+ assert_equal(expected, out)
262
+ end
263
+ end
264
+
265
+ ###############################
266
+ #
267
+ # Exercises SumReduce, which sums a variable
268
+ # set of columns.
269
+
270
+
271
+ class GSumJob < JobBase
272
+ def job
273
+ mapper CopyMap, 3
274
+ reducer SumReduce, 3
275
+ infiles "test-in/test6-in"
276
+ outfiles "test-out"
277
+ end
278
+ end
279
+
280
+ class TestMRToolkit < Test::Unit::TestCase
281
+ def test_gsum
282
+ GSumJob.run_command
283
+ out = File.read("test-out")
284
+ expected = "12.0\t9.0\t8.0\n"
285
+ assert_equal(expected, out)
286
+ end
287
+ end
288
+
289
+ class SelectJob < JobBase
290
+ def job
291
+ mapper SelectMap, /^10[23]/
292
+ reducer CopyReduce
293
+ infiles "test-in/test5-in"
294
+ outfiles "test-out"
295
+ end
296
+ end
297
+
298
+ class TestMRToolkit < Test::Unit::TestCase
299
+ def test_select
300
+ SelectJob.run_command
301
+ out = File.read("test-out")
302
+ expected = "102\n102\n102\n102\n103\n"
303
+ assert_equal(expected, out)
304
+ end
305
+ end
306
+
307
+ class SampleJob < JobBase
308
+ def job
309
+ mapper CopyMap, 3
310
+ reducer SampleReduce, 10
311
+ infiles "test-in/test7-in"
312
+ outfiles "test-out"
313
+ end
314
+ end
315
+
316
+ class TestMRToolkit < Test::Unit::TestCase
317
+ def test_sample
318
+ srand 1234
319
+ SampleJob.run_command
320
+ out = File.read("test-out")
321
+ expected = "5\n20\n7\n12\n2\n8\n3\n16\n17\n18\n"
322
+ assert_equal(expected, out)
323
+ end
324
+ end
325
+
326
+ class MaxJob < JobBase
327
+ def job
328
+ mapper CopyMap, 3
329
+ reducer MaxReduce, 3
330
+ infiles "test-in/test4-in"
331
+ outfiles "test-out"
332
+ end
333
+ end
334
+
335
+ class TestMRToolkit < Test::Unit::TestCase
336
+ def test_max
337
+ MaxJob.run_command
338
+ out = File.read("test-out")
339
+ expected = "4\t10\n3\t3\n2\t2\n"
340
+ assert_equal(expected, out)
341
+ end
342
+ end
343
+
344
+ class MinJob < JobBase
345
+ def job
346
+ mapper CopyMap, 3
347
+ reducer MinReduce, 3
348
+ infiles "test-in/test4-in"
349
+ outfiles "test-out"
350
+ end
351
+ end
352
+
353
+ class TestMRToolkit < Test::Unit::TestCase
354
+ def test_min
355
+ MinJob.run_command
356
+ out = File.read("test-out")
357
+ expected = "3\t3\n2\t2\n1\t1\n"
358
+ assert_equal(expected, out)
359
+ end
360
+ end
361
+
362
+ class UniqueSumJob < JobBase
363
+ def job
364
+ mapper CopyMap, 2
365
+ reducer UniqueSumReduce
366
+ infiles "test-in/test5-in"
367
+ outfiles "test-out"
368
+ end
369
+ end
370
+
371
+ class TestMRToolkit < Test::Unit::TestCase
372
+ def test_unique_sum
373
+ UniqueSumJob.run_command
374
+ out = File.read("test-out")
375
+ expected = "100\t3\n101\t2\n102\t4\n103\t1\n104\t2\n"
376
+ assert_equal(expected, out)
377
+ end
378
+ end
379
+
380
+ class UniqueCountJob < JobBase
381
+ def job
382
+ mapper CopyMap
383
+ reducer UniqueCountReduce
384
+ infiles "test-in/test5-in"
385
+ outfiles "test-out"
386
+ end
387
+ end
388
+
389
+ class TestMRToolkit < Test::Unit::TestCase
390
+ def test_unique_count
391
+ UniqueCountJob.run_command
392
+ out = File.read("test-out")
393
+ expected = "100\t3\n101\t2\n102\t4\n103\t1\n104\t2\n"
394
+ assert_equal(expected, out)
395
+ end
396
+ end
397
+
398
+ class MaxUniqueSumJob < JobBase
399
+ def job
400
+ mapper CopyMap, 3
401
+ reducer MaxUniqueSumReduce, 3
402
+ infiles "test-in/test5-in"
403
+ outfiles "test-out"
404
+ end
405
+ end
406
+
407
+ class TestMRToolkit < Test::Unit::TestCase
408
+ def test_max_unique_sum
409
+ MaxUniqueSumJob.run_command
410
+ out = File.read("test-out")
411
+ expected = "102\t4\n100\t3\n101\t2\n"
412
+ assert_equal(expected, out)
413
+ end
414
+ end
415
+
416
+ class UniqueIndexedSumJob < JobBase
417
+ def job
418
+ mapper CopyMap, 3
419
+ reducer UniqueIndexedSumReduce, 3
420
+ infiles "test-in/test8-in"
421
+ outfiles "test-out"
422
+ end
423
+ end
424
+
425
+ class TestMRToolkit < Test::Unit::TestCase
426
+ def test_unique_indexed_sum
427
+ UniqueIndexedSumJob.run_command
428
+ out = File.read("test-out")
429
+ expected = "100\t1000\t3\n100\t1001\t1\n200\t1000\t2\n200\t1001\t1\n"
430
+ assert_equal(expected, out)
431
+ end
432
+ end
433
+
434
+ class UniqueFirstJob < JobBase
435
+ def job
436
+ mapper CopyMap, 4
437
+ reducer UniqueFirstReduce, 3, 1
438
+ infiles "test-in/test9-in"
439
+ outfiles "test-out"
440
+ end
441
+ end
442
+
443
+ class TestMRToolkit < Test::Unit::TestCase
444
+ def test_unique_first
445
+ UniqueFirstJob.run_command
446
+ out = File.read("test-out")
447
+ expected = "a\ta\ta\nx1\ty1\tz1\n"
448
+ assert_equal(expected, out)
449
+ end
450
+ end
451
+
452
+
453
+ class TestRegression < Test::Unit::TestCase
454
+ def test_regress
455
+ x = [1, 2, 3]
456
+ y = [1, 2, 3]
457
+ reg = LinearRegression.new(x, y)
458
+ assert_equal([1, 2, 3], reg.fit(x))
459
+ x = [1, 2, 3, 4]
460
+ y = [1, 5, 5, 9]
461
+ reg = LinearRegression.new(x, y)
462
+ assert_equal(2, reg.slope)
463
+ assert_equal(0, reg.offset)
464
+ y = [1, 5, 5, 9]
465
+ reg = LinearRegression.new(x, y)
466
+ assert_equal(2, reg.slope)
467
+ assert_equal(0, reg.offset)
468
+ end
469
+ end
470
+
471
+
metadata ADDED
@@ -0,0 +1,104 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mrtoolkit
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 2
9
+ version: 0.1.2
10
+ platform: ruby
11
+ authors:
12
+ - cchayden
13
+ - vadimj
14
+ - jashmenn
15
+ autorequire:
16
+ bindir: bin
17
+ cert_chain: []
18
+
19
+ date: 2010-05-17 00:00:00 -07:00
20
+ default_executable:
21
+ dependencies: []
22
+
23
+ description:
24
+ email: nate@natemurray.com
25
+ executables: []
26
+
27
+ extensions: []
28
+
29
+ extra_rdoc_files:
30
+ - README.rdoc
31
+ files:
32
+ - .document
33
+ - .gitignore
34
+ - Makefile
35
+ - README.rdoc
36
+ - Rakefile
37
+ - VERSION.yml
38
+ - examples/Rakefile
39
+ - examples/Readme
40
+ - examples/hour.rb
41
+ - examples/import-logs
42
+ - examples/import.rb
43
+ - examples/ip-result.rb
44
+ - examples/ip-size.rb
45
+ - examples/ip-ua.rb
46
+ - examples/ip.rb
47
+ - examples/section.rb
48
+ - examples/top-file.rb
49
+ - lib/mrtoolkit.rb
50
+ - lib/regression.rb
51
+ - lib/stream_runner.rb
52
+ - mrtoolkit.gemspec
53
+ - standalone/hadoop
54
+ - test/Rakefile
55
+ - test/test-in/test1-in
56
+ - test/test-in/test2-in
57
+ - test/test-in/test3-in
58
+ - test/test-in/test4-in
59
+ - test/test-in/test5-in
60
+ - test/test-in/test6-in
61
+ - test/test-in/test7-in
62
+ - test/test-in/test8-in
63
+ - test/test-in/test9-in
64
+ - test/utest.rb
65
+ has_rdoc: true
66
+ homepage: http://github.com/jashmenn/mrtoolkit
67
+ licenses: []
68
+
69
+ post_install_message:
70
+ rdoc_options:
71
+ - --charset=UTF-8
72
+ require_paths:
73
+ - lib
74
+ required_ruby_version: !ruby/object:Gem::Requirement
75
+ requirements:
76
+ - - ">="
77
+ - !ruby/object:Gem::Version
78
+ segments:
79
+ - 0
80
+ version: "0"
81
+ required_rubygems_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ segments:
86
+ - 0
87
+ version: "0"
88
+ requirements: []
89
+
90
+ rubyforge_project:
91
+ rubygems_version: 1.3.6
92
+ signing_key:
93
+ specification_version: 3
94
+ summary: Simplify the creation of Hadoop Map/Reduce jobs
95
+ test_files:
96
+ - test/utest.rb
97
+ - examples/hour.rb
98
+ - examples/import.rb
99
+ - examples/ip-result.rb
100
+ - examples/ip-size.rb
101
+ - examples/ip-ua.rb
102
+ - examples/ip.rb
103
+ - examples/section.rb
104
+ - examples/top-file.rb