mrtoolkit 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +6 -0
- data/Makefile +6 -0
- data/README.rdoc +19 -0
- data/Rakefile +57 -0
- data/VERSION.yml +4 -0
- data/examples/Rakefile +80 -0
- data/examples/Readme +12 -0
- data/examples/hour.rb +57 -0
- data/examples/import-logs +14 -0
- data/examples/import.rb +22 -0
- data/examples/ip-result.rb +33 -0
- data/examples/ip-size.rb +33 -0
- data/examples/ip-ua.rb +36 -0
- data/examples/ip.rb +10 -0
- data/examples/section.rb +37 -0
- data/examples/top-file.rb +36 -0
- data/lib/mrtoolkit.rb +908 -0
- data/lib/regression.rb +33 -0
- data/lib/stream_runner.rb +100 -0
- data/mrtoolkit.gemspec +79 -0
- data/standalone/hadoop +104 -0
- data/test/Rakefile +21 -0
- data/test/test-in/test1-in +2 -0
- data/test/test-in/test2-in +4 -0
- data/test/test-in/test3-in +5 -0
- data/test/test-in/test4-in +6 -0
- data/test/test-in/test5-in +12 -0
- data/test/test-in/test6-in +3 -0
- data/test/test-in/test7-in +20 -0
- data/test/test-in/test8-in +12 -0
- data/test/test-in/test9-in +6 -0
- data/test/utest.rb +471 -0
- metadata +104 -0
data/test/utest.rb
ADDED
@@ -0,0 +1,471 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'mrtoolkit'
|
3
|
+
require 'regression'
|
4
|
+
|
5
|
+
##############################
|
6
|
+
# Log example.
|
7
|
+
#
|
8
|
+
# Reformats the date and time into one field.
|
9
|
+
# Reducer adds an extra column
|
10
|
+
|
11
|
+
# Switch the toolkit into testing mode before any of the jobs below are run.
# NOTE(review): the exact effect of testing(true) is defined in mrtoolkit.rb — confirm there.
JobBase.testing(true)
|
12
|
+
|
13
|
+
# Mapper for the log example.
#
# Reads records with the columns (date, time, url) and emits
# (date_time, url), where date_time is the date and time joined by "T".
class LogMap < MapBase
  # Declares the input columns and the emitted columns, in order.
  def declare
    [:date, :time, :url].each { |column| field column }
    [:date_time, :url].each { |column| emit column }
  end

  # Fills +output+ from a single +input+ record and returns it.
  def process(input, output)
    stamp = input.date + "T" + input.time
    output.date_time = stamp
    output.url = input.url
    output
  end
end
|
30
|
+
|
31
|
+
# Reducer for the log example.
#
# Takes (date_time, url) records and emits them with one extra column,
# junk, which is always the string "x".
class LogReduce < ReduceBase
  # Declares the input columns and the emitted columns, in order.
  def declare
    [:date_time, :url].each { |column| field column }
    [:date_time, :url, :junk].each { |column| emit column }
  end

  # Returns the record produced by copy_struct with its junk column set.
  # NOTE(review): copy_struct is defined outside this file; presumably it
  # copies the input columns into the output record — confirm.
  def process(input, output)
    record = copy_struct(input, output)
    record.junk = "x"
    record
  end
end
|
47
|
+
|
48
|
+
# Job for the log example: LogMap feeding LogReduce, reading
# test-in/test1-in and writing test-out.
class LogJob < JobBase
  # Wires up the stages and the input/output files.
  def job
    mapper(LogMap)
    reducer(LogReduce)
    infiles("test-in/test1-in")
    outfiles("test-out")
  end
end
|
56
|
+
|
57
|
+
class TestMRToolkit < Test::Unit::TestCase
  # Runs LogJob and checks that each output row carries the combined
  # date/time, the url column, and the extra "x" column.
  def test_log
    LogJob.run_command
    actual = File.read("test-out")
    wanted = "2008-10-01T10:30:00\t1.2.3.4\tx\n" \
             "2008-10-02T11:30:00\t1.2.3.5\tx\n"
    assert_equal(wanted, actual)
  end
end
|
66
|
+
|
67
|
+
|
68
|
+
##########################################
|
69
|
+
#
|
70
|
+
# Computes count, total, and sum of squares.
|
71
|
+
|
72
|
+
# Mapper that turns each input value into a (count, total, sum_of_squares)
# triple: 1, the value as a float, and the value squared.
class SumMap < MapBase
  # Declares the single input column and the three emitted columns.
  def declare
    field :value

    [:count, :total, :sum_of_squares].each { |column| emit column }
  end

  # Fills +output+ from a single +input+ record and returns it.
  def process(input, output)
    number = input.value.to_f
    output.count = 1
    output.total = number
    output.sum_of_squares = number * number
    output
  end
end
|
90
|
+
|
91
|
+
# This could be done with canned reducer
|
92
|
+
# Hand-written reducer that adds up the count, total and sum_of_squares
# columns over all of its input and emits a single record with the sums.
class MySumReduce < ReduceBase
  # Declares the input columns and the emitted columns, in order.
  def declare
    [:count, :total, :sum_of_squares].each { |column| field column }
    [:count, :total, :sum_of_squares].each { |column| emit column }
  end

  # Resets the running sums; returns nil, so no record comes from this step.
  def process_begin(dummy, output)
    @count = @total = @sum_of_squares = 0
    nil
  end

  # Adds one record (converted with to_f) to the running sums; returns nil.
  def process(input, output)
    @count          += input.count.to_f
    @total          += input.total.to_f
    @sum_of_squares += input.sum_of_squares.to_f
    nil
  end

  # Writes the accumulated sums into +output+ and returns it.
  def process_end(dummy, output)
    output.count = @count
    output.total = @total
    output.sum_of_squares = @sum_of_squares
    output
  end
end
|
122
|
+
|
123
|
+
# Job that runs SumMap into MySumReduce over test-in/test2-in and writes
# the result to test-out.
class SumJob < JobBase
  # Wires up the stages and the input/output files.
  def job
    mapper(SumMap)
    reducer(MySumReduce)
    infiles("test-in/test2-in")
    outfiles("test-out")
  end
end
|
131
|
+
|
132
|
+
class TestMRToolkit < Test::Unit::TestCase
  # Runs SumJob and expects a single row holding count, total and
  # sum of squares.
  def test_sum
    SumJob.run_command
    actual = File.read("test-out")
    assert_equal("4.0\t43.0\t1005.0\n", actual)
  end
end
|
140
|
+
|
141
|
+
|
142
|
+
######################################
|
143
|
+
#
|
144
|
+
# Groups times into one-minute buckets
|
145
|
+
# Calculates counts for each bucket
|
146
|
+
|
147
|
+
require 'parsedate'
|
148
|
+
|
149
|
+
# Mapper that converts a (dt, tm) date/time pair into a minute-of-week
# bucket and emits (minute, 1).
#
# NOTE(review): ParseDate was removed from the standard library in Ruby 1.9;
# on newer Rubies this would need Time.parse (require 'time') — confirm the
# target Ruby version.
class MinMap < MapBase
  # Declares the input columns and the emitted columns, in order.
  def declare
    [:dt, :tm].each { |column| field column }
    [:minute, :count].each { |column| emit column }
  end

  # Parses the record's date and time as a local time and stores the bucket
  # minute + 60 * (hour + 24 * weekday) together with a count of 1.
  def process(input, output)
    stamp = input.dt + " " + input.tm
    parts = ParseDate.parsedate(stamp)
    moment = Time.local(*parts)
    bucket = moment.min + 60 * (moment.hour + 24 * moment.wday)
    output.count = 1
    output.minute = bucket
    output
  end
end
|
168
|
+
|
169
|
+
# Hand-written reducer that counts records per minute bucket and emits
# (min, count).
#
# NOTE(review): @last is never assigned in this class; presumably ReduceBase
# keeps it as the key of the run being processed — confirm in mrtoolkit.rb.
class MyMinReduce < ReduceBase
  # Declares the input columns and the emitted columns, in order.
  def declare
    [:minute, :count].each { |column| field column }
    [:min, :count].each { |column| emit column }
  end

  # Resets the counter; returns nil, so no record comes from this step.
  def process_init(input, output)
    @count = 0
    nil
  end

  # Counts one record; returns nil.
  def process_each(input, output)
    @count += 1
    nil
  end

  # Writes @last and the accumulated count into +output+ and returns it.
  def process_term(input, output)
    output.min = @last
    output.count = @count
    output
  end
end
|
192
|
+
|
193
|
+
# Job that runs MinMap into the hand-written MyMinReduce over
# test-in/test3-in and writes the result to test-out.
class MyMinJob < JobBase
  # Wires up the stages and the input/output files.
  def job
    mapper(MinMap)
    reducer(MyMinReduce)
    infiles("test-in/test3-in")
    outfiles("test-out")
  end
end
|
201
|
+
|
202
|
+
class TestMRToolkit < Test::Unit::TestCase
  # Runs MyMinJob and expects one row per minute bucket with its count.
  #
  # Named test_my_min rather than test_min: a later reopening of this class
  # defines test_min for MinJob, and a second method with the same name
  # replaces the first, which kept this test from ever running.
  def test_my_min
    MyMinJob.run_command
    out = File.read("test-out")
    expected = "8460\t1\n" \
               "8461\t1\n" \
               "8470\t3\n"
    assert_equal(expected, out)
  end
end
|
212
|
+
|
213
|
+
#################################
|
214
|
+
#
|
215
|
+
# This is the previous one, but with a standard reducer.
|
216
|
+
|
217
|
+
# Same input and mapper as MyMinJob, but using the toolkit's CopyReduce
# instead of a hand-written reducer.
# NOTE(review): the trailing 1 is handed to CopyReduce; its meaning is
# defined in mrtoolkit.rb (presumably a column count) — confirm.
class CollectJob < JobBase
  # Wires up the stages and the input/output files.
  def job
    mapper(MinMap)
    reducer(CopyReduce, 1)
    infiles("test-in/test3-in")
    outfiles("test-out")
  end
end
|
225
|
+
|
226
|
+
class TestMRToolkit < Test::Unit::TestCase
  # Runs CollectJob and expects every minute bucket to appear once per
  # input record, duplicates included.
  def test_collect
    CollectJob.run_command
    actual = File.read("test-out")
    wanted = "8460\n" \
             "8461\n" \
             "8470\n" \
             "8470\n" \
             "8470\n"
    assert_equal(wanted, actual)
  end
end
|
238
|
+
|
239
|
+
#################################
|
240
|
+
#
|
241
|
+
# This is the previous one, but with a different
|
242
|
+
# standard reducer. This produces the same output
|
243
|
+
# as the custom reducer.
|
244
|
+
|
245
|
+
# Same input and mapper as MyMinJob, but using the toolkit's UniqueReduce
# as the reducer.
class UniqueJob < JobBase
  # Wires up the stages and the input/output files.
  def job
    mapper(MinMap)
    reducer(UniqueReduce)
    infiles("test-in/test3-in")
    outfiles("test-out")
  end
end
|
253
|
+
|
254
|
+
class TestMRToolkit < Test::Unit::TestCase
  # Runs UniqueJob and expects each minute bucket to appear exactly once.
  def test_unique
    UniqueJob.run_command
    actual = File.read("test-out")
    wanted = "8460\n" \
             "8461\n" \
             "8470\n"
    assert_equal(wanted, actual)
  end
end
|
264
|
+
|
265
|
+
###############################
|
266
|
+
#
|
267
|
+
# Exercises SumReduce, which sums a variable
|
268
|
+
# set of columns.
|
269
|
+
|
270
|
+
|
271
|
+
# Exercises the toolkit's SumReduce: CopyMap feeding SumReduce over
# test-in/test6-in, writing to test-out.
# NOTE(review): the trailing 3s are handed to CopyMap and SumReduce;
# presumably the number of columns handled — confirm in mrtoolkit.rb.
class GSumJob < JobBase
  # Wires up the stages and the input/output files.
  def job
    mapper(CopyMap, 3)
    reducer(SumReduce, 3)
    infiles("test-in/test6-in")
    outfiles("test-out")
  end
end
|
279
|
+
|
280
|
+
class TestMRToolkit < Test::Unit::TestCase
  # Runs GSumJob and expects a single row with three summed columns.
  def test_gsum
    GSumJob.run_command
    actual = File.read("test-out")
    assert_equal("12.0\t9.0\t8.0\n", actual)
  end
end
|
288
|
+
|
289
|
+
# Exercises the toolkit's SelectMap with the pattern /^10[23]/, followed by
# CopyReduce, over test-in/test5-in; output goes to test-out.
class SelectJob < JobBase
  # Wires up the stages and the input/output files.
  def job
    mapper(SelectMap, /^10[23]/)
    reducer(CopyReduce)
    infiles("test-in/test5-in")
    outfiles("test-out")
  end
end
|
297
|
+
|
298
|
+
class TestMRToolkit < Test::Unit::TestCase
  # Runs SelectJob and expects only the rows starting with 102 or 103.
  def test_select
    SelectJob.run_command
    actual = File.read("test-out")
    assert_equal("102\n102\n102\n102\n103\n", actual)
  end
end
|
306
|
+
|
307
|
+
# Exercises the toolkit's SampleReduce: CopyMap feeding SampleReduce over
# test-in/test7-in, writing to test-out.
# NOTE(review): the 10 handed to SampleReduce is presumably the sample
# size (the matching test expects ten rows) — confirm in mrtoolkit.rb.
class SampleJob < JobBase
  # Wires up the stages and the input/output files.
  def job
    mapper(CopyMap, 3)
    reducer(SampleReduce, 10)
    infiles("test-in/test7-in")
    outfiles("test-out")
  end
end
|
315
|
+
|
316
|
+
class TestMRToolkit < Test::Unit::TestCase
  # Runs SampleJob with a fixed seed and expects a specific ten-row sample.
  def test_sample
    # Seed the global random number generator so the sample is repeatable
    # (presumably SampleReduce draws from Kernel#rand — confirm).
    srand(1234)
    SampleJob.run_command
    actual = File.read("test-out")
    assert_equal("5\n20\n7\n12\n2\n8\n3\n16\n17\n18\n", actual)
  end
end
|
325
|
+
|
326
|
+
# Exercises the toolkit's MaxReduce: CopyMap feeding MaxReduce over
# test-in/test4-in, writing to test-out.
# NOTE(review): the 3 handed to MaxReduce is presumably the number of rows
# kept (the matching test expects three) — confirm in mrtoolkit.rb.
class MaxJob < JobBase
  # Wires up the stages and the input/output files.
  def job
    mapper(CopyMap, 3)
    reducer(MaxReduce, 3)
    infiles("test-in/test4-in")
    outfiles("test-out")
  end
end
|
334
|
+
|
335
|
+
class TestMRToolkit < Test::Unit::TestCase
  # Runs MaxJob and expects the three output rows below.
  def test_max
    MaxJob.run_command
    actual = File.read("test-out")
    assert_equal("4\t10\n3\t3\n2\t2\n", actual)
  end
end
|
343
|
+
|
344
|
+
# Exercises the toolkit's MinReduce: CopyMap feeding MinReduce over
# test-in/test4-in, writing to test-out.
# NOTE(review): the 3 handed to MinReduce is presumably the number of rows
# kept (the matching test expects three) — confirm in mrtoolkit.rb.
class MinJob < JobBase
  # Wires up the stages and the input/output files.
  def job
    mapper(CopyMap, 3)
    reducer(MinReduce, 3)
    infiles("test-in/test4-in")
    outfiles("test-out")
  end
end
|
352
|
+
|
353
|
+
class TestMRToolkit < Test::Unit::TestCase
  # Runs MinJob and expects the three output rows below.
  def test_min
    MinJob.run_command
    actual = File.read("test-out")
    assert_equal("3\t3\n2\t2\n1\t1\n", actual)
  end
end
|
361
|
+
|
362
|
+
# Exercises the toolkit's UniqueSumReduce: CopyMap feeding UniqueSumReduce
# over test-in/test5-in, writing to test-out.
class UniqueSumJob < JobBase
  # Wires up the stages and the input/output files.
  def job
    mapper(CopyMap, 2)
    reducer(UniqueSumReduce)
    infiles("test-in/test5-in")
    outfiles("test-out")
  end
end
|
370
|
+
|
371
|
+
class TestMRToolkit < Test::Unit::TestCase
  # Runs UniqueSumJob and expects one row per distinct key with its sum.
  def test_unique_sum
    UniqueSumJob.run_command
    actual = File.read("test-out")
    assert_equal("100\t3\n101\t2\n102\t4\n103\t1\n104\t2\n", actual)
  end
end
|
379
|
+
|
380
|
+
# Exercises the toolkit's UniqueCountReduce: CopyMap feeding
# UniqueCountReduce over test-in/test5-in, writing to test-out.
class UniqueCountJob < JobBase
  # Wires up the stages and the input/output files.
  def job
    mapper(CopyMap)
    reducer(UniqueCountReduce)
    infiles("test-in/test5-in")
    outfiles("test-out")
  end
end
|
388
|
+
|
389
|
+
class TestMRToolkit < Test::Unit::TestCase
  # Runs UniqueCountJob and expects one row per distinct key with its count.
  def test_unique_count
    UniqueCountJob.run_command
    actual = File.read("test-out")
    assert_equal("100\t3\n101\t2\n102\t4\n103\t1\n104\t2\n", actual)
  end
end
|
397
|
+
|
398
|
+
# Exercises the toolkit's MaxUniqueSumReduce: CopyMap feeding
# MaxUniqueSumReduce over test-in/test5-in, writing to test-out.
# NOTE(review): the 3 handed to MaxUniqueSumReduce is presumably the number
# of rows kept (the matching test expects three) — confirm in mrtoolkit.rb.
class MaxUniqueSumJob < JobBase
  # Wires up the stages and the input/output files.
  def job
    mapper(CopyMap, 3)
    reducer(MaxUniqueSumReduce, 3)
    infiles("test-in/test5-in")
    outfiles("test-out")
  end
end
|
406
|
+
|
407
|
+
class TestMRToolkit < Test::Unit::TestCase
  # Runs MaxUniqueSumJob and expects the three output rows below.
  def test_max_unique_sum
    MaxUniqueSumJob.run_command
    actual = File.read("test-out")
    assert_equal("102\t4\n100\t3\n101\t2\n", actual)
  end
end
|
415
|
+
|
416
|
+
# Exercises the toolkit's UniqueIndexedSumReduce: CopyMap feeding
# UniqueIndexedSumReduce over test-in/test8-in, writing to test-out.
class UniqueIndexedSumJob < JobBase
  # Wires up the stages and the input/output files.
  def job
    mapper(CopyMap, 3)
    reducer(UniqueIndexedSumReduce, 3)
    infiles("test-in/test8-in")
    outfiles("test-out")
  end
end
|
424
|
+
|
425
|
+
class TestMRToolkit < Test::Unit::TestCase
  # Runs UniqueIndexedSumJob and expects one row per distinct pair of
  # leading columns, followed by a third numeric column.
  def test_unique_indexed_sum
    UniqueIndexedSumJob.run_command
    actual = File.read("test-out")
    assert_equal("100\t1000\t3\n100\t1001\t1\n200\t1000\t2\n200\t1001\t1\n", actual)
  end
end
|
433
|
+
|
434
|
+
# Exercises the toolkit's UniqueFirstReduce: CopyMap feeding
# UniqueFirstReduce over test-in/test9-in, writing to test-out.
# NOTE(review): the meaning of the 3 and 1 handed to UniqueFirstReduce is
# defined in mrtoolkit.rb — confirm there.
class UniqueFirstJob < JobBase
  # Wires up the stages and the input/output files.
  def job
    mapper(CopyMap, 4)
    reducer(UniqueFirstReduce, 3, 1)
    infiles("test-in/test9-in")
    outfiles("test-out")
  end
end
|
442
|
+
|
443
|
+
class TestMRToolkit < Test::Unit::TestCase
  # Runs UniqueFirstJob and expects the two three-column rows below.
  def test_unique_first
    UniqueFirstJob.run_command
    actual = File.read("test-out")
    assert_equal("a\ta\ta\nx1\ty1\tz1\n", actual)
  end
end
|
451
|
+
|
452
|
+
|
453
|
+
# Checks LinearRegression (from the regression library) on small
# integer data sets.
class TestRegression < Test::Unit::TestCase
  def test_regress
    # Points on the line y = x: fitting the inputs reproduces them.
    xs = [1, 2, 3]
    ys = [1, 2, 3]
    model = LinearRegression.new(xs, ys)
    assert_equal([1, 2, 3], model.fit(xs))

    # Four points that are not on a straight line.
    xs = [1, 2, 3, 4]
    ys = [1, 5, 5, 9]
    model = LinearRegression.new(xs, ys)
    assert_equal(2, model.slope)
    assert_equal(0, model.offset)

    # Same data again, built into a fresh model.
    ys = [1, 5, 5, 9]
    model = LinearRegression.new(xs, ys)
    assert_equal(2, model.slope)
    assert_equal(0, model.offset)
  end
end
|
470
|
+
|
471
|
+
|
metadata
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mrtoolkit
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
- 2
|
9
|
+
version: 0.1.2
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- cchayden
|
13
|
+
- vadimj
|
14
|
+
- jashmenn
|
15
|
+
autorequire:
|
16
|
+
bindir: bin
|
17
|
+
cert_chain: []
|
18
|
+
|
19
|
+
date: 2010-05-17 00:00:00 -07:00
|
20
|
+
default_executable:
|
21
|
+
dependencies: []
|
22
|
+
|
23
|
+
description:
|
24
|
+
email: nate@natemurray.com
|
25
|
+
executables: []
|
26
|
+
|
27
|
+
extensions: []
|
28
|
+
|
29
|
+
extra_rdoc_files:
|
30
|
+
- README.rdoc
|
31
|
+
files:
|
32
|
+
- .document
|
33
|
+
- .gitignore
|
34
|
+
- Makefile
|
35
|
+
- README.rdoc
|
36
|
+
- Rakefile
|
37
|
+
- VERSION.yml
|
38
|
+
- examples/Rakefile
|
39
|
+
- examples/Readme
|
40
|
+
- examples/hour.rb
|
41
|
+
- examples/import-logs
|
42
|
+
- examples/import.rb
|
43
|
+
- examples/ip-result.rb
|
44
|
+
- examples/ip-size.rb
|
45
|
+
- examples/ip-ua.rb
|
46
|
+
- examples/ip.rb
|
47
|
+
- examples/section.rb
|
48
|
+
- examples/top-file.rb
|
49
|
+
- lib/mrtoolkit.rb
|
50
|
+
- lib/regression.rb
|
51
|
+
- lib/stream_runner.rb
|
52
|
+
- mrtoolkit.gemspec
|
53
|
+
- standalone/hadoop
|
54
|
+
- test/Rakefile
|
55
|
+
- test/test-in/test1-in
|
56
|
+
- test/test-in/test2-in
|
57
|
+
- test/test-in/test3-in
|
58
|
+
- test/test-in/test4-in
|
59
|
+
- test/test-in/test5-in
|
60
|
+
- test/test-in/test6-in
|
61
|
+
- test/test-in/test7-in
|
62
|
+
- test/test-in/test8-in
|
63
|
+
- test/test-in/test9-in
|
64
|
+
- test/utest.rb
|
65
|
+
has_rdoc: true
|
66
|
+
homepage: http://github.com/jashmenn/mrtoolkit
|
67
|
+
licenses: []
|
68
|
+
|
69
|
+
post_install_message:
|
70
|
+
rdoc_options:
|
71
|
+
- --charset=UTF-8
|
72
|
+
require_paths:
|
73
|
+
- lib
|
74
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
75
|
+
requirements:
|
76
|
+
- - ">="
|
77
|
+
- !ruby/object:Gem::Version
|
78
|
+
segments:
|
79
|
+
- 0
|
80
|
+
version: "0"
|
81
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
segments:
|
86
|
+
- 0
|
87
|
+
version: "0"
|
88
|
+
requirements: []
|
89
|
+
|
90
|
+
rubyforge_project:
|
91
|
+
rubygems_version: 1.3.6
|
92
|
+
signing_key:
|
93
|
+
specification_version: 3
|
94
|
+
summary: Simplify the creation of Hadoop Map/Reduce jobs
|
95
|
+
test_files:
|
96
|
+
- test/utest.rb
|
97
|
+
- examples/hour.rb
|
98
|
+
- examples/import.rb
|
99
|
+
- examples/ip-result.rb
|
100
|
+
- examples/ip-size.rb
|
101
|
+
- examples/ip-ua.rb
|
102
|
+
- examples/ip.rb
|
103
|
+
- examples/section.rb
|
104
|
+
- examples/top-file.rb
|