jmapreduce 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +70 -0
- data/bin/jmapreduce +62 -0
- data/examples/alice.txt +3736 -0
- data/examples/wordcount.rb +42 -0
- data/lib/jmapreduce/runner.rb +110 -0
- data/release/jmapreduce.jar +0 -0
- data/vendors/gson.jar +0 -0
- data/vendors/javassist.jar +0 -0
- data/vendors/msgpack.jar +0 -0
- metadata +86 -0
data/README.md
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
JMapReduce
|
2
|
+
==========
|
3
|
+
|
4
|
+
JMapReduce is a JRuby Map/Reduce framework built on top of the Hadoop distributed computing platform.
|
5
|
+
Inspired by [mandy](http://github.com/forward/mandy "Mandy") but runs the map/reduce jobs on the JVM.
|
6
|
+
|
7
|
+
Install
|
8
|
+
-------
|
9
|
+
|
10
|
+
gem install jmapreduce
|
11
|
+
|
12
|
+
Usage
|
13
|
+
-----
|
14
|
+
|
15
|
+
1. Run a Hadoop cluster on your machines and set the HADOOP_HOME environment variable.
|
16
|
+
2. Put files into your HDFS, e.g. test/inputs/file1
|
17
|
+
|
18
|
+
3. Now you can run 'jmapreduce' like below:
|
19
|
+
> $ jmapreduce examples/wordcount.rb test/inputs/file1 test/output
|
20
|
+
4. You can also chain map/reduce jobs, as in the example below. The output of one map/reduce job becomes the input of the next job.
|
21
|
+
|
22
|
+
5. For the full list of options, run:
|
23
|
+
> $ jmapreduce -h
|
24
|
+
|
25
|
+
Example
|
26
|
+
-------
|
27
|
+
|
28
|
+
import org.fingertap.jmapreduce.JMapReduce
|
29
|
+
|
30
|
+
JMapReduce.job 'Count' do
|
31
|
+
reduce_tasks 1
|
32
|
+
|
33
|
+
map do |key, value|
|
34
|
+
value.split.each do |word|
|
35
|
+
emit(word, 1)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
reduce do |key, values|
|
40
|
+
sum = 0
|
41
|
+
values.each {|v| sum += v.to_i }
|
42
|
+
emit(key, sum)
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
JMapReduce.job "Histogram" do
|
47
|
+
setup do
|
48
|
+
RANGES = [0..10, 11..20, 21..50, 51..100, 101..200, 201..300, 301..10_000, 10_001..99_999]
|
49
|
+
end
|
50
|
+
|
51
|
+
map do |word, count|
|
52
|
+
range = RANGES.find {|range| range.include?(count.to_i) }
|
53
|
+
emit("#{range.first.to_s}-#{range.last.to_s}", 1)
|
54
|
+
end
|
55
|
+
|
56
|
+
reduce do |range, counts|
|
57
|
+
total = counts.inject(0) {|sum,count| sum+count.to_i }
|
58
|
+
emit(range, '|'*(total/20))
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
Author
|
63
|
+
-------
|
64
|
+
|
65
|
+
Abhinay Mehta <abhinay.mehta@gmail.com>
|
66
|
+
|
67
|
+
Copyright
|
68
|
+
---------
|
69
|
+
|
70
|
+
License: Apache License
|
data/bin/jmapreduce
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'jruby-jars'
|
4
|
+
require 'optparse'
|
5
|
+
require 'ostruct'
|
6
|
+
require 'cgi'
|
7
|
+
|
8
|
+
require File.join(File.dirname(__FILE__), '..', 'lib', 'jmapreduce', 'runner')
|
9
|
+
|
10
|
+
options = OpenStruct.new
|
11
|
+
|
12
|
+
OptionParser.new do |opts|
|
13
|
+
opts.banner = "USAGE: jmapreduce script input output [options]"
|
14
|
+
|
15
|
+
opts.on("-c", "--conf HADOOP_CONF", "Use this cluster xml config file") do |config|
|
16
|
+
options.config = config
|
17
|
+
end
|
18
|
+
|
19
|
+
opts.on("-n", "--namenode namenode:port", "Specify a namenode") do |namenode|
|
20
|
+
options.namenode = namenode
|
21
|
+
end
|
22
|
+
|
23
|
+
opts.on("-t", "--jobtracker jobtracker:port", "Specify a job tracker") do |jobtracker|
|
24
|
+
options.jobtracker = jobtracker
|
25
|
+
end
|
26
|
+
|
27
|
+
opts.on("-l", "--libjars jar1,jar2", "comma-separated jar files to include in the classpath") do |libjars|
|
28
|
+
options.libjars = libjars
|
29
|
+
end
|
30
|
+
|
31
|
+
opts.on("-f", "--files file1,file2", "comma separated files to be copied to the map reduce cluster") do |files|
|
32
|
+
options.files = files
|
33
|
+
end
|
34
|
+
|
35
|
+
opts.on("-d", "--dirs dir1,dir2", "comma separated directories to be copied to the map reduce cluster") do |dirs|
|
36
|
+
options.dirs = dirs
|
37
|
+
end
|
38
|
+
|
39
|
+
opts.on("-v", '--variables k1=v1,k2=v2', "Pass additional parameters to jobs") do |params|
|
40
|
+
options.properties = params
|
41
|
+
end
|
42
|
+
|
43
|
+
opts.on("-j", '--json {"key":"1 value"}', "Pass JSON encoded parameters to jobs") do |json|
|
44
|
+
options.json = "json=#{CGI.escape(json)}"
|
45
|
+
end
|
46
|
+
|
47
|
+
opts.on_tail("-h", "--help", "Show this message") do
|
48
|
+
puts opts
|
49
|
+
exit
|
50
|
+
end
|
51
|
+
end.parse!
|
52
|
+
|
53
|
+
Runner.new(ARGV[0], ARGV[1], ARGV[2],
|
54
|
+
:conf => options.config,
|
55
|
+
:namenode => options.namenode,
|
56
|
+
:jobtracker => options.jobtracker,
|
57
|
+
:properties => options.properties,
|
58
|
+
:json => options.json,
|
59
|
+
:libjars => options.libjars,
|
60
|
+
:files => options.files,
|
61
|
+
:dirs => options.dirs
|
62
|
+
).run
|