jmapreduce 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +70 -0
- data/bin/jmapreduce +62 -0
- data/examples/alice.txt +3736 -0
- data/examples/wordcount.rb +42 -0
- data/lib/jmapreduce/runner.rb +110 -0
- data/release/jmapreduce.jar +0 -0
- data/vendors/gson.jar +0 -0
- data/vendors/javassist.jar +0 -0
- data/vendors/msgpack.jar +0 -0
- metadata +86 -0
data/README.md
ADDED
@@ -0,0 +1,70 @@
+JMapReduce
+==========
+
+JMapReduce is a JRuby Map/Reduce framework built on top of the Hadoop distributed computing platform.
+Inspired by [mandy](http://github.com/forward/mandy "Mandy"), but it runs the map/reduce jobs on the JVM.
+
+Install
+-------
+
+    gem install jmapreduce
+
+Usage
+-----
+
+1. Run a Hadoop cluster on your machines and set the HADOOP_HOME env variable.
+2. Put your files into HDFS, e.g. test/inputs/file1
+
+3. Now you can run 'jmapreduce' as shown below:
+> $ jmapreduce examples/wordcount.rb test/inputs/file1 test/output
+4. You can also chain map/reduce jobs as in the example below; the output of one map/reduce job becomes the input of the next job.
+
+5. For a full list of options, run:
+> $ jmapreduce -h
+
+Example
+-------
+
+    import org.fingertap.jmapreduce.JMapReduce
+
+    JMapReduce.job 'Count' do
+      reduce_tasks 1
+
+      map do |key, value|
+        value.split.each do |word|
+          emit(word, 1)
+        end
+      end
+
+      reduce do |key, values|
+        sum = 0
+        values.each {|v| sum += v.to_i }
+        emit(key, sum)
+      end
+    end
+
+    JMapReduce.job "Histogram" do
+      setup do
+        RANGES = [0..10, 11..20, 21..50, 51..100, 101..200, 201..300, 301..10_000, 10_001..99_999]
+      end
+
+      map do |word, count|
+        range = RANGES.find {|range| range.include?(count.to_i) }
+        emit("#{range.first.to_s}-#{range.last.to_s}", 1)
+      end
+
+      reduce do |range, counts|
+        total = counts.inject(0) {|sum, count| sum + count.to_i }
+        emit(range, '|'*(total/20))
+      end
+    end
+
+Author
+-------
+
+Abhinay Mehta <abhinay.mehta@gmail.com>
+
+Copyright
+---------
+
+License: Apache License
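The two DSL blocks in the 'Count' job above do nothing Hadoop-specific themselves. For readers who want to see what the job computes without a cluster, here is a plain-Ruby sketch that mirrors the same map/reduce logic outside the framework; the emit collector and output formatting are illustrative only, and the input path assumes the bundled examples/alice.txt:

```ruby
# Plain-Ruby illustration of the 'Count' job's logic (not part of the gem).
# emit(key, value) is imitated by appending values to a hash of arrays.
mapped = Hash.new { |h, k| h[k] = [] }

# "map" phase: one (word, 1) pair per word in each input line
File.foreach('examples/alice.txt') do |value|
  value.split.each { |word| mapped[word] << 1 }
end

# "reduce" phase: sum the values collected for each word
counts = {}
mapped.each do |word, values|
  sum = 0
  values.each { |v| sum += v.to_i }
  counts[word] = sum
end

# Print the ten most frequent words
counts.sort_by { |_, c| -c }.first(10).each { |word, c| puts "#{word}\t#{c}" }
```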
data/bin/jmapreduce
ADDED
@@ -0,0 +1,62 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'jruby-jars'
+require 'optparse'
+require 'ostruct'
+require 'cgi'
+
+require File.join(File.dirname(__FILE__), '..', 'lib', 'jmapreduce', 'runner')
+
+options = OpenStruct.new
+
+OptionParser.new do |opts|
+  opts.banner = "USAGE: jmapreduce script input output [options]"
+
+  opts.on("-c", "--conf HADOOP_CONF", "Use this cluster xml config file") do |config|
+    options.config = config
+  end
+
+  opts.on("-n", "--namenode namenode:port", "Specify a namenode") do |namenode|
+    options.namenode = namenode
+  end
+
+  opts.on("-t", "--jobtracker jobtracker:port", "Specify a job tracker") do |jobtracker|
+    options.jobtracker = jobtracker
+  end
+
+  opts.on("-l", "--libjars jar1,jar2", "comma-separated jar files to include in the classpath") do |libjars|
+    options.libjars = libjars
+  end
+
+  opts.on("-f", "--files file1,file2", "comma separated files to be copied to the map reduce cluster") do |files|
+    options.files = files
+  end
+
+  opts.on("-d", "--dirs dir1,dir2", "comma separated directories to be copied to the map reduce cluster") do |dirs|
+    options.dirs = dirs
+  end
+
+  opts.on("-v", '--variables k1=v1,k2=v2', "Pass additional parameters to jobs") do |params|
+    options.properties = params
+  end
+
+  opts.on("-j", '--json {"key":"1 value"}', "Pass JSON encoded parameters to jobs") do |json|
+    options.json = "json=#{CGI.escape(json)}"
+  end
+
+  opts.on_tail("-h", "--help", "Show this message") do
+    puts opts
+    exit
+  end
+end.parse!
+
+Runner.new(ARGV[0], ARGV[1], ARGV[2],
+  :conf => options.config,
+  :namenode => options.namenode,
+  :jobtracker => options.jobtracker,
+  :properties => options.properties,
+  :json => options.json,
+  :libjars => options.libjars,
+  :files => options.files,
+  :dirs => options.dirs
+).run
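The executable does no work itself: it parses the flags above into an options hash and hands everything to Runner from data/lib/jmapreduce/runner.rb (whose contents are not shown in this diff). Below is a minimal sketch of driving that same entry point from Ruby, assuming only the constructor shape visible in the script and that the gem's lib directory is on the load path; every path and value is a placeholder:

```ruby
require 'rubygems'
require 'jmapreduce/runner'   # assumption: the gem exposes lib/jmapreduce/runner.rb on the load path

# Mirrors what bin/jmapreduce builds from its CLI flags. All values below are
# placeholders for illustration; flags the user does not pass arrive as nil.
Runner.new('examples/wordcount.rb', 'test/inputs/file1', 'test/output',
  :conf       => nil,
  :namenode   => 'localhost:9000',    # -n / --namenode
  :jobtracker => 'localhost:9001',    # -t / --jobtracker
  :properties => 'min_count=5',       # -v / --variables
  :json       => nil,
  :libjars    => nil,
  :files      => nil,
  :dirs       => nil
).run
```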