jmapreduce 0.1

data/README.md ADDED
@@ -0,0 +1,70 @@
JMapReduce
==========

JMapReduce is a JRuby map/reduce framework built on top of the Hadoop distributed computing platform.
It is inspired by [mandy](http://github.com/forward/mandy "Mandy") but runs the map/reduce jobs on the JVM.

Install
-------

    gem install jmapreduce

Usage
-----

1. Run a Hadoop cluster on your machines and set the HADOOP_HOME environment variable.
2. Put your input files into HDFS, e.g. test/inputs/file1 (see the sketch after this list).

3. Now you can run 'jmapreduce' like below:
> $ jmapreduce examples/wordcount.rb test/inputs/file1 test/output
4. You can also chain map/reduce jobs, as in the example below; the output of one map/reduce job becomes the input of the next job.

5. For the full list of options, run:
> $ jmapreduce -h
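
As a minimal sketch of step 2, assuming a running cluster and the standard Hadoop shell from HADOOP_HOME (the paths are placeholders, and relative HDFS paths resolve under your HDFS home directory):

    $ $HADOOP_HOME/bin/hadoop fs -mkdir test/inputs
    $ $HADOOP_HOME/bin/hadoop fs -put test/inputs/file1 test/inputs/file1
    $ $HADOOP_HOME/bin/hadoop fs -ls test/inputs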

Example
-------

    import org.fingertap.jmapreduce.JMapReduce

    JMapReduce.job 'Count' do
      reduce_tasks 1

      # Map phase: emit a (word, 1) pair for every word in the input value
      map do |key, value|
        value.split.each do |word|
          emit(word, 1)
        end
      end

      # Reduce phase: sum the counts emitted for each word
      reduce do |key, values|
        sum = 0
        values.each {|v| sum += v.to_i }
        emit(key, sum)
      end
    end

    # Second job in the same script: its input is the output of 'Count'
    JMapReduce.job "Histogram" do
      setup do
        RANGES = [0..10, 11..20, 21..50, 51..100, 101..200, 201..300, 301..10_000, 10_001..99_999]
      end

      # Map phase: bucket each word's count into a range
      map do |word, count|
        range = RANGES.find {|range| range.include?(count.to_i) }
        emit("#{range.first}-#{range.last}", 1)
      end

      # Reduce phase: render each bucket as a bar of '|' characters
      reduce do |range, counts|
        total = counts.inject(0) {|sum, count| sum + count.to_i }
        emit(range, '|' * (total / 20))
      end
    end
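
Because the example file defines two jobs, a single run chains them: the (word, count) pairs emitted by 'Count' become the input of 'Histogram'. It is run exactly as in the usage section above (the paths are placeholders):

    $ jmapreduce examples/wordcount.rb test/inputs/file1 test/output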

Author
-------

Abhinay Mehta <abhinay.mehta@gmail.com>

Copyright
---------

License: Apache License
data/bin/jmapreduce ADDED
@@ -0,0 +1,62 @@
#!/usr/bin/env ruby
require 'rubygems'
require 'jruby-jars'
require 'optparse'
require 'ostruct'
require 'cgi'

# Load the runner that ships with the gem, relative to this script
require File.join(File.dirname(__FILE__), '..', 'lib', 'jmapreduce', 'runner')

options = OpenStruct.new

# Command-line options; anything not consumed here stays in ARGV
OptionParser.new do |opts|
  opts.banner = "USAGE: jmapreduce script input output [options]"

  opts.on("-c", "--conf HADOOP_CONF", "Use this cluster xml config file") do |config|
    options.config = config
  end

  opts.on("-n", "--namenode namenode:port", "Specify a namenode") do |namenode|
    options.namenode = namenode
  end

  opts.on("-t", "--jobtracker jobtracker:port", "Specify a job tracker") do |jobtracker|
    options.jobtracker = jobtracker
  end

  opts.on("-l", "--libjars jar1,jar2", "comma-separated jar files to include in the classpath") do |libjars|
    options.libjars = libjars
  end

  opts.on("-f", "--files file1,file2", "comma separated files to be copied to the map reduce cluster") do |files|
    options.files = files
  end

  opts.on("-d", "--dirs dir1,dir2", "comma separated directories to be copied to the map reduce cluster") do |dirs|
    options.dirs = dirs
  end

  opts.on("-v", '--variables k1=v1,k2=v2', "Pass additional parameters to jobs") do |params|
    options.properties = params
  end

  opts.on("-j", '--json {"key":"1 value"}', "Pass JSON encoded parameters to jobs") do |json|
    options.json = "json=#{CGI.escape(json)}"
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end.parse!

# After parsing, ARGV holds the positional arguments:
# the job script, the HDFS input path and the HDFS output path.
Runner.new(ARGV[0], ARGV[1], ARGV[2],
           :conf       => options.config,
           :namenode   => options.namenode,
           :jobtracker => options.jobtracker,
           :properties => options.properties,
           :json       => options.json,
           :libjars    => options.libjars,
           :files      => options.files,
           :dirs       => options.dirs
).run
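
A hypothetical invocation that exercises several of these flags (the config path, jar names, and variable names are placeholders, not part of the gem):

    $ jmapreduce examples/wordcount.rb test/inputs/file1 test/output \
        --conf /path/to/hadoop-site.xml \
        --variables min_len=3,locale=en \
        --libjars lib/extra1.jar,lib/extra2.jar

The parsed values are collected into the options struct and handed to Runner exactly as shown above.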