jmapreduce 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md ADDED
@@ -0,0 +1,70 @@
1
+ JMapReduce
2
+ ==========
3
+
4
+ JMapReduce is a JRuby Map/Reduce framework built on top of the Hadoop distributed computing platform.
5
+ Inspired by [mandy](http://github.com/forward/mandy "Mandy") but runs the map/reduce jobs on the JVM.
6
+
7
+ Install
8
+ -------
9
+
10
+ gem install jmapreduce
11
+
12
+ Usage
13
+ -----
14
+
15
+ 1. Run a Hadoop cluster on your machines and set the HADOOP_HOME environment variable.
16
+ 2. Put your input files into HDFS, e.g. test/inputs/file1
17
+
18
+ 3. Now you can run 'jmapreduce' as shown below:
19
+ > $ jmapreduce examples/wordcount.rb test/inputs/file1 test/output
20
+ 4. You can also chain map/reduce jobs, as in the example below. The output of one map/reduce job will be the input of the next job.
21
+
22
+ 5. For the full list of options, run:
23
+ > $ jmapreduce -h
24
+
25
+ Example
26
+ -------
27
+
28
+ import org.fingertap.jmapreduce.JMapReduce
29
+
30
+ JMapReduce.job 'Count' do
31
+ reduce_tasks 1
32
+
33
+ map do |key, value|
34
+ value.split.each do |word|
35
+ emit(word, 1)
36
+ end
37
+ end
38
+
39
+ reduce do |key, values|
40
+ sum = 0
41
+ values.each {|v| sum += v.to_i }
42
+ emit(key, sum)
43
+ end
44
+ end
45
+
46
+ JMapReduce.job "Histogram" do
47
+ setup do
48
+ RANGES = [0..10, 11..20, 21..50, 51..100, 101..200, 201..300, 301..10_000, 10_001..99_999]
49
+ end
50
+
51
+ map do |word, count|
52
+ range = RANGES.find {|range| range.include?(count.to_i) }
53
+ emit("#{range.first.to_s}-#{range.last.to_s}", 1)
54
+ end
55
+
56
+ reduce do |range, counts|
57
+ total = counts.inject(0) {|sum,count| sum+count.to_i }
58
+ emit(range, '|'*(total/20))
59
+ end
60
+ end
61
+
62
+ Author
63
+ -------
64
+
65
+ Abhinay Mehta <abhinay.mehta@gmail.com>
66
+
67
+ Copyright
68
+ ---------
69
+
70
+ License: Apache License
data/bin/jmapreduce ADDED
@@ -0,0 +1,62 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'jruby-jars'
4
+ require 'optparse'
5
+ require 'ostruct'
6
+ require 'cgi'
7
+
8
+ require File.join(File.dirname(__FILE__), '..', 'lib', 'jmapreduce', 'runner')
9
+
10
+ options = OpenStruct.new # collects the values parsed from the command-line switches below
11
+
12
+ OptionParser.new do |opts|
13
+ opts.banner = "USAGE: jmapreduce script input output [options]" # three positional arguments, then optional switches
14
+
15
+ opts.on("-c", "--conf HADOOP_CONF", "Use this cluster xml config file") do |config|
16
+ options.config = config # later handed to Runner under the :conf key
17
+ end
18
+
19
+ opts.on("-n", "--namenode namenode:port", "Specify a namenode") do |namenode|
20
+ options.namenode = namenode
21
+ end
22
+
23
+ opts.on("-t", "--jobtracker jobtracker:port", "Specify a job tracker") do |jobtracker|
24
+ options.jobtracker = jobtracker
25
+ end
26
+
27
+ opts.on("-l", "--libjars jar1,jar2", "comma-separated jar files to include in the classpath") do |libjars|
28
+ options.libjars = libjars # kept as the raw comma-separated string; not split here
29
+ end
30
+
31
+ opts.on("-f", "--files file1,file2", "comma separated files to be copied to the map reduce cluster") do |files|
32
+ options.files = files # kept as the raw comma-separated string; not split here
33
+ end
34
+
35
+ opts.on("-d", "--dirs dir1,dir2", "comma separated directories to be copied to the map reduce cluster") do |dirs|
36
+ options.dirs = dirs # kept as the raw comma-separated string; not split here
37
+ end
38
+
39
+ opts.on("-v", '--variables k1=v1,k2=v2', "Pass additional parameters to jobs") do |params|
40
+ options.properties = params # note: the -v/--variables value is stored under the name "properties"
41
+ end
42
+
43
+ opts.on("-j", '--json {"key":"1 value"}', "Pass JSON encoded parameters to jobs") do |json|
44
+ options.json = "json=#{CGI.escape(json)}" # URL-encode the JSON text and prefix it with "json="
45
+ end
46
+
47
+ opts.on_tail("-h", "--help", "Show this message") do
48
+ puts opts # prints the banner and the switch summary
49
+ exit # stop here; no job is started when help is requested
50
+ end
51
+ end.parse! # parse! removes the recognised switches from ARGV, leaving only the positional arguments
52
+
53
+ Runner.new(ARGV[0], ARGV[1], ARGV[2], # remaining positional arguments (script, input, output per the banner); nil if omitted, no validation here
54
+ :conf => options.config, # value of -c/--conf, nil when the switch was not given
55
+ :namenode => options.namenode,
56
+ :jobtracker => options.jobtracker,
57
+ :properties => options.properties, # value of -v/--variables
58
+ :json => options.json, # already in "json=<escaped>" form
59
+ :libjars => options.libjars,
60
+ :files => options.files,
61
+ :dirs => options.dirs
62
+ ).run # Runner is loaded from lib/jmapreduce/runner; its behaviour is not visible in this file