jmapreduce 0.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +131 -17
- data/examples/wordcount.rb +2 -0
- data/lib/jmapreduce/runner.rb +3 -2
- data/release/jmapreduce.jar +0 -0
- metadata +7 -5
data/README.md
CHANGED
@@ -1,28 +1,33 @@
|
|
1
1
|
JMapReduce
|
2
2
|
==========
|
3
3
|
|
4
|
-
JMapReduce
|
5
|
-
Inspired by [mandy](http://github.com/forward/mandy "Mandy") but runs the map/reduce jobs on the JVM.
|
4
|
+
JMapReduce provides a simple DSL to run map/reduce jobs on Hadoop in the JVM via JRuby. Because it runs in the JVM, you have access to all the Java objects provided to the Map/Reduce jobs at runtime and can leverage other Java libraries inside your jobs.
|
6
5
|
|
7
6
|
Install
|
8
7
|
-------
|
9
8
|
|
10
|
-
|
9
|
+
> $ gem install jmapreduce
|
11
10
|
|
12
11
|
Usage
|
13
12
|
-----
|
14
13
|
|
15
|
-
1.
|
16
|
-
2.
|
14
|
+
1. Install Hadoop and set the HADOOP_HOME environment variable
|
15
|
+
2. To run a jmapreduce script:
|
16
|
+
> $ jmapreduce [path-to]/script.rb [path-to]/input [path-to]/output
|
17
17
|
|
18
|
-
3.
|
19
|
-
> $ jmapreduce examples/wordcount.rb test/inputs/file1 test/output
|
20
|
-
4. You can also chain map/reduce jobs like the example below. The output of one map/reduce job will be the input of the next job
|
21
|
-
|
22
|
-
5. For full list of options, run:
|
18
|
+
3. For the full list of options, including how to run your scripts against a Hadoop cluster, run:
|
23
19
|
> $ jmapreduce -h
|
24
20
|
|
25
|
-
|
21
|
+
Notes
|
22
|
+
-----
|
23
|
+
|
24
|
+
* Key/Value pairs are generated by splitting input lines with the tab character
|
25
|
+
* If no tab character is found in the input line, the value is set to the entire line
|
26
|
+
* Mappers and reducers can emit Integers, Floats, Strings, Arrays and Hashes
|
27
|
+
* Arrays and Hashes can only be made up of Integers, Floats, Strings, Arrays and Hashes
|
28
|
+
* You can chain map/reduce jobs like the example below. The output of one map/reduce job will be the input of the next job
|
29
|
+
|
30
|
+
Example
|
26
31
|
-------
|
27
32
|
|
28
33
|
import org.fingertap.jmapreduce.JMapReduce
|
@@ -36,10 +41,10 @@ Example
|
|
36
41
|
end
|
37
42
|
end
|
38
43
|
|
39
|
-
reduce do |
|
44
|
+
reduce do |word, counts|
|
40
45
|
sum = 0
|
41
|
-
|
42
|
-
emit(
|
46
|
+
counts.each {|count| sum += count }
|
47
|
+
emit(word, sum)
|
43
48
|
end
|
44
49
|
end
|
45
50
|
|
@@ -48,17 +53,126 @@ Example
|
|
48
53
|
RANGES = [0..10, 11..20, 21..50, 51..100, 101..200, 201..300, 301..10_000, 10_001..99_999]
|
49
54
|
end
|
50
55
|
|
51
|
-
map do |word,
|
52
|
-
range = RANGES.find {|range| range.include?(
|
56
|
+
map do |word, sum|
|
57
|
+
range = RANGES.find {|range| range.include?(sum) }
|
53
58
|
emit("#{range.first.to_s}-#{range.last.to_s}", 1)
|
54
59
|
end
|
55
60
|
|
56
61
|
reduce do |range, counts|
|
57
|
-
total = counts.inject(0) {|sum,count| sum+count
|
62
|
+
total = counts.inject(0) {|sum,count| sum+count }
|
58
63
|
emit(range, '|'*(total/20))
|
59
64
|
end
|
60
65
|
end
|
61
66
|
|
67
|
+
To run the above example, run:
|
68
|
+
> jmapreduce examples/wordcount.rb examples/alice.txt /tmp/alice-out
|
69
|
+
|
70
|
+
|
71
|
+
Using Java classes Example
|
72
|
+
--------------------------
|
73
|
+
|
74
|
+
import org.fingertap.jmapreduce.JMapReduce
|
75
|
+
|
76
|
+
import java.util.StringTokenizer
|
77
|
+
|
78
|
+
JMapReduce.job 'Count' do
|
79
|
+
reduce_tasks 1
|
80
|
+
|
81
|
+
map do |key, value|
|
82
|
+
tokenizer = StringTokenizer.new(value, " ")
|
83
|
+
while(tokenizer.hasMoreTokens)
|
84
|
+
word = tokenizer.nextToken
|
85
|
+
emit(word, 1)
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
reduce do |word, counts|
|
90
|
+
sum = 0
|
91
|
+
counts.each {|count| sum += count }
|
92
|
+
emit(word, sum)
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
To run the above example, run:
|
97
|
+
> jmapreduce examples/wordcount.rb examples/alice.txt /tmp/alice-java-out
|
98
|
+
|
99
|
+
Running a custom org.apache.hadoop.mapreduce.Job Example
|
100
|
+
--------------------------------------------------------
|
101
|
+
|
102
|
+
The example below shows how you can provide a custom job to run and have direct access to the context in your map or reduce blocks so you can write out objects of the class you specified in your custom job.
|
103
|
+
|
104
|
+
import org.fingertap.jmapreduce.JMapReduce
|
105
|
+
|
106
|
+
import org.apache.hadoop.hbase.client.Put
|
107
|
+
import org.apache.hadoop.hbase.HBaseConfiguration
|
108
|
+
|
109
|
+
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
|
110
|
+
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil
|
111
|
+
|
112
|
+
JMapReduce.job "HBase bulk import job" do
|
113
|
+
reduce_tasks 0
|
114
|
+
|
115
|
+
custom_job do |conf|
|
116
|
+
hbase_conf = HBaseConfiguration.create(conf)
|
117
|
+
hbase_conf.set('hbase.zookeeper.quorum', 'hbase.server.address')
|
118
|
+
job = Job.new(hbase_conf, "HBase bulk import job")
|
119
|
+
|
120
|
+
TableMapReduceUtil.initTableReducerJob(property('table_name'), nil, job)
|
121
|
+
TableMapReduceUtil.addDependencyJars(job)
|
122
|
+
TableMapReduceUtil.addDependencyJars(job.getConfiguration)
|
123
|
+
|
124
|
+
job.setMapOutputValueClass(Put.java_class)
|
125
|
+
job
|
126
|
+
end
|
127
|
+
|
128
|
+
setup do
|
129
|
+
@family = "someColumnFamily".to_java_bytes
|
130
|
+
@ts = java.lang.System.currentTimeMillis
|
131
|
+
end
|
132
|
+
|
133
|
+
map do |key, value|
|
134
|
+
row = "#{key}\t#{value}".split("\t")
|
135
|
+
|
136
|
+
row_key = row[0].to_java_bytes
|
137
|
+
someColumnValue = row[1].to_java_bytes
|
138
|
+
someOtherColumnValue = row[2].to_java_bytes
|
139
|
+
|
140
|
+
put = Put.new(row_key)
|
141
|
+
put.add(@family, "someColumn".to_java_bytes, @ts, someColumnValue)
|
142
|
+
put.add(@family, "someOtherColumn".to_java_bytes, @ts, someOtherColumnValue)
|
143
|
+
|
144
|
+
context.write(ImmutableBytesWritable.new(row_key), put)
|
145
|
+
end
|
146
|
+
end
|
147
|
+
|
148
|
+
To run the above example, run:
|
149
|
+
> jmapreduce examples/hbase_import.rb /path/to/tsv/file /output/path -l $HBASE_HOME/hbase.jar,$HBASE_HOME/lib/zookeeper.jar,$HBASE_HOME/lib/guava.jar -v table_name=someTableName
|
150
|
+
|
151
|
+
Example Hadoop Conf XML File
|
152
|
+
----------------------------
|
153
|
+
|
154
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
155
|
+
<configuration>
|
156
|
+
<property>
|
157
|
+
<name>fs.default.name</name>
|
158
|
+
<value>hdfs://name-node.address:fs-port/</value>
|
159
|
+
</property>
|
160
|
+
<property>
|
161
|
+
<name>mapred.job.tracker</name>
|
162
|
+
<value>job.tracker.address:job-tracker-port</value>
|
163
|
+
</property>
|
164
|
+
</configuration>
|
165
|
+
|
166
|
+
You can pass an XML file on the command line to run your jobs against your chosen Hadoop cluster:
|
167
|
+
> jmapreduce examples/wordcount.rb /path/to/hdfs/input /path/to/hdfs/output -c examples/hadoop\_cluster\_conf\_example.xml
|
168
|
+
|
169
|
+
Todo list
|
170
|
+
---------
|
171
|
+
|
172
|
+
* Rdoc
|
173
|
+
* A way to package and distribute gems
|
174
|
+
* Expose Key/Value separator variable
|
175
|
+
|
62
176
|
Author
|
63
177
|
-------
|
64
178
|
|
data/examples/wordcount.rb
CHANGED
data/lib/jmapreduce/runner.rb
CHANGED
@@ -12,13 +12,14 @@ class Runner
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def hadoop_home
|
15
|
+
raise 'Please set HADOOP_HOME' unless ENV['HADOOP_HOME']
|
15
16
|
ENV['HADOOP_HOME']
|
16
17
|
end
|
17
18
|
|
18
19
|
def hadoop_cmd
|
19
20
|
hadoop = `which hadoop 2>/dev/null`
|
20
21
|
hadoop = "#{hadoop_home}/bin/hadoop" if hadoop.empty? and (!hadoop_home.empty?)
|
21
|
-
raise '
|
22
|
+
raise 'Cannot find hadoop command' if hadoop.empty?
|
22
23
|
hadoop.chomp
|
23
24
|
end
|
24
25
|
|
@@ -32,7 +33,7 @@ class Runner
|
|
32
33
|
end
|
33
34
|
|
34
35
|
def cmd
|
35
|
-
"#{hadoop_cmd} jar #{main_jar_path} #{JAVA_MAIN_CLASS} #{
|
36
|
+
"#{hadoop_cmd} jar #{main_jar_path} #{JAVA_MAIN_CLASS} #{file_args} #{jars_args} #{conf_args} #{archived_args} #{mapred_args} #{properties_args}"
|
36
37
|
end
|
37
38
|
|
38
39
|
def jars_args
|
data/release/jmapreduce.jar
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jmapreduce
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 15
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: "0.
|
8
|
+
- 2
|
9
|
+
version: "0.2"
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Abhinay Mehta
|
@@ -14,7 +14,8 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date:
|
17
|
+
date: 2012-01-25 00:00:00 +00:00
|
18
|
+
default_executable:
|
18
19
|
dependencies:
|
19
20
|
- !ruby/object:Gem::Dependency
|
20
21
|
name: jruby-jars
|
@@ -49,6 +50,7 @@ files:
|
|
49
50
|
- vendors/msgpack.jar
|
50
51
|
- examples/alice.txt
|
51
52
|
- examples/wordcount.rb
|
53
|
+
has_rdoc: true
|
52
54
|
homepage: https://bitbucket.org/abhinaymehta/jmapreduce
|
53
55
|
licenses: []
|
54
56
|
|
@@ -78,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
78
80
|
requirements: []
|
79
81
|
|
80
82
|
rubyforge_project:
|
81
|
-
rubygems_version: 1.
|
83
|
+
rubygems_version: 1.6.2
|
82
84
|
signing_key:
|
83
85
|
specification_version: 3
|
84
86
|
summary: Map/Reduce Framework
|