jmapreduce 0.1 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,28 +1,33 @@
1
1
  JMapReduce
2
2
  ==========
3
3
 
4
- JMapReduce is JRuby Map/Reduce Framework built on top of the Hadoop Distributed computing platform.
5
- Inspired by [mandy](http://github.com/forward/mandy "Mandy") but runs the map/reduce jobs on the JVM.
4
+ JMapReduce provides a simple DSL to run map/reduce jobs on Hadoop in the JVM via JRuby. Because it runs in the JVM, you have access to all the Java objects provided to the Map/Reduce jobs at runtime and can leverage other Java libraries inside your jobs.
6
5
 
7
6
  Install
8
7
  -------
9
8
 
10
- gem install jmapreduce
9
+ > $ gem install jmapreduce
11
10
 
12
11
  Usage
13
12
  -----
14
13
 
15
- 1. Run Hadoop cluster on your machines and set HADOOP_HOME env variable.
16
- 2. put files into your hdfs. eg) test/inputs/file1
14
+ 1. Install Hadoop and set the HADOOP_HOME environment variable
15
+ 2. To run a jmapreduce script:
16
+ > $ jmapreduce [path-to]/script.rb [path-to]/input [path-to]/output
17
17
 
18
- 3. Now you can run 'jmapreduce' like below:
19
- > $ jmapreduce examples/wordcount.rb test/inputs/file1 test/output
20
- 4. You can also chain map/reduce jobs like the example below. The output of one map/reduce job will be the input of the next job
21
-
22
- 5. For full list of options, run:
18
+ 3. For a full list of options, including how to run your scripts against a Hadoop cluster, run:
23
19
  > $ jmapreduce -h
24
20
 
25
- Example
21
+ Notes
22
+ -----
23
+
24
+ * Key/Value pairs are generated by splitting input lines with the tab character
25
+ * If no tab character is found in the input line, value is set to the line
26
+ * Mappers and reducers can emit Integers, Floats, Strings, Arrays and Hashes
27
+ * Arrays and Hashes can only be built up of Integers, Floats, Strings, Arrays and Hashes
28
+ * You can chain map/reduce jobs like the example below. The output of one map/reduce job will be the input of the next job
29
+
30
+ Example
26
31
  -------
27
32
 
28
33
  import org.fingertap.jmapreduce.JMapReduce
@@ -36,10 +41,10 @@ Example
36
41
  end
37
42
  end
38
43
 
39
- reduce do |key, values|
44
+ reduce do |word, counts|
40
45
  sum = 0
41
- values.each {|v| sum += v.to_i }
42
- emit(key, sum)
46
+ counts.each {|count| sum += count }
47
+ emit(word, sum)
43
48
  end
44
49
  end
45
50
 
@@ -48,17 +53,126 @@ Example
48
53
  RANGES = [0..10, 11..20, 21..50, 51..100, 101..200, 201..300, 301..10_000, 10_001..99_999]
49
54
  end
50
55
 
51
- map do |word, count|
52
- range = RANGES.find {|range| range.include?(count.to_i) }
56
+ map do |word, sum|
57
+ range = RANGES.find {|range| range.include?(sum) }
53
58
  emit("#{range.first.to_s}-#{range.last.to_s}", 1)
54
59
  end
55
60
 
56
61
  reduce do |range, counts|
57
- total = counts.inject(0) {|sum,count| sum+count.to_i }
62
+ total = counts.inject(0) {|sum,count| sum+count }
58
63
  emit(range, '|'*(total/20))
59
64
  end
60
65
  end
61
66
 
67
+ To run the above example:
68
+ > jmapreduce examples/wordcount.rb examples/alice.txt /tmp/alice-out
69
+
70
+
71
+ Using Java classes Example
72
+ --------------------------
73
+
74
+ import org.fingertap.jmapreduce.JMapReduce
75
+
76
+ import java.util.StringTokenizer
77
+
78
+ JMapReduce.job 'Count' do
79
+ reduce_tasks 1
80
+
81
+ map do |key, value|
82
+ tokenizer = StringTokenizer.new(value, " ")
83
+ while(tokenizer.hasMoreTokens)
84
+ word = tokenizer.nextToken
85
+ emit(word, 1)
86
+ end
87
+ end
88
+
89
+ reduce do |word, counts|
90
+ sum = 0
91
+ counts.each {|count| sum += count }
92
+ emit(word, sum)
93
+ end
94
+ end
95
+
96
+ To run the above example:
97
+ > jmapreduce examples/wordcount.rb examples/alice.txt /tmp/alice-java-out
98
+
99
+ Running a custom org.apache.hadoop.mapreduce.Job Example
100
+ --------------------------------------------------------
101
+
102
+ The example below shows how you can provide a custom job to run and have direct access to the context in your map or reduce blocks so you can write out objects of the class you specified in your custom job.
103
+
104
+ import org.fingertap.jmapreduce.JMapReduce
105
+
106
+ import org.apache.hadoop.hbase.client.Put
107
+ import org.apache.hadoop.hbase.HBaseConfiguration
108
+
109
+ import org.apache.hadoop.hbase.io.ImmutableBytesWritable
110
+ import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil
111
+
112
+ JMapReduce.job "HBase bulk import job" do
113
+ reduce_tasks 0
114
+
115
+ custom_job do |conf|
116
+ hbase_conf = HBaseConfiguration.create(conf)
117
+ hbase_conf.set('hbase.zookeeper.quorum', 'hbase.server.address')
118
+ job = Job.new(hbase_conf, "HBase bulk import job")
119
+
120
+ TableMapReduceUtil.initTableReducerJob(property('table_name'), nil, job)
121
+ TableMapReduceUtil.addDependencyJars(job)
122
+ TableMapReduceUtil.addDependencyJars(job.getConfiguration)
123
+
124
+ job.setMapOutputValueClass(Put.java_class)
125
+ job
126
+ end
127
+
128
+ setup do
129
+ @family = "someColumnFamily".to_java_bytes
130
+ @ts = java.lang.System.currentTimeMillis
131
+ end
132
+
133
+ map do |key, value|
134
+ row = "#{key}\t#{value}".split("\t")
135
+
136
+ row_key = row[0].to_java_bytes
137
+ someColumnValue = row[1].to_java_bytes
138
+ someOtherColumnValue = row[2].to_java_bytes
139
+
140
+ put = Put.new(row_key)
141
+ put.add(@family, "someColumn".to_java_bytes, @ts, someColumnValue)
142
+ put.add(@family, "someOtherColumn".to_java_bytes, @ts, someOtherColumnValue)
143
+
144
+ context.write(ImmutableBytesWritable.new(row_key), put)
145
+ end
146
+ end
147
+
148
+ To run the above example:
149
+ > jmapreduce examples/hbase_import.rb /path/to/tsv/file /output/path -l $HBASE_HOME/hbase.jar,$HBASE_HOME/lib/zookeeper.jar,$HBASE_HOME/lib/guava.jar -v table_name=someTableName
150
+
151
+ Example Hadoop Conf XML File
152
+ ----------------------------
153
+
154
+ <?xml version="1.0" encoding="UTF-8"?>
155
+ <configuration>
156
+ <property>
157
+ <name>fs.default.name</name>
158
+ <value>hdfs://name-node.address:fs-port/</value>
159
+ </property>
160
+ <property>
161
+ <name>mapred.job.tracker</name>
162
+ <value>job.tracker.address:job-tracker-port</value>
163
+ </property>
164
+ </configuration>
165
+
166
+ You can pass an XML file on the command line to run your jobs against your chosen Hadoop cluster:
167
+ > jmapreduce examples/wordcount.rb /path/to/hdfs/input /path/to/hdfs/output -c examples/hadoop\_cluster\_conf\_example.xml
168
+
169
+ Todo list
170
+ ---------
171
+
172
+ * Rdoc
173
+ * A way to package and distribute gems
174
+ * Expose Key/Value separator variable
175
+
62
176
  Author
63
177
  -------
64
178
 
@@ -39,4 +39,6 @@ end
39
39
 
40
40
  __END__
41
41
 
42
+ To run:
43
+
42
44
  ./bin/jmapreduce examples/wordcount.rb examples/alice.txt /tmp/output
@@ -12,13 +12,14 @@ class Runner
12
12
  end
13
13
 
14
14
  def hadoop_home
15
+ raise 'Please set HADOOP_HOME' unless ENV['HADOOP_HOME']
15
16
  ENV['HADOOP_HOME']
16
17
  end
17
18
 
18
19
  def hadoop_cmd
19
20
  hadoop = `which hadoop 2>/dev/null`
20
21
  hadoop = "#{hadoop_home}/bin/hadoop" if hadoop.empty? and (!hadoop_home.empty?)
21
- raise 'cannot find hadoop command' if hadoop.empty?
22
+ raise 'Cannot find hadoop command' if hadoop.empty?
22
23
  hadoop.chomp
23
24
  end
24
25
 
@@ -32,7 +33,7 @@ class Runner
32
33
  end
33
34
 
34
35
  def cmd
35
- "#{hadoop_cmd} jar #{main_jar_path} #{JAVA_MAIN_CLASS} #{jars_args} #{file_args} #{conf_args} #{archived_args} #{mapred_args} #{properties_args}"
36
+ "#{hadoop_cmd} jar #{main_jar_path} #{JAVA_MAIN_CLASS} #{file_args} #{jars_args} #{conf_args} #{archived_args} #{mapred_args} #{properties_args}"
36
37
  end
37
38
 
38
39
  def jars_args
Binary file
metadata CHANGED
@@ -1,12 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jmapreduce
3
3
  version: !ruby/object:Gem::Version
4
- hash: 9
4
+ hash: 15
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 1
9
- version: "0.1"
8
+ - 2
9
+ version: "0.2"
10
10
  platform: ruby
11
11
  authors:
12
12
  - Abhinay Mehta
@@ -14,7 +14,8 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-09-12 00:00:00 Z
17
+ date: 2012-01-25 00:00:00 +00:00
18
+ default_executable:
18
19
  dependencies:
19
20
  - !ruby/object:Gem::Dependency
20
21
  name: jruby-jars
@@ -49,6 +50,7 @@ files:
49
50
  - vendors/msgpack.jar
50
51
  - examples/alice.txt
51
52
  - examples/wordcount.rb
53
+ has_rdoc: true
52
54
  homepage: https://bitbucket.org/abhinaymehta/jmapreduce
53
55
  licenses: []
54
56
 
@@ -78,7 +80,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
78
80
  requirements: []
79
81
 
80
82
  rubyforge_project:
81
- rubygems_version: 1.7.2
83
+ rubygems_version: 1.6.2
82
84
  signing_key:
83
85
  specification_version: 3
84
86
  summary: Map/Reduce Framework