hadoop-rubydsl 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README +53 -0
- data/Rakefile +18 -0
- data/TODO +2 -0
- data/VERSION +1 -0
- data/bin/hadoop +276 -0
- data/bin/hadoop-ruby.sh +30 -0
- data/conf/hadoop-site.xml +19 -0
- data/examples/apachelog-v2-2.rb +18 -0
- data/examples/apachelog-v2.rb +25 -0
- data/examples/apachelog.rb +15 -0
- data/examples/hive_like_test.rb +14 -0
- data/examples/word_count_test.rb +7 -0
- data/hadoop-rubydsl.gemspec +79 -0
- data/lib/core.rb +108 -0
- data/lib/hive_like.rb +122 -0
- data/lib/init.rb +60 -0
- data/lib/java/.gitignore +1 -0
- data/lib/java/hadoop-ruby.jar +0 -0
- data/lib/log_analysis.rb +165 -0
- data/lib/mapred_factory.rb +43 -0
- data/lib/util.rb +11 -0
- data/lib/word_count.rb +76 -0
- data/spec/core_spec.rb +73 -0
- data/spec/example_spec.rb +82 -0
- data/spec/hive_like_spec.rb +58 -0
- data/spec/init_spec.rb +56 -0
- data/spec/log_analysis_spec.rb +119 -0
- data/spec/mapred_factory_spec.rb +42 -0
- data/spec/spec_helper.rb +11 -0
- data/spec/util_spec.rb +15 -0
- data/spec/word_count_spec.rb +89 -0
- metadata +100 -0
data/README
ADDED
@@ -0,0 +1,53 @@
= hadoop-rubydsl

== Description
HadoopのMapper/ReducerをRubyによるDSLで記述することができます。
hadoop-ruby.jarを利用します。

例)
apachelog.rb

# log:
# 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
# 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb2.gif HTTP/1.0" 200 2326
# 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb3.gif HTTP/1.0" 404 2326

use 'LogAnalysis'
data.pattern /(.*) (.*) (.*) (\[.*\]) (".*") (\d*) (\d*)/
column[2].count_uniq
column[3].count_uniq
column[4].count_uniq
column[5].count_uniq
column[6].sum

=>
col2 frank 1
col2 frank2 2
col3 [10/Oct/2000:13:55:36 -0700] 3
col4 "GET /apache_pb.gif HTTP/1.0" 1
col4 "GET /apache_pb2.gif HTTP/1.0" 1
col4 "GET /apache_pb3.gif HTTP/1.0" 1
col5 200 2
col5 404 1
col6 6978

== Usage
0. HADOOP_HOMEを正しく設定し、Hadoopを一式立ち上げておく。

1. jruby-complete-*.jar を lib/java 以下にコピー
ex)
$ wget http://jruby.kenai.com/downloads/1.4.0RC2/jruby-complete-1.4.0RC2.jar
$ cp jruby-complete-*.jar lib/java/

2. データを HDFS にアップロード
ex)
$ hadoop dfs -copyFromLocal apachelog inputs/

3. MapReduce実行
$ bin/hadoop-ruby.sh examples/apachelog.rb inputs outputs

== Author
Koichi Fujikawa <fujibee@gmail.com>

== Copyright
License: Apache License
data/Rakefile
ADDED
@@ -0,0 +1,18 @@
# Gem packaging tasks.
# Jeweler generates the gemspec and gemcutter release tasks; when it is not
# installed we print a hint instead of aborting rake entirely.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |spec|
    spec.name        = 'hadoop-rubydsl'
    spec.summary     = 'Hadoop Ruby DSL'
    spec.description = 'Hadoop Ruby DSL'
    spec.email       = 'fujibee@gmail.com'
    spec.homepage    = 'http://github.com/fujibee/hadoop-rubydsl'
    spec.authors     = ['Koichi Fujikawa']

    # runtime dependency of the gem itself
    spec.add_dependency 'jruby-on-hadoop'
    # specs are not shipped inside the packaged gem
    spec.files.exclude 'spec/**/*'
  end

  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler not available. Install it with: gem install jeweler"
end
data/TODO
ADDED
data/VERSION
ADDED
@@ -0,0 +1 @@
0.0.1
data/bin/hadoop
ADDED
@@ -0,0 +1,276 @@
#!/usr/bin/env bash

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# The Hadoop command script
#
# Environment Variables
#
#   JAVA_HOME        The java implementation to use. Overrides JAVA_HOME.
#
#   HADOOP_CLASSPATH Extra Java CLASSPATH entries.
#
#   HADOOP_HEAPSIZE  The maximum amount of heap to use, in MB.
#                    Default is 1000.
#
#   HADOOP_OPTS      Extra Java runtime options.
#
#   HADOOP_NAMENODE_OPTS       These options are added to HADOOP_OPTS
#   HADOOP_CLIENT_OPTS         when the respective command is run.
#   HADOOP_{COMMAND}_OPTS etc  HADOOP_JT_OPTS applies to JobTracker
#                              for e.g. HADOOP_CLIENT_OPTS applies to
#                              more than one command (fs, dfs, fsck,
#                              dfsadmin etc)
#
#   HADOOP_CONF_DIR  Alternate conf dir. Default is ${HADOOP_HOME}/conf.
#
#   HADOOP_ROOT_LOGGER The root appender. Default is INFO,console
#

bin=`dirname "$0"`
bin=`cd "$bin"; pwd`

# pull in environment defaults (HADOOP_HOME, HADOOP_CONF_DIR, ...) if present
if [ -f "$bin"/hadoop-config.sh ]; then
  . "$bin"/hadoop-config.sh
fi

cygwin=false
case "`uname`" in
CYGWIN*) cygwin=true;;
esac

# if no args specified, show usage
if [ $# = 0 ]; then
  echo "Usage: hadoop [--config confdir] COMMAND"
  echo "where COMMAND is one of:"
  echo " namenode -format format the DFS filesystem"
  echo " secondarynamenode run the DFS secondary namenode"
  echo " namenode run the DFS namenode"
  echo " datanode run a DFS datanode"
  echo " dfsadmin run a DFS admin client"
  echo " fsck run a DFS filesystem checking utility"
  echo " fs run a generic filesystem user client"
  echo " balancer run a cluster balancing utility"
  echo " jobtracker run the MapReduce job Tracker node"
  echo " pipes run a Pipes job"
  echo " tasktracker run a MapReduce task Tracker node"
  echo " job manipulate MapReduce jobs"
  echo " queue get information regarding JobQueues"
  echo " version print the version"
  echo " jar <jar> run a jar file"
  echo " distcp <srcurl> <desturl> copy file or directories recursively"
  echo " archive -archiveName NAME <src>* <dest> create a hadoop archive"
  echo " daemonlog get/set the log level for each daemon"
  echo " or"
  echo " CLASSNAME run the class named CLASSNAME"
  echo "Most commands print help when invoked w/o parameters."
  exit 1
fi

# get arguments
COMMAND=$1
shift

if [ -f "${HADOOP_CONF_DIR}/hadoop-env.sh" ]; then
  . "${HADOOP_CONF_DIR}/hadoop-env.sh"
fi

# some Java parameters
if [ "$JAVA_HOME" != "" ]; then
  #echo "run java in $JAVA_HOME"
  JAVA_HOME=$JAVA_HOME
fi

if [ "$JAVA_HOME" = "" ]; then
  echo "Error: JAVA_HOME is not set."
  exit 1
fi

JAVA=$JAVA_HOME/bin/java
JAVA_HEAP_MAX=-Xmx1000m

# check envvars which might override default args
if [ "$HADOOP_HEAPSIZE" != "" ]; then
  #echo "run with heapsize $HADOOP_HEAPSIZE"
  JAVA_HEAP_MAX="-Xmx""$HADOOP_HEAPSIZE""m"
  #echo $JAVA_HEAP_MAX
fi

# CLASSPATH initially contains $HADOOP_CONF_DIR
CLASSPATH="${HADOOP_CONF_DIR}"
CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar

# for developers, add Hadoop classes to CLASSPATH
if [ -d "$HADOOP_HOME/build/classes" ]; then
  CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/classes
fi
if [ -d "$HADOOP_HOME/build/webapps" ]; then
  CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build
fi
if [ -d "$HADOOP_HOME/build/test/classes" ]; then
  CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/test/classes
fi
if [ -d "$HADOOP_HOME/build/tools" ]; then
  CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/tools
fi

# so that filenames w/ spaces are handled correctly in loops below
IFS=

# for releases, add core hadoop jar & webapps to CLASSPATH
if [ -d "$HADOOP_HOME/webapps" ]; then
  CLASSPATH=${CLASSPATH}:$HADOOP_HOME
fi
for f in $HADOOP_HOME/hadoop-*-core.jar; do
  CLASSPATH=${CLASSPATH}:$f;
done

# add libs to CLASSPATH
for f in $HADOOP_HOME/lib/*.jar; do
  CLASSPATH=${CLASSPATH}:$f;
done

for f in $HADOOP_HOME/lib/jetty-ext/*.jar; do
  CLASSPATH=${CLASSPATH}:$f;
done

# tools jars go onto a separate path, appended only for distcp/archive below
for f in $HADOOP_HOME/hadoop-*-tools.jar; do
  TOOL_PATH=${TOOL_PATH}:$f;
done
for f in $HADOOP_HOME/build/hadoop-*-tools.jar; do
  TOOL_PATH=${TOOL_PATH}:$f;
done

# add user-specified CLASSPATH last
if [ "$HADOOP_CLASSPATH" != "" ]; then
  CLASSPATH=${CLASSPATH}:${HADOOP_CLASSPATH}
fi

# default log directory & file
if [ "$HADOOP_LOG_DIR" = "" ]; then
  HADOOP_LOG_DIR="$HADOOP_HOME/logs"
fi
if [ "$HADOOP_LOGFILE" = "" ]; then
  HADOOP_LOGFILE='hadoop.log'
fi

# restore ordinary behaviour
unset IFS

# figure out which class to run
if [ "$COMMAND" = "namenode" ] ; then
  CLASS='org.apache.hadoop.hdfs.server.namenode.NameNode'
  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_NAMENODE_OPTS"
elif [ "$COMMAND" = "secondarynamenode" ] ; then
  CLASS='org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode'
  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_SECONDARYNAMENODE_OPTS"
elif [ "$COMMAND" = "datanode" ] ; then
  CLASS='org.apache.hadoop.hdfs.server.datanode.DataNode'
  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_DATANODE_OPTS"
elif [ "$COMMAND" = "fs" ] ; then
  CLASS=org.apache.hadoop.fs.FsShell
  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
elif [ "$COMMAND" = "dfs" ] ; then
  CLASS=org.apache.hadoop.fs.FsShell
  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
elif [ "$COMMAND" = "dfsadmin" ] ; then
  CLASS=org.apache.hadoop.hdfs.tools.DFSAdmin
  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
elif [ "$COMMAND" = "fsck" ] ; then
  CLASS=org.apache.hadoop.hdfs.tools.DFSck
  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
elif [ "$COMMAND" = "balancer" ] ; then
  CLASS=org.apache.hadoop.hdfs.server.balancer.Balancer
  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_BALANCER_OPTS"
elif [ "$COMMAND" = "jobtracker" ] ; then
  CLASS=org.apache.hadoop.mapred.JobTracker
  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_JOBTRACKER_OPTS"
elif [ "$COMMAND" = "tasktracker" ] ; then
  CLASS=org.apache.hadoop.mapred.TaskTracker
  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_TASKTRACKER_OPTS"
elif [ "$COMMAND" = "job" ] ; then
  CLASS=org.apache.hadoop.mapred.JobClient
elif [ "$COMMAND" = "queue" ] ; then
  CLASS=org.apache.hadoop.mapred.JobQueueClient
elif [ "$COMMAND" = "pipes" ] ; then
  CLASS=org.apache.hadoop.mapred.pipes.Submitter
  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
elif [ "$COMMAND" = "version" ] ; then
  CLASS=org.apache.hadoop.util.VersionInfo
  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
elif [ "$COMMAND" = "jar" ] ; then
  CLASS=org.apache.hadoop.mapred.JobShell
elif [ "$COMMAND" = "distcp" ] ; then
  CLASS=org.apache.hadoop.tools.DistCp
  CLASSPATH=${CLASSPATH}:${TOOL_PATH}
  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
elif [ "$COMMAND" = "daemonlog" ] ; then
  CLASS=org.apache.hadoop.log.LogLevel
  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
elif [ "$COMMAND" = "archive" ] ; then
  CLASS=org.apache.hadoop.tools.HadoopArchives
  CLASSPATH=${CLASSPATH}:${TOOL_PATH}
  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
elif [ "$COMMAND" = "sampler" ] ; then
  CLASS=org.apache.hadoop.mapred.lib.InputSampler
  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
else
  CLASS=$COMMAND
fi

# cygwin path translation
if $cygwin; then
  CLASSPATH=`cygpath -p -w "$CLASSPATH"`
  HADOOP_HOME=`cygpath -d "$HADOOP_HOME"`
  HADOOP_LOG_DIR=`cygpath -d "$HADOOP_LOG_DIR"`
  TOOL_PATH=`cygpath -p -w "$TOOL_PATH"`
fi
# setup 'java.library.path' for native-hadoop code if necessary
JAVA_LIBRARY_PATH=''
if [ -d "${HADOOP_HOME}/build/native" -o -d "${HADOOP_HOME}/lib/native" ]; then
  JAVA_PLATFORM=`CLASSPATH=${CLASSPATH} ${JAVA} org.apache.hadoop.util.PlatformName | sed -e "s/ /_/g"`

  if [ -d "$HADOOP_HOME/build/native" ]; then
    JAVA_LIBRARY_PATH=${HADOOP_HOME}/build/native/${JAVA_PLATFORM}/lib
  fi

  if [ -d "${HADOOP_HOME}/lib/native" ]; then
    if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
      JAVA_LIBRARY_PATH=${JAVA_LIBRARY_PATH}:${HADOOP_HOME}/lib/native/${JAVA_PLATFORM}
    else
      JAVA_LIBRARY_PATH=${HADOOP_HOME}/lib/native/${JAVA_PLATFORM}
    fi
  fi
fi

# cygwin path translation
if $cygwin; then
  JAVA_LIBRARY_PATH=`cygpath -p "$JAVA_LIBRARY_PATH"`
fi

HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.log.dir=$HADOOP_LOG_DIR"
HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.log.file=$HADOOP_LOGFILE"
HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.home.dir=$HADOOP_HOME"
HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.id.str=$HADOOP_IDENT_STRING"
HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.root.logger=${HADOOP_ROOT_LOGGER:-INFO,console}"
if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
  HADOOP_OPTS="$HADOOP_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"
fi

# run it
#echo exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@"
exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@"
data/bin/hadoop-ruby.sh
ADDED
@@ -0,0 +1,30 @@
#!/bin/bash
# Launcher for Ruby DSL MapReduce jobs.
#
# Usage: hadoop-ruby.sh <dsl-script> <input-dir> <output-dir>
#
# Runs the JRubyJobRunner from hadoop-ruby.jar, shipping the DSL script plus
# every lib/*.rb with the job via -files, and jruby-complete via -libjars.

BIN_DIR=`dirname "$0"`
BASE_DIR=`cd "$BIN_DIR/.."; pwd`

# choose hadoop sh: prefer the real installation, fall back to the bundled copy
HADOOP=$HADOOP_HOME/bin/hadoop
if [ ! -f "$HADOOP" ]; then
  HADOOP=$BIN_DIR/hadoop
  #HADOOP_OPTS="--config $BASE_DIR/conf"
fi

# fetch jruby jar if not exist
LIB_DIR=$BASE_DIR/lib/java
JRUBY_JAR=jruby-complete-1.4.0.jar
if [ ! -f "$LIB_DIR/$JRUBY_JAR" ]; then
  wget http://jruby.kenai.com/downloads/1.4.0/jruby-complete-1.4.0.jar
  mv $JRUBY_JAR $LIB_DIR/
fi

# construct command line: DSL_FILES becomes a comma-separated -files list
# ending with the user's script ($1)
HADOOP_RUBY_LIB_DIR=$BASE_DIR/lib
export HADOOP_CLASSPATH=$HADOOP_RUBY_LIB_DIR
for x in `ls $HADOOP_RUBY_LIB_DIR`; do
  DSL_FILES=$HADOOP_RUBY_LIB_DIR/$x,$DSL_FILES
done
DSL_FILES=$DSL_FILES$1

# execute hadoop ruby
echo running $1...
$HADOOP $HADOOP_OPTS jar $LIB_DIR/hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner -libjars $LIB_DIR/$JRUBY_JAR -files $DSL_FILES $1 $2 $3
@@ -0,0 +1,19 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->
<!-- Pseudo-distributed setup: NameNode and JobTracker both on localhost. -->

<configuration>
  <property>
    <name>fs.default.name</name>
    <value>hdfs://localhost:9000/</value>
  </property>
  <property>
    <name>mapred.job.tracker</name>
    <value>localhost:50040</value>
  </property>
  <property>
    <!-- heap limit for each spawned map/reduce task JVM -->
    <name>mapred.child.java.opts</name>
    <value>-Xmx512m</value>
  </property>
</configuration>
@@ -0,0 +1,18 @@
# Example: count user agents in an Apache access log with the LogAnalysis DSL.
use 'LogAnalysis'

data 'apache log on test2' do
  from 'apachelog/inputs'
  to 'apachelog/outputs'

  # sample input lines:
  # 119.63.199.8 - - [15/Nov/2009:01:18:16 +0900] "GET /ranking/game?page=31 HTTP/1.1" 200 10077 "-" "Baiduspider+(+http://www.baidu.jp/spider/)"
  # 203.83.243.81 - - [15/Nov/2009:01:18:33 +0900] "GET /dns_zones.txt HTTP/1.1" 404 294 "-" "libwww-perl/5.65"

  each_line do
    pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*) (.*) "(.*)"/
    # label each captured group; 'pass' marks groups we don't care about
    column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes', 'pass', 'ua'

    topic 'ua counts', :label => 'ua' do
      count_uniq column[:ua]
    end
  end
end
@@ -0,0 +1,25 @@
# Example: per-user request counts from an Apache access log.
use 'LogAnalysis'

data 'apache log on test1' do
  # NOTE(review): 'apachlog' spelling kept as-is — these must match the
  # actual HDFS directory names.
  from 'apachlog/inputs'
  to 'apachlog/outputs'

  each_line do
    pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/
    # label each column so topics can refer to it by symbol
    column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes'

    topic 'which users?', :label => 'user' do
      count_uniq column[:user]
    end

    # topic 'access date by monthly' do
    #   select_date column[:access_date], BY_MONTHLY
    #   count column[:access_date]
    # end
    #
    # topic 'total bytes' do
    #   select_date column[:access_date], BY_MONTHLY
    #   sum column[:bytes].to_kilobytes # / 1024
    # end
  end
end
@@ -0,0 +1,15 @@
# Apache log analysis
#
# example target data:
# 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
# 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb2.gif HTTP/1.0" 200 2326
# 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb3.gif HTTP/1.0" 404 2326

use 'LogAnalysis'

# columns are numbered by capture group (1-based); see README for the
# expected "colN ..." output of this script
data.pattern /(.*) (.*) (.*) (\[.*\]) (".*") (\d*) (\d*)/
column[2].count_uniq
column[3].count_uniq
column[4].count_uniq
column[5].count_uniq
column[6].sum
@@ -0,0 +1,14 @@
# Example for the HiveLike DSL. NOTE(review): this is pseudo-SQL, not plain
# Ruby — the HiveLike plugin pre-processes these statements before eval.
use 'HiveLike'

# hive-like/items.txt
# apple, 3, 100
# banana, 1, 50

create_table items(item STRING, quantity INT, price INT);
load_data "hive-like/items.txt" items;

select quantity, price, item from items;

# expect
# 0 apple 3 300
# 1 banana 1 50
@@ -0,0 +1,79 @@
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
# -*- encoding: utf-8 -*-

Gem::Specification.new do |s|
  s.name = %q{hadoop-rubydsl}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Koichi Fujikawa"]
  s.date = %q{2009-12-26}
  s.description = %q{Hadoop Ruby DSL}
  s.email = %q{fujibee@gmail.com}
  s.executables = ["hadoop", "hadoop-ruby.sh"]
  s.extra_rdoc_files = [
    "README",
    "TODO"
  ]
  s.files = [
    "README",
    "Rakefile",
    "TODO",
    "VERSION",
    "bin/hadoop",
    "bin/hadoop-ruby.sh",
    "conf/hadoop-site.xml",
    "examples/apachelog-v2-2.rb",
    "examples/apachelog-v2.rb",
    "examples/apachelog.rb",
    "examples/hive_like_test.rb",
    "examples/word_count_test.rb",
    "hadoop-rubydsl.gemspec",
    "lib/core.rb",
    "lib/hive_like.rb",
    "lib/init.rb",
    "lib/java/.gitignore",
    "lib/java/hadoop-ruby.jar",
    "lib/log_analysis.rb",
    "lib/mapred_factory.rb",
    "lib/util.rb",
    "lib/word_count.rb"
  ]
  s.homepage = %q{http://github.com/fujibee/hadoop-rubydsl}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{Hadoop Ruby DSL}
  s.test_files = [
    "spec/spec_helper.rb",
    "spec/core_spec.rb",
    "spec/util_spec.rb",
    "spec/mapred_factory_spec.rb",
    "spec/word_count_spec.rb",
    "spec/hive_like_spec.rb",
    "spec/log_analysis_spec.rb",
    "spec/example_spec.rb",
    "spec/init_spec.rb",
    "examples/apachelog-v2.rb",
    "examples/hive_like_test.rb",
    "examples/word_count_test.rb",
    "examples/apachelog-v2-2.rb",
    "examples/apachelog.rb"
  ]

  if s.respond_to? :specification_version then
    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
    s.specification_version = 3

    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<jruby-on-hadoop>, [">= 0"])
    else
      s.add_dependency(%q<jruby-on-hadoop>, [">= 0"])
    end
  else
    s.add_dependency(%q<jruby-on-hadoop>, [">= 0"])
  end
end
data/lib/core.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
require 'util'
|
2
|
+
require 'forwardable'
|
3
|
+
|
4
|
+
module HadoopDsl
  # == Controller
  #
  # Loads a DSL script, evaluates it in the controller's own binding and
  # collects every key/value pair the model emits. Unknown DSL words fall
  # through to method_missing and are silently ignored (they return self so
  # chained statements still parse).
  class BaseMapRed
    extend Forwardable

    # Array of {key => value} hashes produced while the script ran.
    attr_reader :emitted

    # script - path of the DSL file to evaluate
    # model  - the mapper/reducer model; receives a back-reference so it can
    #          call #emit on this controller
    def initialize(script, model)
      @script, @model = script, model
      @model.controller = self
      @emitted = []
    end

    # Read the script, let the subclass pre-process it, then eval it.
    # NOTE(review): eval of a user-supplied file is intentional — the DSL *is*
    # Ruby — so scripts must come from a trusted source.
    def run
      body = pre_process(read_file(@script))
      eval(body, binding, @script)
    end

    # Hook for subclasses to rewrite the script body; identity by default.
    def pre_process(body)
      body
    end

    # Record one emitted key/value pair.
    def emit(hash) @emitted << hash end

    # All DSL statements without a matching def are swallowed here.
    def method_missing(_method_name, *_args) self end

    # Keep respond_to? consistent with the catch-all method_missing.
    def respond_to_missing?(_method_name, _include_private = false) true end
  end

  # Evaluates the setup portion of a script to discover job configuration,
  # most importantly the input/output paths (#from / #to).
  class BaseSetup
    # script - path of the DSL file; conf - job configuration object
    def initialize(script, conf)
      @script, @conf = script, conf
      output_format
    end

    def run
      body = pre_process(read_file(@script))
      eval(body, binding, @script)
    end

    # Hook for subclasses; identity by default.
    def pre_process(body)
      body
    end

    # Do nothing by default; subclasses may configure an output format.
    def output_format; end

    # [input_path, output_path] as captured by #from / #to.
    def paths; [@from, @to] end

    def from(path) @from = path end
    def to(path) @to = path end

    # All DSL statements without a matching def are swallowed here.
    def method_missing(_method_name, *_args) self end

    def respond_to_missing?(_method_name, _include_private = false) true end
  end

  # Map-side controller. Behaviour is entirely inherited from BaseMapRed; the
  # class exists so factories can select mapper vs. reducer by class name.
  # (The previous explicit initialize only called super and was removed.)
  class BaseMapper < BaseMapRed
  end

  # Reduce-side controller; see BaseMapper.
  class BaseReducer < BaseMapRed
  end

  # == Models
  #
  # Base class for mapper/reducer models: holds the back-reference to the
  # controller and swallows unknown DSL words like the controllers do.
  class BaseModel
    attr_accessor :controller

    def method_missing(_method_name, *_args) self end

    def respond_to_missing?(_method_name, _include_private = false) true end
  end

  # Model seen by map scripts: one input record as a key/value pair.
  class BaseMapperModel < BaseModel
    attr_reader :key, :value

    def initialize(key, value)
      @key, @value = key, value
    end

    # Emit the record unchanged (the identity map).
    def identity
      @controller.emit(@key => @value)
    end
  end

  # Model seen by reduce scripts: a key plus all values grouped under it.
  class BaseReducerModel < BaseModel
    attr_reader :key, :values

    def initialize(key, values)
      @key, @values = key, values
    end

    # Emit key => sum (via +) of all grouped values.
    # NOTE(review): inject without an initial value yields nil for an empty
    # values list — unchanged from the original behaviour.
    def aggregate
      @controller.emit(@key => @values.inject { |ret, i| ret + i })
    end

    # Emit each grouped value under its key, unchanged.
    def identity
      @values.each { |v| @controller.emit(@key => v) }
    end
  end
end