hadoop-rubydsl 0.0.1
- data/README +53 -0
- data/Rakefile +18 -0
- data/TODO +2 -0
- data/VERSION +1 -0
- data/bin/hadoop +276 -0
- data/bin/hadoop-ruby.sh +30 -0
- data/conf/hadoop-site.xml +19 -0
- data/examples/apachelog-v2-2.rb +18 -0
- data/examples/apachelog-v2.rb +25 -0
- data/examples/apachelog.rb +15 -0
- data/examples/hive_like_test.rb +14 -0
- data/examples/word_count_test.rb +7 -0
- data/hadoop-rubydsl.gemspec +79 -0
- data/lib/core.rb +108 -0
- data/lib/hive_like.rb +122 -0
- data/lib/init.rb +60 -0
- data/lib/java/.gitignore +1 -0
- data/lib/java/hadoop-ruby.jar +0 -0
- data/lib/log_analysis.rb +165 -0
- data/lib/mapred_factory.rb +43 -0
- data/lib/util.rb +11 -0
- data/lib/word_count.rb +76 -0
- data/spec/core_spec.rb +73 -0
- data/spec/example_spec.rb +82 -0
- data/spec/hive_like_spec.rb +58 -0
- data/spec/init_spec.rb +56 -0
- data/spec/log_analysis_spec.rb +119 -0
- data/spec/mapred_factory_spec.rb +42 -0
- data/spec/spec_helper.rb +11 -0
- data/spec/util_spec.rb +15 -0
- data/spec/word_count_spec.rb +89 -0
- metadata +100 -0
data/README
ADDED
@@ -0,0 +1,53 @@
+= hadoop-rubydsl
+
+== Description
+Lets you write Hadoop Mapper/Reducer jobs as a Ruby DSL.
+Uses hadoop-ruby.jar.
+
+Example:
+apachelog.rb
+
+# log:
+# 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
+# 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb2.gif HTTP/1.0" 200 2326
+# 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb3.gif HTTP/1.0" 404 2326
+
+use 'LogAnalysis'
+data.pattern /(.*) (.*) (.*) (\[.*\]) (".*") (\d*) (\d*)/
+column[2].count_uniq
+column[3].count_uniq
+column[4].count_uniq
+column[5].count_uniq
+column[6].sum
+
+=>
+col2 frank 1
+col2 frank2 2
+col3 [10/Oct/2000:13:55:36 -0700] 3
+col4 "GET /apache_pb.gif HTTP/1.0" 1
+col4 "GET /apache_pb2.gif HTTP/1.0" 1
+col4 "GET /apache_pb3.gif HTTP/1.0" 1
+col5 200 2
+col5 404 1
+col6 6978
+
+== Usage
+0. Set HADOOP_HOME correctly and bring up a full Hadoop stack.
+
+1. Copy jruby-complete-*.jar into lib/java
+ex)
+$ wget http://jruby.kenai.com/downloads/1.4.0RC2/jruby-complete-1.4.0RC2.jar
+$ cp jruby-complete-*.jar lib/java/
+
+2. Upload the input data to HDFS
+ex)
+$ hadoop dfs -copyFromLocal apachelog inputs/
+
+3. Run the MapReduce job
+$ bin/hadoop-ruby.sh examples/apachelog.rb inputs outputs
+
+== Author
+Koichi Fujikawa <fujibee@gmail.com>
+
+== Copyright
+License: Apache License
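A quick way to verify a run (standard Hadoop DFS shell usage, not from the original README): after step 3 the results land in the outputs directory on HDFS and can be inspected with, e.g.

$ hadoop dfs -cat outputs/part-00000

where part-00000 is Hadoop's conventional name for the first reducer's output file.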
data/Rakefile
ADDED
@@ -0,0 +1,18 @@
+begin
+  require 'jeweler'
+  Jeweler::Tasks.new do |gemspec|
+    gemspec.name = "hadoop-rubydsl"
+    gemspec.summary = "Hadoop Ruby DSL"
+    gemspec.description = "Hadoop Ruby DSL"
+    gemspec.email = "fujibee@gmail.com"
+    gemspec.homepage = "http://github.com/fujibee/hadoop-rubydsl"
+    gemspec.authors = ["Koichi Fujikawa"]
+
+    gemspec.add_dependency 'jruby-on-hadoop'
+    gemspec.files.exclude "spec/**/*"
+  end
+  Jeweler::GemcutterTasks.new
+rescue LoadError
+  puts "Jeweler not available. Install it with: gem install jeweler"
+end
+
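As a hedged usage note (assuming jeweler's standard task set; only the task wiring above is from the gem), regenerating the gemspec and building the gem locally would look roughly like:

$ gem install jeweler
$ rake gemspec    # regenerate hadoop-rubydsl.gemspec from the block above
$ rake build      # package pkg/hadoop-rubydsl-0.0.1.gem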
data/TODO
ADDED
data/VERSION
ADDED
@@ -0,0 +1 @@
+0.0.1
data/bin/hadoop
ADDED
@@ -0,0 +1,276 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# The Hadoop command script
+#
+# Environment Variables
+#
+#   JAVA_HOME            The java implementation to use.  Overrides JAVA_HOME.
+#
+#   HADOOP_CLASSPATH     Extra Java CLASSPATH entries.
+#
+#   HADOOP_HEAPSIZE      The maximum amount of heap to use, in MB.
+#                        Default is 1000.
+#
+#   HADOOP_OPTS          Extra Java runtime options.
+#
+#   HADOOP_NAMENODE_OPTS       These options are added to HADOOP_OPTS
+#   HADOOP_CLIENT_OPTS         when the respective command is run.
+#   HADOOP_{COMMAND}_OPTS etc  HADOOP_JT_OPTS applies to JobTracker
+#                              for e.g.  HADOOP_CLIENT_OPTS applies to
+#                              more than one command (fs, dfs, fsck,
+#                              dfsadmin etc)
+#
+#   HADOOP_CONF_DIR      Alternate conf dir. Default is ${HADOOP_HOME}/conf.
+#
+#   HADOOP_ROOT_LOGGER   The root appender. Default is INFO,console
+#
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+if [ -f "$bin"/hadoop-config.sh ]; then
+  . "$bin"/hadoop-config.sh
+fi
+
+cygwin=false
+case "`uname`" in
+CYGWIN*) cygwin=true;;
+esac
+
+# if no args specified, show usage
+if [ $# = 0 ]; then
+  echo "Usage: hadoop [--config confdir] COMMAND"
+  echo "where COMMAND is one of:"
+  echo "  namenode -format     format the DFS filesystem"
+  echo "  secondarynamenode    run the DFS secondary namenode"
+  echo "  namenode             run the DFS namenode"
+  echo "  datanode             run a DFS datanode"
+  echo "  dfsadmin             run a DFS admin client"
+  echo "  fsck                 run a DFS filesystem checking utility"
+  echo "  fs                   run a generic filesystem user client"
+  echo "  balancer             run a cluster balancing utility"
+  echo "  jobtracker           run the MapReduce job Tracker node"
+  echo "  pipes                run a Pipes job"
+  echo "  tasktracker          run a MapReduce task Tracker node"
+  echo "  job                  manipulate MapReduce jobs"
+  echo "  queue                get information regarding JobQueues"
+  echo "  version              print the version"
+  echo "  jar <jar>            run a jar file"
+  echo "  distcp <srcurl> <desturl> copy file or directories recursively"
+  echo "  archive -archiveName NAME <src>* <dest> create a hadoop archive"
+  echo "  daemonlog            get/set the log level for each daemon"
+  echo " or"
+  echo "  CLASSNAME            run the class named CLASSNAME"
+  echo "Most commands print help when invoked w/o parameters."
+  exit 1
+fi
+
+# get arguments
+COMMAND=$1
+shift
+
+if [ -f "${HADOOP_CONF_DIR}/hadoop-env.sh" ]; then
+  . "${HADOOP_CONF_DIR}/hadoop-env.sh"
+fi
+
+# some Java parameters
+if [ "$JAVA_HOME" != "" ]; then
+  #echo "run java in $JAVA_HOME"
+  JAVA_HOME=$JAVA_HOME
+fi
+
+if [ "$JAVA_HOME" = "" ]; then
+  echo "Error: JAVA_HOME is not set."
+  exit 1
+fi
+
+JAVA=$JAVA_HOME/bin/java
+JAVA_HEAP_MAX=-Xmx1000m
+
+# check envvars which might override default args
+if [ "$HADOOP_HEAPSIZE" != "" ]; then
+  #echo "run with heapsize $HADOOP_HEAPSIZE"
+  JAVA_HEAP_MAX="-Xmx""$HADOOP_HEAPSIZE""m"
+  #echo $JAVA_HEAP_MAX
+fi
+
+# CLASSPATH initially contains $HADOOP_CONF_DIR
+CLASSPATH="${HADOOP_CONF_DIR}"
+CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar
+
+# for developers, add Hadoop classes to CLASSPATH
+if [ -d "$HADOOP_HOME/build/classes" ]; then
+  CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/classes
+fi
+if [ -d "$HADOOP_HOME/build/webapps" ]; then
+  CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build
+fi
+if [ -d "$HADOOP_HOME/build/test/classes" ]; then
+  CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/test/classes
+fi
+if [ -d "$HADOOP_HOME/build/tools" ]; then
+  CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/tools
+fi
+
+# so that filenames w/ spaces are handled correctly in loops below
+IFS=
+
+# for releases, add core hadoop jar & webapps to CLASSPATH
+if [ -d "$HADOOP_HOME/webapps" ]; then
+  CLASSPATH=${CLASSPATH}:$HADOOP_HOME
+fi
+for f in $HADOOP_HOME/hadoop-*-core.jar; do
+  CLASSPATH=${CLASSPATH}:$f;
+done
+
+# add libs to CLASSPATH
+for f in $HADOOP_HOME/lib/*.jar; do
+  CLASSPATH=${CLASSPATH}:$f;
+done
+
+for f in $HADOOP_HOME/lib/jetty-ext/*.jar; do
+  CLASSPATH=${CLASSPATH}:$f;
+done
+
+for f in $HADOOP_HOME/hadoop-*-tools.jar; do
+  TOOL_PATH=${TOOL_PATH}:$f;
+done
+for f in $HADOOP_HOME/build/hadoop-*-tools.jar; do
+  TOOL_PATH=${TOOL_PATH}:$f;
+done
+
+# add user-specified CLASSPATH last
+if [ "$HADOOP_CLASSPATH" != "" ]; then
+  CLASSPATH=${CLASSPATH}:${HADOOP_CLASSPATH}
+fi
+
+# default log directory & file
+if [ "$HADOOP_LOG_DIR" = "" ]; then
+  HADOOP_LOG_DIR="$HADOOP_HOME/logs"
+fi
+if [ "$HADOOP_LOGFILE" = "" ]; then
+  HADOOP_LOGFILE='hadoop.log'
+fi
+
+# restore ordinary behaviour
+unset IFS
+
+# figure out which class to run
+if [ "$COMMAND" = "namenode" ] ; then
+  CLASS='org.apache.hadoop.hdfs.server.namenode.NameNode'
+  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_NAMENODE_OPTS"
+elif [ "$COMMAND" = "secondarynamenode" ] ; then
+  CLASS='org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode'
+  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_SECONDARYNAMENODE_OPTS"
+elif [ "$COMMAND" = "datanode" ] ; then
+  CLASS='org.apache.hadoop.hdfs.server.datanode.DataNode'
+  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_DATANODE_OPTS"
+elif [ "$COMMAND" = "fs" ] ; then
+  CLASS=org.apache.hadoop.fs.FsShell
+  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "dfs" ] ; then
+  CLASS=org.apache.hadoop.fs.FsShell
+  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "dfsadmin" ] ; then
+  CLASS=org.apache.hadoop.hdfs.tools.DFSAdmin
+  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "fsck" ] ; then
+  CLASS=org.apache.hadoop.hdfs.tools.DFSck
+  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "balancer" ] ; then
+  CLASS=org.apache.hadoop.hdfs.server.balancer.Balancer
+  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_BALANCER_OPTS"
+elif [ "$COMMAND" = "jobtracker" ] ; then
+  CLASS=org.apache.hadoop.mapred.JobTracker
+  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_JOBTRACKER_OPTS"
+elif [ "$COMMAND" = "tasktracker" ] ; then
+  CLASS=org.apache.hadoop.mapred.TaskTracker
+  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_TASKTRACKER_OPTS"
+elif [ "$COMMAND" = "job" ] ; then
+  CLASS=org.apache.hadoop.mapred.JobClient
+elif [ "$COMMAND" = "queue" ] ; then
+  CLASS=org.apache.hadoop.mapred.JobQueueClient
+elif [ "$COMMAND" = "pipes" ] ; then
+  CLASS=org.apache.hadoop.mapred.pipes.Submitter
+  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "version" ] ; then
+  CLASS=org.apache.hadoop.util.VersionInfo
+  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "jar" ] ; then
+  CLASS=org.apache.hadoop.mapred.JobShell
+elif [ "$COMMAND" = "distcp" ] ; then
+  CLASS=org.apache.hadoop.tools.DistCp
+  CLASSPATH=${CLASSPATH}:${TOOL_PATH}
+  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "daemonlog" ] ; then
+  CLASS=org.apache.hadoop.log.LogLevel
+  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "archive" ] ; then
+  CLASS=org.apache.hadoop.tools.HadoopArchives
+  CLASSPATH=${CLASSPATH}:${TOOL_PATH}
+  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+elif [ "$COMMAND" = "sampler" ] ; then
+  CLASS=org.apache.hadoop.mapred.lib.InputSampler
+  HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+else
+  CLASS=$COMMAND
+fi
+
+# cygwin path translation
+if $cygwin; then
+  CLASSPATH=`cygpath -p -w "$CLASSPATH"`
+  HADOOP_HOME=`cygpath -d "$HADOOP_HOME"`
+  HADOOP_LOG_DIR=`cygpath -d "$HADOOP_LOG_DIR"`
+  TOOL_PATH=`cygpath -p -w "$TOOL_PATH"`
+fi
+# setup 'java.library.path' for native-hadoop code if necessary
+JAVA_LIBRARY_PATH=''
+if [ -d "${HADOOP_HOME}/build/native" -o -d "${HADOOP_HOME}/lib/native" ]; then
+  JAVA_PLATFORM=`CLASSPATH=${CLASSPATH} ${JAVA} org.apache.hadoop.util.PlatformName | sed -e "s/ /_/g"`
+
+  if [ -d "$HADOOP_HOME/build/native" ]; then
+    JAVA_LIBRARY_PATH=${HADOOP_HOME}/build/native/${JAVA_PLATFORM}/lib
+  fi
+
+  if [ -d "${HADOOP_HOME}/lib/native" ]; then
+    if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+      JAVA_LIBRARY_PATH=${JAVA_LIBRARY_PATH}:${HADOOP_HOME}/lib/native/${JAVA_PLATFORM}
+    else
+      JAVA_LIBRARY_PATH=${HADOOP_HOME}/lib/native/${JAVA_PLATFORM}
+    fi
+  fi
+fi
+
+# cygwin path translation
+if $cygwin; then
+  JAVA_LIBRARY_PATH=`cygpath -p "$JAVA_LIBRARY_PATH"`
+fi
+
+HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.log.dir=$HADOOP_LOG_DIR"
+HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.log.file=$HADOOP_LOGFILE"
+HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.home.dir=$HADOOP_HOME"
+HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.id.str=$HADOOP_IDENT_STRING"
+HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.root.logger=${HADOOP_ROOT_LOGGER:-INFO,console}"
+if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+  HADOOP_OPTS="$HADOOP_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"
+fi
+
+# run it
+#echo exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@"
+exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@"
data/bin/hadoop-ruby.sh
ADDED
@@ -0,0 +1,30 @@
+#!/bin/bash
+BIN_DIR=`dirname "$0"`
+BASE_DIR=`cd $BIN_DIR/..; pwd`
+
+# choose the hadoop launcher
+HADOOP=$HADOOP_HOME/bin/hadoop
+if [ ! -f $HADOOP ]; then
+  HADOOP=$BIN_DIR/hadoop
+  #HADOOP_OPTS="--config $BASE_DIR/conf"
+fi
+
+# fetch the jruby jar if it does not exist
+LIB_DIR=$BASE_DIR/lib/java
+JRUBY_JAR=jruby-complete-1.4.0.jar
+if [ ! -f "$LIB_DIR/$JRUBY_JAR" ]; then
+  wget http://jruby.kenai.com/downloads/1.4.0/jruby-complete-1.4.0.jar
+  mv $JRUBY_JAR $LIB_DIR/
+fi
+
+# construct the command line
+HADOOP_RUBY_LIB_DIR=$BASE_DIR/lib
+export HADOOP_CLASSPATH=$HADOOP_RUBY_LIB_DIR
+for x in `ls $HADOOP_RUBY_LIB_DIR`; do
+  DSL_FILES=$HADOOP_RUBY_LIB_DIR/$x,$DSL_FILES
+done
+DSL_FILES=$DSL_FILES$1
+
+# execute the hadoop ruby job
+echo running $1...
+$HADOOP $HADOOP_OPTS jar $LIB_DIR/hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner -libjars $LIB_DIR/$JRUBY_JAR -files $DSL_FILES $1 $2 $3
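Tracing the script above: it prefers $HADOOP_HOME/bin/hadoop (falling back to the bundled bin/hadoop), downloads jruby-complete-1.4.0.jar into lib/java on first use, and joins every file under lib/ into a comma-separated -files list so the DSL runtime ships with the job. A run such as bin/hadoop-ruby.sh examples/apachelog.rb inputs outputs therefore expands to roughly (file list abbreviated):

$ $HADOOP jar lib/java/hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner \
    -libjars lib/java/jruby-complete-1.4.0.jar \
    -files lib/core.rb,lib/init.rb,...,examples/apachelog.rb \
    examples/apachelog.rb inputs outputs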
data/conf/hadoop-site.xml
ADDED
@@ -0,0 +1,19 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+<property>
+  <name>fs.default.name</name>
+  <value>hdfs://localhost:9000/</value>
+</property>
+<property>
+  <name>mapred.job.tracker</name>
+  <value>localhost:50040</value>
+</property>
+<property>
+  <name>mapred.child.java.opts</name>
+  <value>-Xmx512m</value>
+</property>
+</configuration>
data/examples/apachelog-v2-2.rb
ADDED
@@ -0,0 +1,18 @@
+use 'LogAnalysis'
+
+data 'apache log on test2' do
+  from 'apachelog/inputs'
+  to 'apachelog/outputs'
+
+  # 119.63.199.8 - - [15/Nov/2009:01:18:16 +0900] "GET /ranking/game?page=31 HTTP/1.1" 200 10077 "-" "Baiduspider+(+http://www.baidu.jp/spider/)"
+  # 203.83.243.81 - - [15/Nov/2009:01:18:33 +0900] "GET /dns_zones.txt HTTP/1.1" 404 294 "-" "libwww-perl/5.65"
+
+  each_line do
+    pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*) (.*) "(.*)"/
+    column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes', 'pass', 'ua'
+
+    topic 'ua counts', :label => 'ua' do
+      count_uniq column[:ua]
+    end
+  end
+end
data/examples/apachelog-v2.rb
ADDED
@@ -0,0 +1,25 @@
+use 'LogAnalysis'
+
+data 'apache log on test1' do
+  from 'apachlog/inputs'
+  to 'apachlog/outputs'
+
+  each_line do
+    pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/
+    column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes' # label each column
+
+    topic 'which users?', :label => 'user' do
+      count_uniq column[:user]
+    end
+
+#    topic 'access date by monthly' do
+#      select_date column[:access_date], BY_MONTHLY
+#      count column[:access_date]
+#    end
+#
+#    topic 'total bytes' do
+#      select_date column[:access_date], BY_MONTHLY
+#      sum column[:bytes].to_kilobytes # / 1024
+#    end
+  end
+end
data/examples/apachelog.rb
ADDED
@@ -0,0 +1,15 @@
+# Apache log analysis
+#
+# example target data:
+# 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
+# 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb2.gif HTTP/1.0" 200 2326
+# 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb3.gif HTTP/1.0" 404 2326
+
+use 'LogAnalysis'
+
+data.pattern /(.*) (.*) (.*) (\[.*\]) (".*") (\d*) (\d*)/
+column[2].count_uniq
+column[3].count_uniq
+column[4].count_uniq
+column[5].count_uniq
+column[6].sum
data/examples/hive_like_test.rb
ADDED
@@ -0,0 +1,14 @@
+use 'HiveLike'
+
+# hive-like/items.txt
+# apple, 3, 100
+# banana, 1, 50
+
+create_table items(item STRING, quantity INT, price INT);
+load_data "hive-like/items.txt" items;
+
+select quantity, price, item from items;
+
+# expect
+# 0 apple 3 300
+# 1 banana 1 50
data/hadoop-rubydsl.gemspec
ADDED
@@ -0,0 +1,79 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE DIRECTLY
+# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
+# -*- encoding: utf-8 -*-
+
+Gem::Specification.new do |s|
+  s.name = %q{hadoop-rubydsl}
+  s.version = "0.0.1"
+
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Koichi Fujikawa"]
+  s.date = %q{2009-12-26}
+  s.description = %q{Hadoop Ruby DSL}
+  s.email = %q{fujibee@gmail.com}
+  s.executables = ["hadoop", "hadoop-ruby.sh"]
+  s.extra_rdoc_files = [
+    "README",
+    "TODO"
+  ]
+  s.files = [
+    "README",
+    "Rakefile",
+    "TODO",
+    "VERSION",
+    "bin/hadoop",
+    "bin/hadoop-ruby.sh",
+    "conf/hadoop-site.xml",
+    "examples/apachelog-v2-2.rb",
+    "examples/apachelog-v2.rb",
+    "examples/apachelog.rb",
+    "examples/hive_like_test.rb",
+    "examples/word_count_test.rb",
+    "hadoop-rubydsl.gemspec",
+    "lib/core.rb",
+    "lib/hive_like.rb",
+    "lib/init.rb",
+    "lib/java/.gitignore",
+    "lib/java/hadoop-ruby.jar",
+    "lib/log_analysis.rb",
+    "lib/mapred_factory.rb",
+    "lib/util.rb",
+    "lib/word_count.rb"
+  ]
+  s.homepage = %q{http://github.com/fujibee/hadoop-rubydsl}
+  s.rdoc_options = ["--charset=UTF-8"]
+  s.require_paths = ["lib"]
+  s.rubygems_version = %q{1.3.5}
+  s.summary = %q{Hadoop Ruby DSL}
+  s.test_files = [
+    "spec/spec_helper.rb",
+    "spec/core_spec.rb",
+    "spec/util_spec.rb",
+    "spec/mapred_factory_spec.rb",
+    "spec/word_count_spec.rb",
+    "spec/hive_like_spec.rb",
+    "spec/log_analysis_spec.rb",
+    "spec/example_spec.rb",
+    "spec/init_spec.rb",
+    "examples/apachelog-v2.rb",
+    "examples/hive_like_test.rb",
+    "examples/word_count_test.rb",
+    "examples/apachelog-v2-2.rb",
+    "examples/apachelog.rb"
+  ]
+
+  if s.respond_to? :specification_version then
+    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+    s.specification_version = 3
+
+    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+      s.add_runtime_dependency(%q<jruby-on-hadoop>, [">= 0"])
+    else
+      s.add_dependency(%q<jruby-on-hadoop>, [">= 0"])
+    end
+  else
+    s.add_dependency(%q<jruby-on-hadoop>, [">= 0"])
+  end
+end
+
data/lib/core.rb
ADDED
@@ -0,0 +1,108 @@
+require 'util'
+require 'forwardable'
+
+module HadoopDsl
+  # controller
+  class BaseMapRed
+    extend Forwardable
+
+    attr_reader :emitted
+
+    def initialize(script, model)
+      @script, @model = script, model
+      @model.controller = self
+      @emitted = []
+    end
+
+    def run
+      body = pre_process(read_file(@script))
+      eval(body, binding, @script)
+    end
+
+    def pre_process(body)
+      body # do nothing
+    end
+
+    def emit(hash) @emitted << hash end
+
+    # any DSL statement without a method definition is handled here
+    def method_missing(method_name, *args) self end
+  end
+
+  class BaseSetup
+    def initialize(script, conf)
+      @script, @conf = script, conf
+      output_format
+    end
+
+    def run
+      body = pre_process(read_file(@script))
+      eval(body, binding, @script)
+    end
+
+    def pre_process(body)
+      body # do nothing
+    end
+
+    # do nothing
+    def output_format; end
+
+    def paths; [@from, @to] end
+
+    def from(path) @from = path end
+    def to(path) @to = path end
+
+    # any DSL statement without a method definition is handled here
+    def method_missing(method_name, *args) self end
+  end
+
+  class BaseMapper < BaseMapRed
+    def initialize(script, model)
+      super(script, model)
+    end
+  end
+
+  class BaseReducer < BaseMapRed
+    def initialize(script, model)
+      super(script, model)
+    end
+  end
+
+  # model
+  class BaseModel
+    attr_accessor :controller
+
+    # any DSL statement without a method definition is handled here
+    def method_missing(method_name, *args) self end
+  end
+
+  class BaseMapperModel < BaseModel
+    attr_reader :key, :value
+
+    def initialize(key, value)
+      @key, @value = key, value
+    end
+
+    # common functions
+    def identity
+      @controller.emit(@key => @value)
+    end
+  end
+
+  class BaseReducerModel < BaseModel
+    attr_reader :key, :values
+
+    def initialize(key, values)
+      @key, @values = key, values
+    end
+
+    # common functions
+    def aggregate
+      @controller.emit(@key => @values.inject {|ret, i| ret + i})
+    end
+
+    def identity
+      @values.each {|v| @controller.emit(@key => v)}
+    end
+  end
+end
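lib/core.rb is the heart of the DSL: BaseMapRed evals the user script, the model calls back into its controller via emit, and method_missing silently absorbs DSL statements that a given phase has no handler for. A minimal sketch of that flow (not part of the gem; it assumes you run from the gem root so lib/core.rb and lib/util.rb are loadable, and 'dummy.rb' is a placeholder name since run/read_file is never invoked):

$LOAD_PATH.unshift 'lib'
require 'core'
include HadoopDsl

# reducer side: aggregate sums the values grouped under one key
rmodel  = BaseReducerModel.new('apple', [1, 2, 3])
reducer = BaseReducer.new('dummy.rb', rmodel)  # wires rmodel.controller = reducer
rmodel.aggregate
p reducer.emitted   # => [{"apple"=>6}]

# mapper side: identity re-emits the incoming key/value pair unchanged
mmodel = BaseMapperModel.new('key1', 'val1')
mapper = BaseMapper.new('dummy.rb', mmodel)
mmodel.identity
p mapper.emitted    # => [{"key1"=>"val1"}]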