hadoop-papyrus 0.0.6

@@ -0,0 +1 @@
+ pkg
@@ -0,0 +1,58 @@
+ = hadoop-papyrus
+
+ Run Ruby DSL scripts on your Hadoop cluster.
+
+ == Description
+
+ You can write a DSL in Ruby and run it on Hadoop as a Mapper / Reducer.
+ This gem depends on the 'jruby-on-hadoop' project.
+
+ == Install
+
+ All required gems are on Gemcutter.
+
+ 1. Upgrade RubyGems to 1.3.5 or later
+ 2. Install the gem:
+      $ gem install hadoop-papyrus
+
+ == Usage
+
+ 1. Run a Hadoop cluster on your machines and put the 'hadoop' executable on your PATH, or set the HADOOP_HOME environment variable.
+ 2. Put your input files into HDFS, e.g. wc/inputs/file1
+ 3. Now you can run 'papyrus' like this:
+      $ papyrus examples/word_count_test.rb
+    The job results appear in your HDFS under wc/outputs/part-*
+
+ == Examples
+
+ Word Count DSL script:
+   dsl 'WordCount'
+
+   from 'wc/inputs'
+   to 'wc/outputs'
+
+   count_uniq
+   total :bytes, :words, :lines
+
+ Log Analysis DSL script:
+   dsl 'LogAnalysis'
+
+   data 'apache log on test2' do
+     from 'apachelog/inputs'
+     to 'apachelog/outputs'
+
+     each_line do
+       pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*) (.*) "(.*)"/
+       column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes', 'pass', 'ua'
+
+       topic 'ua counts', :label => 'ua' do
+         count_uniq column[:ua]
+       end
+     end
+   end
+
+ == Author
+ Koichi Fujikawa <fujibee@gmail.com>
+
+ == Copyright
+ License: Apache License
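The WordCount script above only declares what to count and where to read and write; hadoop-papyrus (via jruby-on-hadoop) turns those declarations into a Hadoop Mapper / Reducer. As a rough illustration of the work being abstracted away, here is a plain-Ruby sketch of an equivalent word-count map/reduce pass. It is not the code the gem actually generates, and the helper names are made up for this sketch:

    # Plain-Ruby sketch (no Hadoop): roughly what count_uniq and
    # total :bytes, :words, :lines boil down to for each input line.

    # "Mapper": emit [key, count] pairs for one line of input.
    def map_line(line)
      words = line.split(/\s+/)
      words.map { |w| [w, 1] } +
        [['total words', words.size], ['total lines', 1], ['total bytes', line.bytesize]]
    end

    # "Reducer": sum the values emitted for each key.
    def reduce(pairs)
      pairs.each_with_object(Hash.new(0)) { |(key, value), sums| sums[key] += value }
    end

    lines = ['hello hadoop', 'hello papyrus']
    reduce(lines.flat_map { |line| map_line(line) }).each do |key, count|
      puts "#{key}\t#{count}"
    end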
@@ -0,0 +1,18 @@
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gemspec|
+     gemspec.name = "hadoop-papyrus"
+     gemspec.summary = "Hadoop papyrus"
+     gemspec.description = "Hadoop papyrus - Ruby DSL for Hadoop"
+     gemspec.email = "fujibee@gmail.com"
+     gemspec.homepage = "http://github.com/fujibee/hadoop-papyrus"
+     gemspec.authors = ["Koichi Fujikawa"]
+
+     gemspec.add_dependency 'jruby-on-hadoop'
+     gemspec.files.exclude "spec/**/*"
+   end
+   Jeweler::GemcutterTasks.new
+ rescue LoadError
+   puts "Jeweler not available. Install it with: gem install jeweler"
+ end
+
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.0.6
@@ -0,0 +1,5 @@
+ #!/usr/bin/env ruby
+
+ require 'hadoop_dsl_client'
+
+ HadoopDsl::Client.new(ARGV).run
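The papyrus launcher above is deliberately thin: it just hands ARGV to HadoopDsl::Client, which comes from the jruby-on-hadoop dependency. The actual client code is not part of this gem, but conceptually it has to do what the bundled bin/hadoop-ruby.sh script later in this diff does: ship the DSL script and the Ruby libraries to Hadoop and start org.apache.hadoop.ruby.JRubyJobRunner. A hypothetical sketch of that flow in Ruby (class and paths invented for illustration, not the real HadoopDsl::Client):

    # Hypothetical sketch only; the real implementation lives in the
    # jruby-on-hadoop gem's HadoopDsl::Client. It mirrors bin/hadoop-ruby.sh:
    # collect the Ruby libs and the DSL script, then launch JRubyJobRunner.
    class PapyrusClientSketch
      def initialize(argv)
        @script = argv.first or abort 'usage: papyrus DSL_SCRIPT'
        @base   = File.expand_path('../..', __FILE__)            # assumed gem layout
        @hadoop = ENV['HADOOP_HOME'] ? "#{ENV['HADOOP_HOME']}/bin/hadoop" : 'hadoop'
      end

      def run
        lib_dir = File.join(@base, 'lib')
        jar_dir = File.join(lib_dir, 'java')
        files   = (Dir["#{lib_dir}/*.rb"] + [@script]).join(',')   # becomes the -files list
        system(@hadoop, 'jar', File.join(jar_dir, 'hadoop-ruby.jar'),
               'org.apache.hadoop.ruby.JRubyJobRunner',
               '-libjars', File.join(jar_dir, 'jruby-complete-1.4.0.jar'),
               '-files', files, @script)
      end
    end

    PapyrusClientSketch.new(ARGV).run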
@@ -0,0 +1,19 @@
+ <?xml version="1.0"?>
+ <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+ <!-- Put site-specific property overrides in this file. -->
+
+ <configuration>
+   <property>
+     <name>fs.default.name</name>
+     <value>hdfs://localhost:9000/</value>
+   </property>
+   <property>
+     <name>mapred.job.tracker</name>
+     <value>localhost:50040</value>
+   </property>
+   <property>
+     <name>mapred.child.java.opts</name>
+     <value>-Xmx512m</value>
+   </property>
+ </configuration>
@@ -0,0 +1,276 @@
+ #!/usr/bin/env bash
+
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements.  See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License.  You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ # The Hadoop command script
+ #
+ # Environment Variables
+ #
+ #   JAVA_HOME            The java implementation to use.  Overrides JAVA_HOME.
+ #
+ #   HADOOP_CLASSPATH     Extra Java CLASSPATH entries.
+ #
+ #   HADOOP_HEAPSIZE      The maximum amount of heap to use, in MB.
+ #                        Default is 1000.
+ #
+ #   HADOOP_OPTS          Extra Java runtime options.
+ #
+ #   HADOOP_NAMENODE_OPTS       These options are added to HADOOP_OPTS
+ #   HADOOP_CLIENT_OPTS         when the respective command is run.
+ #   HADOOP_{COMMAND}_OPTS etc  HADOOP_JT_OPTS applies to JobTracker
+ #                              for e.g.  HADOOP_CLIENT_OPTS applies to
+ #                              more than one command (fs, dfs, fsck,
+ #                              dfsadmin etc)
+ #
+ #   HADOOP_CONF_DIR      Alternate conf dir. Default is ${HADOOP_HOME}/conf.
+ #
+ #   HADOOP_ROOT_LOGGER   The root appender. Default is INFO,console
+ #
+
+ bin=`dirname "$0"`
+ bin=`cd "$bin"; pwd`
+
+ if [ -f "$bin"/hadoop-config.sh ]; then
+   . "$bin"/hadoop-config.sh
+ fi
+
+ cygwin=false
+ case "`uname`" in
+   CYGWIN*) cygwin=true;;
+ esac
+
+ # if no args specified, show usage
+ if [ $# = 0 ]; then
+   echo "Usage: hadoop [--config confdir] COMMAND"
+   echo "where COMMAND is one of:"
+   echo "  namenode -format     format the DFS filesystem"
+   echo "  secondarynamenode    run the DFS secondary namenode"
+   echo "  namenode             run the DFS namenode"
+   echo "  datanode             run a DFS datanode"
+   echo "  dfsadmin             run a DFS admin client"
+   echo "  fsck                 run a DFS filesystem checking utility"
+   echo "  fs                   run a generic filesystem user client"
+   echo "  balancer             run a cluster balancing utility"
+   echo "  jobtracker           run the MapReduce job Tracker node"
+   echo "  pipes                run a Pipes job"
+   echo "  tasktracker          run a MapReduce task Tracker node"
+   echo "  job                  manipulate MapReduce jobs"
+   echo "  queue                get information regarding JobQueues"
+   echo "  version              print the version"
+   echo "  jar <jar>            run a jar file"
+   echo "  distcp <srcurl> <desturl> copy file or directories recursively"
+   echo "  archive -archiveName NAME <src>* <dest> create a hadoop archive"
+   echo "  daemonlog            get/set the log level for each daemon"
+   echo " or"
+   echo "  CLASSNAME            run the class named CLASSNAME"
+   echo "Most commands print help when invoked w/o parameters."
+   exit 1
+ fi
+
+ # get arguments
+ COMMAND=$1
+ shift
+
+ if [ -f "${HADOOP_CONF_DIR}/hadoop-env.sh" ]; then
+   . "${HADOOP_CONF_DIR}/hadoop-env.sh"
+ fi
+
+ # some Java parameters
+ if [ "$JAVA_HOME" != "" ]; then
+   #echo "run java in $JAVA_HOME"
+   JAVA_HOME=$JAVA_HOME
+ fi
+
+ if [ "$JAVA_HOME" = "" ]; then
+   echo "Error: JAVA_HOME is not set."
+   exit 1
+ fi
+
+ JAVA=$JAVA_HOME/bin/java
+ JAVA_HEAP_MAX=-Xmx1000m
+
+ # check envvars which might override default args
+ if [ "$HADOOP_HEAPSIZE" != "" ]; then
+   #echo "run with heapsize $HADOOP_HEAPSIZE"
+   JAVA_HEAP_MAX="-Xmx""$HADOOP_HEAPSIZE""m"
+   #echo $JAVA_HEAP_MAX
+ fi
+
+ # CLASSPATH initially contains $HADOOP_CONF_DIR
+ CLASSPATH="${HADOOP_CONF_DIR}"
+ CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar
+
+ # for developers, add Hadoop classes to CLASSPATH
+ if [ -d "$HADOOP_HOME/build/classes" ]; then
+   CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/classes
+ fi
+ if [ -d "$HADOOP_HOME/build/webapps" ]; then
+   CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build
+ fi
+ if [ -d "$HADOOP_HOME/build/test/classes" ]; then
+   CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/test/classes
+ fi
+ if [ -d "$HADOOP_HOME/build/tools" ]; then
+   CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/tools
+ fi
+
+ # so that filenames w/ spaces are handled correctly in loops below
+ IFS=
+
+ # for releases, add core hadoop jar & webapps to CLASSPATH
+ if [ -d "$HADOOP_HOME/webapps" ]; then
+   CLASSPATH=${CLASSPATH}:$HADOOP_HOME
+ fi
+ for f in $HADOOP_HOME/hadoop-*-core.jar; do
+   CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ # add libs to CLASSPATH
+ for f in $HADOOP_HOME/lib/*.jar; do
+   CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ for f in $HADOOP_HOME/lib/jetty-ext/*.jar; do
+   CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ for f in $HADOOP_HOME/hadoop-*-tools.jar; do
+   TOOL_PATH=${TOOL_PATH}:$f;
+ done
+ for f in $HADOOP_HOME/build/hadoop-*-tools.jar; do
+   TOOL_PATH=${TOOL_PATH}:$f;
+ done
+
+ # add user-specified CLASSPATH last
+ if [ "$HADOOP_CLASSPATH" != "" ]; then
+   CLASSPATH=${CLASSPATH}:${HADOOP_CLASSPATH}
+ fi
+
+ # default log directory & file
+ if [ "$HADOOP_LOG_DIR" = "" ]; then
+   HADOOP_LOG_DIR="$HADOOP_HOME/logs"
+ fi
+ if [ "$HADOOP_LOGFILE" = "" ]; then
+   HADOOP_LOGFILE='hadoop.log'
+ fi
+
+ # restore ordinary behaviour
+ unset IFS
+
+ # figure out which class to run
+ if [ "$COMMAND" = "namenode" ] ; then
+   CLASS='org.apache.hadoop.hdfs.server.namenode.NameNode'
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_NAMENODE_OPTS"
+ elif [ "$COMMAND" = "secondarynamenode" ] ; then
+   CLASS='org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode'
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_SECONDARYNAMENODE_OPTS"
+ elif [ "$COMMAND" = "datanode" ] ; then
+   CLASS='org.apache.hadoop.hdfs.server.datanode.DataNode'
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_DATANODE_OPTS"
+ elif [ "$COMMAND" = "fs" ] ; then
+   CLASS=org.apache.hadoop.fs.FsShell
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "dfs" ] ; then
+   CLASS=org.apache.hadoop.fs.FsShell
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "dfsadmin" ] ; then
+   CLASS=org.apache.hadoop.hdfs.tools.DFSAdmin
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "fsck" ] ; then
+   CLASS=org.apache.hadoop.hdfs.tools.DFSck
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "balancer" ] ; then
+   CLASS=org.apache.hadoop.hdfs.server.balancer.Balancer
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_BALANCER_OPTS"
+ elif [ "$COMMAND" = "jobtracker" ] ; then
+   CLASS=org.apache.hadoop.mapred.JobTracker
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_JOBTRACKER_OPTS"
+ elif [ "$COMMAND" = "tasktracker" ] ; then
+   CLASS=org.apache.hadoop.mapred.TaskTracker
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_TASKTRACKER_OPTS"
+ elif [ "$COMMAND" = "job" ] ; then
+   CLASS=org.apache.hadoop.mapred.JobClient
+ elif [ "$COMMAND" = "queue" ] ; then
+   CLASS=org.apache.hadoop.mapred.JobQueueClient
+ elif [ "$COMMAND" = "pipes" ] ; then
+   CLASS=org.apache.hadoop.mapred.pipes.Submitter
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "version" ] ; then
+   CLASS=org.apache.hadoop.util.VersionInfo
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "jar" ] ; then
+   CLASS=org.apache.hadoop.mapred.JobShell
+ elif [ "$COMMAND" = "distcp" ] ; then
+   CLASS=org.apache.hadoop.tools.DistCp
+   CLASSPATH=${CLASSPATH}:${TOOL_PATH}
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "daemonlog" ] ; then
+   CLASS=org.apache.hadoop.log.LogLevel
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "archive" ] ; then
+   CLASS=org.apache.hadoop.tools.HadoopArchives
+   CLASSPATH=${CLASSPATH}:${TOOL_PATH}
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "sampler" ] ; then
+   CLASS=org.apache.hadoop.mapred.lib.InputSampler
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ else
+   CLASS=$COMMAND
+ fi
+
+ # cygwin path translation
+ if $cygwin; then
+   CLASSPATH=`cygpath -p -w "$CLASSPATH"`
+   HADOOP_HOME=`cygpath -d "$HADOOP_HOME"`
+   HADOOP_LOG_DIR=`cygpath -d "$HADOOP_LOG_DIR"`
+   TOOL_PATH=`cygpath -p -w "$TOOL_PATH"`
+ fi
+ # setup 'java.library.path' for native-hadoop code if necessary
+ JAVA_LIBRARY_PATH=''
+ if [ -d "${HADOOP_HOME}/build/native" -o -d "${HADOOP_HOME}/lib/native" ]; then
+   JAVA_PLATFORM=`CLASSPATH=${CLASSPATH} ${JAVA} org.apache.hadoop.util.PlatformName | sed -e "s/ /_/g"`
+
+   if [ -d "$HADOOP_HOME/build/native" ]; then
+     JAVA_LIBRARY_PATH=${HADOOP_HOME}/build/native/${JAVA_PLATFORM}/lib
+   fi
+
+   if [ -d "${HADOOP_HOME}/lib/native" ]; then
+     if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+       JAVA_LIBRARY_PATH=${JAVA_LIBRARY_PATH}:${HADOOP_HOME}/lib/native/${JAVA_PLATFORM}
+     else
+       JAVA_LIBRARY_PATH=${HADOOP_HOME}/lib/native/${JAVA_PLATFORM}
+     fi
+   fi
+ fi
+
+ # cygwin path translation
+ if $cygwin; then
+   JAVA_LIBRARY_PATH=`cygpath -p "$JAVA_LIBRARY_PATH"`
+ fi
+
+ HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.log.dir=$HADOOP_LOG_DIR"
+ HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.log.file=$HADOOP_LOGFILE"
+ HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.home.dir=$HADOOP_HOME"
+ HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.id.str=$HADOOP_IDENT_STRING"
+ HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.root.logger=${HADOOP_ROOT_LOGGER:-INFO,console}"
+ if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+   HADOOP_OPTS="$HADOOP_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"
+ fi
+
+ # run it
+ #echo exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@"
+ exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@"
@@ -0,0 +1,30 @@
+ #!/bin/bash
+ BIN_DIR=`dirname "$0"`
+ BASE_DIR=`cd $BIN_DIR/..; pwd`
+
+ # choose hadoop sh
+ HADOOP=$HADOOP_HOME/bin/hadoop
+ if [ ! -f $HADOOP ]; then
+   HADOOP=$BIN_DIR/hadoop
+   #HADOOP_OPTS="--config $BASE_DIR/conf"
+ fi
+
+ # fetch the jruby jar if it does not exist yet
+ LIB_DIR=$BASE_DIR/lib/java
+ JRUBY_JAR=jruby-complete-1.4.0.jar
+ if [ ! -f "$LIB_DIR/$JRUBY_JAR" ]; then
+   wget http://jruby.kenai.com/downloads/1.4.0/jruby-complete-1.4.0.jar
+   mv $JRUBY_JAR $LIB_DIR/
+ fi
+
+ # construct command line
+ HADOOP_RUBY_LIB_DIR=$BASE_DIR/lib
+ export HADOOP_CLASSPATH=$HADOOP_RUBY_LIB_DIR
+ for x in `ls $HADOOP_RUBY_LIB_DIR`; do
+   DSL_FILES=$HADOOP_RUBY_LIB_DIR/$x,$DSL_FILES
+ done
+ DSL_FILES=$DSL_FILES$1
+
+ # execute hadoop ruby
+ echo running $1...
+ $HADOOP $HADOOP_OPTS jar $LIB_DIR/hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner -libjars $LIB_DIR/$JRUBY_JAR -files $DSL_FILES $1 $2 $3
@@ -0,0 +1,14 @@
+ dsl 'HiveLike'
+
+ # hive-like/items.txt
+ # apple, 3, 100
+ # banana, 1, 50
+
+ create_table items(item STRING, quantity INT, price INT);
+ load_data "hive-like/items.txt" items;
+
+ select quantity, price, item from items;
+
+ # expect
+ # 0 apple 3 300
+ # 1 banana 1 50
@@ -0,0 +1,43 @@
+ dsl 'LogAnalysis'
+
+ data 'apache log on test2' do
+   from 'apachelog/inputs'
+   to 'apachelog/outputs'
+
+   # 119.63.199.8 - - [15/Nov/2009:01:18:16 +0900] "GET /ranking/game?page=31 HTTP/1.1" 200 10077 "-" "Baiduspider+(+http://www.baidu.jp/spider/)"
+   # 203.83.243.81 - - [15/Nov/2009:01:18:33 +0900] "GET /dns_zones.txt HTTP/1.1" 404 294 "-" "libwww-perl/5.65"
+
+   each_line do
+     pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*) (.*) "(.*)"/
+     column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes', 'pass', 'ua'
+
+     topic 'ua counts', :label => 'ua' do
+       count_uniq column[:ua]
+     end
+
+     topic 'count bot', :label => 'bot' do
+       ua = column[:ua].value
+       bot = ua if ua =~ /bot/i
+       count_uniq bot
+     end
+
+     topic 'ua counts group by path' do
+       request = column[:request].value
+       if request
+         path = request.split(/\s+/)[1]
+         group_by path
+       end
+       count_uniq column[:ua]
+     end
+
+     topic 'ua counts by daily' do
+       # group_date_by column[:access_date], :daily
+       # count_uniq column[:ua]
+     end
+
+     # topic 'total bytes' do
+     #   select_date column[:access_date], BY_MONTHLY
+     #   sum column[:bytes].to_kilobytes # / 1024
+     # end
+   end
+ end