hadoop-rubydsl 0.0.1

data/README ADDED
@@ -0,0 +1,53 @@
+ = hadoop-rubydsl
+
+ == Description
+ Lets you write Hadoop Mappers/Reducers in a Ruby DSL.
+ It uses hadoop-ruby.jar.
+
+ Example:
+ apachelog.rb
+
+ # log:
+ # 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
+ # 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb2.gif HTTP/1.0" 200 2326
+ # 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb3.gif HTTP/1.0" 404 2326
+
+ use 'LogAnalysis'
+ data.pattern /(.*) (.*) (.*) (\[.*\]) (".*") (\d*) (\d*)/
+ column[2].count_uniq
+ column[3].count_uniq
+ column[4].count_uniq
+ column[5].count_uniq
+ column[6].sum
+
+ =>
+ col2 frank 1
+ col2 frank2 2
+ col3 [10/Oct/2000:13:55:36 -0700] 3
+ col4 "GET /apache_pb.gif HTTP/1.0" 1
+ col4 "GET /apache_pb2.gif HTTP/1.0" 1
+ col4 "GET /apache_pb3.gif HTTP/1.0" 1
+ col5 200 2
+ col5 404 1
+ col6 6978
+
+ == Usage
+ 0. Set HADOOP_HOME correctly and have a full Hadoop stack up and running.
+
+ 1. Copy jruby-complete-*.jar into lib/java
+ ex)
+ $ wget http://jruby.kenai.com/downloads/1.4.0RC2/jruby-complete-1.4.0RC2.jar
+ $ cp jruby-complete-*.jar lib/java/
+
+ 2. Upload your data to HDFS
+ ex)
+ $ hadoop dfs -copyFromLocal apachelog inputs/
+
+ 3. Run the MapReduce job
+ $ bin/hadoop-ruby.sh examples/apachelog.rb inputs outputs
+
+ == Author
+ Koichi Fujikawa <fujibee@gmail.com>
+
+ == Copyright
+ License: Apache License
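
A note on checking results (an addition, assuming Hadoop's default part-file output layout): after step 3 the reducer output lands in the outputs directory on HDFS, and can be read back with the same dfs shell used in step 2:

  $ hadoop dfs -cat outputs/part-*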
data/Rakefile ADDED
@@ -0,0 +1,18 @@
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gemspec|
+     gemspec.name = "hadoop-rubydsl"
+     gemspec.summary = "Hadoop Ruby DSL"
+     gemspec.description = "Hadoop Ruby DSL"
+     gemspec.email = "fujibee@gmail.com"
+     gemspec.homepage = "http://github.com/fujibee/hadoop-rubydsl"
+     gemspec.authors = ["Koichi Fujikawa"]
+
+     gemspec.add_dependency 'jruby-on-hadoop'
+     gemspec.files.exclude "spec/**/*"
+   end
+   Jeweler::GemcutterTasks.new
+ rescue LoadError
+   puts "Jeweler not available. Install it with: gem install jeweler"
+ end
+
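
Since the hadoop-rubydsl.gemspec further down is generated by jeweler (its own header says so), it should be regenerated from this Rakefile rather than edited by hand:

  $ rake gemspec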
data/TODO ADDED
@@ -0,0 +1,2 @@
+ * entire error handling
+ * "use" method does not accept double quotes
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.0.1
data/bin/hadoop ADDED
@@ -0,0 +1,276 @@
+ #!/usr/bin/env bash
+
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements.  See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License.  You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ # The Hadoop command script
+ #
+ # Environment Variables
+ #
+ #   JAVA_HOME            The java implementation to use.  Overrides JAVA_HOME.
+ #
+ #   HADOOP_CLASSPATH     Extra Java CLASSPATH entries.
+ #
+ #   HADOOP_HEAPSIZE      The maximum amount of heap to use, in MB.
+ #                        Default is 1000.
+ #
+ #   HADOOP_OPTS          Extra Java runtime options.
+ #
+ #   HADOOP_NAMENODE_OPTS       These options are added to HADOOP_OPTS
+ #   HADOOP_CLIENT_OPTS         when the respective command is run.
+ #   HADOOP_{COMMAND}_OPTS etc  HADOOP_JT_OPTS applies to JobTracker,
+ #                              e.g. HADOOP_CLIENT_OPTS applies to
+ #                              more than one command (fs, dfs, fsck,
+ #                              dfsadmin etc)
+ #
+ #   HADOOP_CONF_DIR      Alternate conf dir. Default is ${HADOOP_HOME}/conf.
+ #
+ #   HADOOP_ROOT_LOGGER   The root appender. Default is INFO,console
+ #
+
+ bin=`dirname "$0"`
+ bin=`cd "$bin"; pwd`
+
+ if [ -f "$bin"/hadoop-config.sh ]; then
+   . "$bin"/hadoop-config.sh
+ fi
+
+ cygwin=false
+ case "`uname`" in
+ CYGWIN*) cygwin=true;;
+ esac
+
+ # if no args specified, show usage
+ if [ $# = 0 ]; then
+   echo "Usage: hadoop [--config confdir] COMMAND"
+   echo "where COMMAND is one of:"
+   echo "  namenode -format     format the DFS filesystem"
+   echo "  secondarynamenode    run the DFS secondary namenode"
+   echo "  namenode             run the DFS namenode"
+   echo "  datanode             run a DFS datanode"
+   echo "  dfsadmin             run a DFS admin client"
+   echo "  fsck                 run a DFS filesystem checking utility"
+   echo "  fs                   run a generic filesystem user client"
+   echo "  balancer             run a cluster balancing utility"
+   echo "  jobtracker           run the MapReduce job Tracker node"
+   echo "  pipes                run a Pipes job"
+   echo "  tasktracker          run a MapReduce task Tracker node"
+   echo "  job                  manipulate MapReduce jobs"
+   echo "  queue                get information regarding JobQueues"
+   echo "  version              print the version"
+   echo "  jar <jar>            run a jar file"
+   echo "  distcp <srcurl> <desturl> copy file or directories recursively"
+   echo "  archive -archiveName NAME <src>* <dest> create a hadoop archive"
+   echo "  daemonlog            get/set the log level for each daemon"
+   echo " or"
+   echo "  CLASSNAME            run the class named CLASSNAME"
+   echo "Most commands print help when invoked w/o parameters."
+   exit 1
+ fi
+
+ # get arguments
+ COMMAND=$1
+ shift
+
+ if [ -f "${HADOOP_CONF_DIR}/hadoop-env.sh" ]; then
+   . "${HADOOP_CONF_DIR}/hadoop-env.sh"
+ fi
+
+ # some Java parameters
+ if [ "$JAVA_HOME" != "" ]; then
+   #echo "run java in $JAVA_HOME"
+   JAVA_HOME=$JAVA_HOME
+ fi
+
+ if [ "$JAVA_HOME" = "" ]; then
+   echo "Error: JAVA_HOME is not set."
+   exit 1
+ fi
+
+ JAVA=$JAVA_HOME/bin/java
+ JAVA_HEAP_MAX=-Xmx1000m
+
+ # check envvars which might override default args
+ if [ "$HADOOP_HEAPSIZE" != "" ]; then
+   #echo "run with heapsize $HADOOP_HEAPSIZE"
+   JAVA_HEAP_MAX="-Xmx""$HADOOP_HEAPSIZE""m"
+   #echo $JAVA_HEAP_MAX
+ fi
+
+ # CLASSPATH initially contains $HADOOP_CONF_DIR
+ CLASSPATH="${HADOOP_CONF_DIR}"
+ CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar
+
+ # for developers, add Hadoop classes to CLASSPATH
+ if [ -d "$HADOOP_HOME/build/classes" ]; then
+   CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/classes
+ fi
+ if [ -d "$HADOOP_HOME/build/webapps" ]; then
+   CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build
+ fi
+ if [ -d "$HADOOP_HOME/build/test/classes" ]; then
+   CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/test/classes
+ fi
+ if [ -d "$HADOOP_HOME/build/tools" ]; then
+   CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/tools
+ fi
+
+ # so that filenames w/ spaces are handled correctly in loops below
+ IFS=
+
+ # for releases, add core hadoop jar & webapps to CLASSPATH
+ if [ -d "$HADOOP_HOME/webapps" ]; then
+   CLASSPATH=${CLASSPATH}:$HADOOP_HOME
+ fi
+ for f in $HADOOP_HOME/hadoop-*-core.jar; do
+   CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ # add libs to CLASSPATH
+ for f in $HADOOP_HOME/lib/*.jar; do
+   CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ for f in $HADOOP_HOME/lib/jetty-ext/*.jar; do
+   CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ for f in $HADOOP_HOME/hadoop-*-tools.jar; do
+   TOOL_PATH=${TOOL_PATH}:$f;
+ done
+ for f in $HADOOP_HOME/build/hadoop-*-tools.jar; do
+   TOOL_PATH=${TOOL_PATH}:$f;
+ done
+
+ # add user-specified CLASSPATH last
+ if [ "$HADOOP_CLASSPATH" != "" ]; then
+   CLASSPATH=${CLASSPATH}:${HADOOP_CLASSPATH}
+ fi
+
+ # default log directory & file
+ if [ "$HADOOP_LOG_DIR" = "" ]; then
+   HADOOP_LOG_DIR="$HADOOP_HOME/logs"
+ fi
+ if [ "$HADOOP_LOGFILE" = "" ]; then
+   HADOOP_LOGFILE='hadoop.log'
+ fi
+
+ # restore ordinary behaviour
+ unset IFS
+
+ # figure out which class to run
+ if [ "$COMMAND" = "namenode" ] ; then
+   CLASS='org.apache.hadoop.hdfs.server.namenode.NameNode'
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_NAMENODE_OPTS"
+ elif [ "$COMMAND" = "secondarynamenode" ] ; then
+   CLASS='org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode'
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_SECONDARYNAMENODE_OPTS"
+ elif [ "$COMMAND" = "datanode" ] ; then
+   CLASS='org.apache.hadoop.hdfs.server.datanode.DataNode'
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_DATANODE_OPTS"
+ elif [ "$COMMAND" = "fs" ] ; then
+   CLASS=org.apache.hadoop.fs.FsShell
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "dfs" ] ; then
+   CLASS=org.apache.hadoop.fs.FsShell
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "dfsadmin" ] ; then
+   CLASS=org.apache.hadoop.hdfs.tools.DFSAdmin
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "fsck" ] ; then
+   CLASS=org.apache.hadoop.hdfs.tools.DFSck
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "balancer" ] ; then
+   CLASS=org.apache.hadoop.hdfs.server.balancer.Balancer
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_BALANCER_OPTS"
+ elif [ "$COMMAND" = "jobtracker" ] ; then
+   CLASS=org.apache.hadoop.mapred.JobTracker
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_JOBTRACKER_OPTS"
+ elif [ "$COMMAND" = "tasktracker" ] ; then
+   CLASS=org.apache.hadoop.mapred.TaskTracker
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_TASKTRACKER_OPTS"
+ elif [ "$COMMAND" = "job" ] ; then
+   CLASS=org.apache.hadoop.mapred.JobClient
+ elif [ "$COMMAND" = "queue" ] ; then
+   CLASS=org.apache.hadoop.mapred.JobQueueClient
+ elif [ "$COMMAND" = "pipes" ] ; then
+   CLASS=org.apache.hadoop.mapred.pipes.Submitter
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "version" ] ; then
+   CLASS=org.apache.hadoop.util.VersionInfo
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "jar" ] ; then
+   CLASS=org.apache.hadoop.mapred.JobShell
+ elif [ "$COMMAND" = "distcp" ] ; then
+   CLASS=org.apache.hadoop.tools.DistCp
+   CLASSPATH=${CLASSPATH}:${TOOL_PATH}
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "daemonlog" ] ; then
+   CLASS=org.apache.hadoop.log.LogLevel
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "archive" ] ; then
+   CLASS=org.apache.hadoop.tools.HadoopArchives
+   CLASSPATH=${CLASSPATH}:${TOOL_PATH}
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "sampler" ] ; then
+   CLASS=org.apache.hadoop.mapred.lib.InputSampler
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ else
+   CLASS=$COMMAND
+ fi
+
+ # cygwin path translation
+ if $cygwin; then
+   CLASSPATH=`cygpath -p -w "$CLASSPATH"`
+   HADOOP_HOME=`cygpath -d "$HADOOP_HOME"`
+   HADOOP_LOG_DIR=`cygpath -d "$HADOOP_LOG_DIR"`
+   TOOL_PATH=`cygpath -p -w "$TOOL_PATH"`
+ fi
+ # setup 'java.library.path' for native-hadoop code if necessary
+ JAVA_LIBRARY_PATH=''
+ if [ -d "${HADOOP_HOME}/build/native" -o -d "${HADOOP_HOME}/lib/native" ]; then
+   JAVA_PLATFORM=`CLASSPATH=${CLASSPATH} ${JAVA} org.apache.hadoop.util.PlatformName | sed -e "s/ /_/g"`
+
+   if [ -d "$HADOOP_HOME/build/native" ]; then
+     JAVA_LIBRARY_PATH=${HADOOP_HOME}/build/native/${JAVA_PLATFORM}/lib
+   fi
+
+   if [ -d "${HADOOP_HOME}/lib/native" ]; then
+     if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+       JAVA_LIBRARY_PATH=${JAVA_LIBRARY_PATH}:${HADOOP_HOME}/lib/native/${JAVA_PLATFORM}
+     else
+       JAVA_LIBRARY_PATH=${HADOOP_HOME}/lib/native/${JAVA_PLATFORM}
+     fi
+   fi
+ fi
+
+ # cygwin path translation
+ if $cygwin; then
+   JAVA_LIBRARY_PATH=`cygpath -p "$JAVA_LIBRARY_PATH"`
+ fi
+
+ HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.log.dir=$HADOOP_LOG_DIR"
+ HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.log.file=$HADOOP_LOGFILE"
+ HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.home.dir=$HADOOP_HOME"
+ HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.id.str=$HADOOP_IDENT_STRING"
+ HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.root.logger=${HADOOP_ROOT_LOGGER:-INFO,console}"
+ if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+   HADOOP_OPTS="$HADOOP_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"
+ fi
+
+ # run it
+ #echo exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@"
+ exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@"
data/bin/hadoop-ruby.sh ADDED
@@ -0,0 +1,30 @@
+ #!/bin/bash
+ BIN_DIR=`dirname "$0"`
+ BASE_DIR=`cd $BIN_DIR/..; pwd`
+
+ # choose hadoop sh
+ HADOOP=$HADOOP_HOME/bin/hadoop
+ if [ ! -f $HADOOP ]; then
+   HADOOP=$BIN_DIR/hadoop
+   #HADOOP_OPTS="--config $BASE_DIR/conf"
+ fi
+
+ # fetch jruby jar if it does not exist
+ LIB_DIR=$BASE_DIR/lib/java
+ JRUBY_JAR=jruby-complete-1.4.0.jar
+ if [ ! -f "$LIB_DIR/$JRUBY_JAR" ]; then
+   wget http://jruby.kenai.com/downloads/1.4.0/jruby-complete-1.4.0.jar
+   mv $JRUBY_JAR $LIB_DIR/
+ fi
+
+ # construct command line
+ HADOOP_RUBY_LIB_DIR=$BASE_DIR/lib
+ export HADOOP_CLASSPATH=$HADOOP_RUBY_LIB_DIR
+ for x in `ls $HADOOP_RUBY_LIB_DIR`; do
+   DSL_FILES=$HADOOP_RUBY_LIB_DIR/$x,$DSL_FILES
+ done
+ DSL_FILES=$DSL_FILES$1
+
+ # execute hadoop ruby
+ echo running $1...
+ $HADOOP $HADOOP_OPTS jar $LIB_DIR/hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner -libjars $LIB_DIR/$JRUBY_JAR -files $DSL_FILES $1 $2 $3
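
For orientation, a sketch of what the final line expands to (illustrative values; the actual -files list is whatever sits under lib/, followed by the script passed as $1):

  $HADOOP_HOME/bin/hadoop jar lib/java/hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner \
    -libjars lib/java/jruby-complete-1.4.0.jar \
    -files lib/core.rb,lib/init.rb,...,examples/apachelog.rb \
    examples/apachelog.rb inputs outputs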
data/conf/hadoop-site.xml ADDED
@@ -0,0 +1,19 @@
+ <?xml version="1.0"?>
+ <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+ <!-- Put site-specific property overrides in this file. -->
+
+ <configuration>
+   <property>
+     <name>fs.default.name</name>
+     <value>hdfs://localhost:9000/</value>
+   </property>
+   <property>
+     <name>mapred.job.tracker</name>
+     <value>localhost:50040</value>
+   </property>
+   <property>
+     <name>mapred.child.java.opts</name>
+     <value>-Xmx512m</value>
+   </property>
+ </configuration>
data/examples/apachelog-v2-2.rb ADDED
@@ -0,0 +1,18 @@
+ use 'LogAnalysis'
+
+ data 'apache log on test2' do
+   from 'apachelog/inputs'
+   to 'apachelog/outputs'
+
+   # 119.63.199.8 - - [15/Nov/2009:01:18:16 +0900] "GET /ranking/game?page=31 HTTP/1.1" 200 10077 "-" "Baiduspider+(+http://www.baidu.jp/spider/)"
+   # 203.83.243.81 - - [15/Nov/2009:01:18:33 +0900] "GET /dns_zones.txt HTTP/1.1" 404 294 "-" "libwww-perl/5.65"
+
+   each_line do
+     pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*) (.*) "(.*)"/
+     column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes', 'pass', 'ua'
+
+     topic 'ua counts', :label => 'ua' do
+       count_uniq column[:ua]
+     end
+   end
+ end
data/examples/apachelog-v2.rb ADDED
@@ -0,0 +1,25 @@
+ use 'LogAnalysis'
+
+ data 'apache log on test1' do
+   from 'apachlog/inputs'
+   to 'apachlog/outputs'
+
+   each_line do
+     pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/
+     column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes' # label each column
+
+     topic 'which users?', :label => 'user' do
+       count_uniq column[:user]
+     end
+
+ #    topic 'access date by monthly' do
+ #      select_date column[:access_date], BY_MONTHLY
+ #      count column[:access_date]
+ #    end
+ #
+ #    topic 'total bytes' do
+ #      select_date column[:access_date], BY_MONTHLY
+ #      sum column[:bytes].to_kilobytes # / 1024
+ #    end
+   end
+ end
data/examples/apachelog.rb ADDED
@@ -0,0 +1,15 @@
+ # Apache log analysis
+ #
+ # example target data:
+ # 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
+ # 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb2.gif HTTP/1.0" 200 2326
+ # 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb3.gif HTTP/1.0" 404 2326
+
+ use 'LogAnalysis'
+
+ data.pattern /(.*) (.*) (.*) (\[.*\]) (".*") (\d*) (\d*)/
+ column[2].count_uniq
+ column[3].count_uniq
+ column[4].count_uniq
+ column[5].count_uniq
+ column[6].sum
data/examples/hive_like_test.rb ADDED
@@ -0,0 +1,14 @@
+ use 'HiveLike'
+
+ # hive-like/items.txt
+ # apple, 3, 100
+ # banana, 1, 50
+
+ create_table items(item STRING, quantity INT, price INT);
+ load_data "hive-like/items.txt" items;
+
+ select quantity, price, item from items;
+
+ # expect
+ # 0 apple 3 300
+ # 1 banana 1 50
data/examples/word_count_test.rb ADDED
@@ -0,0 +1,7 @@
+ use 'WordCount'
+
+ from 'wc/inputs'
+ to 'wc/outputs'
+
+ count_uniq
+ total :bytes, :words, :lines
data/hadoop-rubydsl.gemspec ADDED
@@ -0,0 +1,79 @@
+ # Generated by jeweler
+ # DO NOT EDIT THIS FILE DIRECTLY
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
+ # -*- encoding: utf-8 -*-
+
+ Gem::Specification.new do |s|
+   s.name = %q{hadoop-rubydsl}
+   s.version = "0.0.1"
+
+   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+   s.authors = ["Koichi Fujikawa"]
+   s.date = %q{2009-12-26}
+   s.description = %q{Hadoop Ruby DSL}
+   s.email = %q{fujibee@gmail.com}
+   s.executables = ["hadoop", "hadoop-ruby.sh"]
+   s.extra_rdoc_files = [
+     "README",
+     "TODO"
+   ]
+   s.files = [
+     "README",
+     "Rakefile",
+     "TODO",
+     "VERSION",
+     "bin/hadoop",
+     "bin/hadoop-ruby.sh",
+     "conf/hadoop-site.xml",
+     "examples/apachelog-v2-2.rb",
+     "examples/apachelog-v2.rb",
+     "examples/apachelog.rb",
+     "examples/hive_like_test.rb",
+     "examples/word_count_test.rb",
+     "hadoop-rubydsl.gemspec",
+     "lib/core.rb",
+     "lib/hive_like.rb",
+     "lib/init.rb",
+     "lib/java/.gitignore",
+     "lib/java/hadoop-ruby.jar",
+     "lib/log_analysis.rb",
+     "lib/mapred_factory.rb",
+     "lib/util.rb",
+     "lib/word_count.rb"
+   ]
+   s.homepage = %q{http://github.com/fujibee/hadoop-rubydsl}
+   s.rdoc_options = ["--charset=UTF-8"]
+   s.require_paths = ["lib"]
+   s.rubygems_version = %q{1.3.5}
+   s.summary = %q{Hadoop Ruby DSL}
+   s.test_files = [
+     "spec/spec_helper.rb",
+     "spec/core_spec.rb",
+     "spec/util_spec.rb",
+     "spec/mapred_factory_spec.rb",
+     "spec/word_count_spec.rb",
+     "spec/hive_like_spec.rb",
+     "spec/log_analysis_spec.rb",
+     "spec/example_spec.rb",
+     "spec/init_spec.rb",
+     "examples/apachelog-v2.rb",
+     "examples/hive_like_test.rb",
+     "examples/word_count_test.rb",
+     "examples/apachelog-v2-2.rb",
+     "examples/apachelog.rb"
+   ]
+
+   if s.respond_to? :specification_version then
+     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+     s.specification_version = 3
+
+     if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+       s.add_runtime_dependency(%q<jruby-on-hadoop>, [">= 0"])
+     else
+       s.add_dependency(%q<jruby-on-hadoop>, [">= 0"])
+     end
+   else
+     s.add_dependency(%q<jruby-on-hadoop>, [">= 0"])
+   end
+ end
+
data/lib/core.rb ADDED
@@ -0,0 +1,108 @@
+ require 'util'
+ require 'forwardable'
+
+ module HadoopDsl
+   # controller
+   class BaseMapRed
+     extend Forwardable
+
+     attr_reader :emitted
+
+     def initialize(script, model)
+       @script, @model = script, model
+       @model.controller = self
+       @emitted = []
+     end
+
+     def run
+       body = pre_process(read_file(@script))
+       eval(body, binding, @script)
+     end
+
+     def pre_process(body)
+       body # do nothing
+     end
+
+     def emit(hash) @emitted << hash end
+
+     # any DSL statement without a matching def is handled here
+     def method_missing(method_name, *args) self end
+   end
+
+   class BaseSetup
+     def initialize(script, conf)
+       @script, @conf = script, conf
+       output_format
+     end
+
+     def run
+       body = pre_process(read_file(@script))
+       eval(body, binding, @script)
+     end
+
+     def pre_process(body)
+       body # do nothing
+     end
+
+     # do nothing
+     def output_format; end
+
+     def paths; [@from, @to] end
+
+     def from(path) @from = path end
+     def to(path) @to = path end
+
+     # any DSL statement without a matching def is handled here
+     def method_missing(method_name, *args) self end
+   end
+
+   class BaseMapper < BaseMapRed
+     def initialize(script, model)
+       super(script, model)
+     end
+   end
+
+   class BaseReducer < BaseMapRed
+     def initialize(script, model)
+       super(script, model)
+     end
+   end
+
+   # model
+   class BaseModel
+     attr_accessor :controller
+
+     # any DSL statement without a matching def is handled here
+     def method_missing(method_name, *args) self end
+   end
+
+   class BaseMapperModel < BaseModel
+     attr_reader :key, :value
+
+     def initialize(key, value)
+       @key, @value = key, value
+     end
+
+     # common functions
+     def identity
+       @controller.emit(@key => @value)
+     end
+   end
+
+   class BaseReducerModel < BaseModel
+     attr_reader :key, :values
+
+     def initialize(key, values)
+       @key, @values = key, values
+     end
+
+     # common functions
+     def aggregate
+       @controller.emit(@key => @values.inject {|ret, i| ret + i})
+     end
+
+     def identity
+       @values.each {|v| @controller.emit(@key => v)}
+     end
+   end
+ end
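
A minimal sketch of how these classes fit together (hypothetical driver code, not shipped with the gem): a controller takes a DSL script path and a model, wires itself in as the model's controller, and collects whatever the model emits. Driving the models directly, without calling run, shows the data flow:

  # sketch: exercising the core controller/model pairs by hand
  require 'core'
  include HadoopDsl

  # map side: the model wraps one input record's key/value
  map_model = BaseMapperModel.new('word', 1)
  mapper = BaseMapper.new('unused_script.rb', map_model)  # script not read here
  map_model.identity               # emits {'word' => 1} via the controller
  p mapper.emitted                 # => [{"word"=>1}]

  # reduce side: the model wraps a key and all values collected for it
  red_model = BaseReducerModel.new('word', [1, 2, 3])
  reducer = BaseReducer.new('unused_script.rb', red_model)
  red_model.aggregate              # folds the values with +: emits {'word' => 6}
  p reducer.emitted                # => [{"word"=>6}]

In a real job the controller's run method reads the script and evals it, so DSL statements execute against these objects; any statement the controller does not define falls through method_missing and is silently ignored.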