hadoop-rubydsl 0.0.1

data/README ADDED
@@ -0,0 +1,53 @@
+ = hadoop-rubydsl
+
+ == Description
+ Lets you write Hadoop Mappers and Reducers in a Ruby DSL.
+ Uses hadoop-ruby.jar.
+
+ Example:
+ apachelog.rb
+
+ # log:
+ # 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
+ # 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb2.gif HTTP/1.0" 200 2326
+ # 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb3.gif HTTP/1.0" 404 2326
+
+ use 'LogAnalysis'
+ data.pattern /(.*) (.*) (.*) (\[.*\]) (".*") (\d*) (\d*)/
+ column[2].count_uniq
+ column[3].count_uniq
+ column[4].count_uniq
+ column[5].count_uniq
+ column[6].sum
+
+ =>
+ col2 frank 1
+ col2 frank2 2
+ col3 [10/Oct/2000:13:55:36 -0700] 3
+ col4 "GET /apache_pb.gif HTTP/1.0" 1
+ col4 "GET /apache_pb2.gif HTTP/1.0" 1
+ col4 "GET /apache_pb3.gif HTTP/1.0" 1
+ col5 200 2
+ col5 404 1
+ col6 6978
+
+ == Usage
+ 0. Set HADOOP_HOME correctly and have a full Hadoop stack up and running.
+
+ 1. Copy jruby-complete-*.jar into lib/java
+ ex)
+ $ wget http://jruby.kenai.com/downloads/1.4.0RC2/jruby-complete-1.4.0RC2.jar
+ $ cp jruby-complete-*.jar lib/java/
+
+ 2. Upload your data to HDFS
+ ex)
+ $ hadoop dfs -copyFromLocal apachelog inputs/
+
+ 3. Run the MapReduce job
+ $ bin/hadoop-ruby.sh examples/apachelog.rb inputs outputs
+
+ == Author
+ Koichi Fujikawa <fujibee@gmail.com>
+
+ == Copyright
+ License: Apache License
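
For reference, a rough plain-Ruby sketch of what count_uniq and sum compute
in the example above (the parsing and the log_lines variable here are
illustrative only, not the gem's internals):

  pattern = /(.*) (.*) (.*) (\[.*\]) (".*") (\d*) (\d*)/
  rows = log_lines.map { |line| line.match(pattern).captures }

  # column[2].count_uniq: count occurrences of each distinct user
  user_counts = Hash.new(0)
  rows.each { |r| user_counts[r[2]] += 1 }   # => {"frank"=>1, "frank2"=>2}

  # column[6].sum: total the bytes column
  bytes_total = rows.inject(0) { |sum, r| sum + r[6].to_i }   # => 6978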
data/Rakefile ADDED
@@ -0,0 +1,18 @@
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gemspec|
+     gemspec.name = "hadoop-rubydsl"
+     gemspec.summary = "Hadoop Ruby DSL"
+     gemspec.description = "Hadoop Ruby DSL"
+     gemspec.email = "fujibee@gmail.com"
+     gemspec.homepage = "http://github.com/fujibee/hadoop-rubydsl"
+     gemspec.authors = ["Koichi Fujikawa"]
+
+     gemspec.add_dependency 'jruby-on-hadoop'
+     gemspec.files.exclude "spec/**/*"
+   end
+   Jeweler::GemcutterTasks.new
+ rescue LoadError
+   puts "Jeweler not available. Install it with: gem install jeweler"
+ end
+
data/TODO ADDED
@@ -0,0 +1,2 @@
+ * error handling throughout
+ * the "use" method does not accept double-quoted arguments
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.0.1
data/bin/hadoop ADDED
@@ -0,0 +1,276 @@
+ #!/usr/bin/env bash
+
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ # The Hadoop command script
+ #
+ # Environment Variables
+ #
+ #   JAVA_HOME            The java implementation to use. Overrides JAVA_HOME.
+ #
+ #   HADOOP_CLASSPATH     Extra Java CLASSPATH entries.
+ #
+ #   HADOOP_HEAPSIZE      The maximum amount of heap to use, in MB.
+ #                        Default is 1000.
+ #
+ #   HADOOP_OPTS          Extra Java runtime options.
+ #
+ #   HADOOP_NAMENODE_OPTS These options are added to HADOOP_OPTS
+ #   HADOOP_CLIENT_OPTS   when the respective command is run.
+ #   HADOOP_{COMMAND}_OPTS etc  HADOOP_JT_OPTS applies to JobTracker
+ #                              for e.g. HADOOP_CLIENT_OPTS applies to
+ #                              more than one command (fs, dfs, fsck,
+ #                              dfsadmin etc)
+ #
+ #   HADOOP_CONF_DIR      Alternate conf dir. Default is ${HADOOP_HOME}/conf.
+ #
+ #   HADOOP_ROOT_LOGGER   The root appender. Default is INFO,console
+ #
+
+ bin=`dirname "$0"`
+ bin=`cd "$bin"; pwd`
+
+ if [ -f "$bin"/hadoop-config.sh ]; then
+   . "$bin"/hadoop-config.sh
+ fi
+
+ cygwin=false
+ case "`uname`" in
+   CYGWIN*) cygwin=true;;
+ esac
+
+ # if no args specified, show usage
+ if [ $# = 0 ]; then
+   echo "Usage: hadoop [--config confdir] COMMAND"
+   echo "where COMMAND is one of:"
+   echo "  namenode -format     format the DFS filesystem"
+   echo "  secondarynamenode    run the DFS secondary namenode"
+   echo "  namenode             run the DFS namenode"
+   echo "  datanode             run a DFS datanode"
+   echo "  dfsadmin             run a DFS admin client"
+   echo "  fsck                 run a DFS filesystem checking utility"
+   echo "  fs                   run a generic filesystem user client"
+   echo "  balancer             run a cluster balancing utility"
+   echo "  jobtracker           run the MapReduce job Tracker node"
+   echo "  pipes                run a Pipes job"
+   echo "  tasktracker          run a MapReduce task Tracker node"
+   echo "  job                  manipulate MapReduce jobs"
+   echo "  queue                get information regarding JobQueues"
+   echo "  version              print the version"
+   echo "  jar <jar>            run a jar file"
+   echo "  distcp <srcurl> <desturl> copy file or directories recursively"
+   echo "  archive -archiveName NAME <src>* <dest> create a hadoop archive"
+   echo "  daemonlog            get/set the log level for each daemon"
+   echo " or"
+   echo "  CLASSNAME            run the class named CLASSNAME"
+   echo "Most commands print help when invoked w/o parameters."
+   exit 1
+ fi
+
+ # get arguments
+ COMMAND=$1
+ shift
+
+ if [ -f "${HADOOP_CONF_DIR}/hadoop-env.sh" ]; then
+   . "${HADOOP_CONF_DIR}/hadoop-env.sh"
+ fi
+
+ # some Java parameters
+ if [ "$JAVA_HOME" != "" ]; then
+   #echo "run java in $JAVA_HOME"
+   JAVA_HOME=$JAVA_HOME
+ fi
+
+ if [ "$JAVA_HOME" = "" ]; then
+   echo "Error: JAVA_HOME is not set."
+   exit 1
+ fi
+
+ JAVA=$JAVA_HOME/bin/java
+ JAVA_HEAP_MAX=-Xmx1000m
+
+ # check envvars which might override default args
+ if [ "$HADOOP_HEAPSIZE" != "" ]; then
+   #echo "run with heapsize $HADOOP_HEAPSIZE"
+   JAVA_HEAP_MAX="-Xmx""$HADOOP_HEAPSIZE""m"
+   #echo $JAVA_HEAP_MAX
+ fi
+
+ # CLASSPATH initially contains $HADOOP_CONF_DIR
+ CLASSPATH="${HADOOP_CONF_DIR}"
+ CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar
+
+ # for developers, add Hadoop classes to CLASSPATH
+ if [ -d "$HADOOP_HOME/build/classes" ]; then
+   CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/classes
+ fi
+ if [ -d "$HADOOP_HOME/build/webapps" ]; then
+   CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build
+ fi
+ if [ -d "$HADOOP_HOME/build/test/classes" ]; then
+   CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/test/classes
+ fi
+ if [ -d "$HADOOP_HOME/build/tools" ]; then
+   CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/tools
+ fi
+
+ # so that filenames w/ spaces are handled correctly in loops below
+ IFS=
+
+ # for releases, add core hadoop jar & webapps to CLASSPATH
+ if [ -d "$HADOOP_HOME/webapps" ]; then
+   CLASSPATH=${CLASSPATH}:$HADOOP_HOME
+ fi
+ for f in $HADOOP_HOME/hadoop-*-core.jar; do
+   CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ # add libs to CLASSPATH
+ for f in $HADOOP_HOME/lib/*.jar; do
+   CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ for f in $HADOOP_HOME/lib/jetty-ext/*.jar; do
+   CLASSPATH=${CLASSPATH}:$f;
+ done
+
+ for f in $HADOOP_HOME/hadoop-*-tools.jar; do
+   TOOL_PATH=${TOOL_PATH}:$f;
+ done
+ for f in $HADOOP_HOME/build/hadoop-*-tools.jar; do
+   TOOL_PATH=${TOOL_PATH}:$f;
+ done
+
+ # add user-specified CLASSPATH last
+ if [ "$HADOOP_CLASSPATH" != "" ]; then
+   CLASSPATH=${CLASSPATH}:${HADOOP_CLASSPATH}
+ fi
+
+ # default log directory & file
+ if [ "$HADOOP_LOG_DIR" = "" ]; then
+   HADOOP_LOG_DIR="$HADOOP_HOME/logs"
+ fi
+ if [ "$HADOOP_LOGFILE" = "" ]; then
+   HADOOP_LOGFILE='hadoop.log'
+ fi
+
+ # restore ordinary behaviour
+ unset IFS
+
+ # figure out which class to run
+ if [ "$COMMAND" = "namenode" ] ; then
+   CLASS='org.apache.hadoop.hdfs.server.namenode.NameNode'
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_NAMENODE_OPTS"
+ elif [ "$COMMAND" = "secondarynamenode" ] ; then
+   CLASS='org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode'
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_SECONDARYNAMENODE_OPTS"
+ elif [ "$COMMAND" = "datanode" ] ; then
+   CLASS='org.apache.hadoop.hdfs.server.datanode.DataNode'
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_DATANODE_OPTS"
+ elif [ "$COMMAND" = "fs" ] ; then
+   CLASS=org.apache.hadoop.fs.FsShell
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "dfs" ] ; then
+   CLASS=org.apache.hadoop.fs.FsShell
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "dfsadmin" ] ; then
+   CLASS=org.apache.hadoop.hdfs.tools.DFSAdmin
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "fsck" ] ; then
+   CLASS=org.apache.hadoop.hdfs.tools.DFSck
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "balancer" ] ; then
+   CLASS=org.apache.hadoop.hdfs.server.balancer.Balancer
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_BALANCER_OPTS"
+ elif [ "$COMMAND" = "jobtracker" ] ; then
+   CLASS=org.apache.hadoop.mapred.JobTracker
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_JOBTRACKER_OPTS"
+ elif [ "$COMMAND" = "tasktracker" ] ; then
+   CLASS=org.apache.hadoop.mapred.TaskTracker
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_TASKTRACKER_OPTS"
+ elif [ "$COMMAND" = "job" ] ; then
+   CLASS=org.apache.hadoop.mapred.JobClient
+ elif [ "$COMMAND" = "queue" ] ; then
+   CLASS=org.apache.hadoop.mapred.JobQueueClient
+ elif [ "$COMMAND" = "pipes" ] ; then
+   CLASS=org.apache.hadoop.mapred.pipes.Submitter
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "version" ] ; then
+   CLASS=org.apache.hadoop.util.VersionInfo
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "jar" ] ; then
+   CLASS=org.apache.hadoop.mapred.JobShell
+ elif [ "$COMMAND" = "distcp" ] ; then
+   CLASS=org.apache.hadoop.tools.DistCp
+   CLASSPATH=${CLASSPATH}:${TOOL_PATH}
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "daemonlog" ] ; then
+   CLASS=org.apache.hadoop.log.LogLevel
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "archive" ] ; then
+   CLASS=org.apache.hadoop.tools.HadoopArchives
+   CLASSPATH=${CLASSPATH}:${TOOL_PATH}
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ elif [ "$COMMAND" = "sampler" ] ; then
+   CLASS=org.apache.hadoop.mapred.lib.InputSampler
+   HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
+ else
+   CLASS=$COMMAND
+ fi
+
+ # cygwin path translation
+ if $cygwin; then
+   CLASSPATH=`cygpath -p -w "$CLASSPATH"`
+   HADOOP_HOME=`cygpath -d "$HADOOP_HOME"`
+   HADOOP_LOG_DIR=`cygpath -d "$HADOOP_LOG_DIR"`
+   TOOL_PATH=`cygpath -p -w "$TOOL_PATH"`
+ fi
+ # setup 'java.library.path' for native-hadoop code if necessary
+ JAVA_LIBRARY_PATH=''
+ if [ -d "${HADOOP_HOME}/build/native" -o -d "${HADOOP_HOME}/lib/native" ]; then
+   JAVA_PLATFORM=`CLASSPATH=${CLASSPATH} ${JAVA} org.apache.hadoop.util.PlatformName | sed -e "s/ /_/g"`
+
+   if [ -d "$HADOOP_HOME/build/native" ]; then
+     JAVA_LIBRARY_PATH=${HADOOP_HOME}/build/native/${JAVA_PLATFORM}/lib
+   fi
+
+   if [ -d "${HADOOP_HOME}/lib/native" ]; then
+     if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+       JAVA_LIBRARY_PATH=${JAVA_LIBRARY_PATH}:${HADOOP_HOME}/lib/native/${JAVA_PLATFORM}
+     else
+       JAVA_LIBRARY_PATH=${HADOOP_HOME}/lib/native/${JAVA_PLATFORM}
+     fi
+   fi
+ fi
+
+ # cygwin path translation
+ if $cygwin; then
+   JAVA_LIBRARY_PATH=`cygpath -p "$JAVA_LIBRARY_PATH"`
+ fi
+
+ HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.log.dir=$HADOOP_LOG_DIR"
+ HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.log.file=$HADOOP_LOGFILE"
+ HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.home.dir=$HADOOP_HOME"
+ HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.id.str=$HADOOP_IDENT_STRING"
+ HADOOP_OPTS="$HADOOP_OPTS -Dhadoop.root.logger=${HADOOP_ROOT_LOGGER:-INFO,console}"
+ if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+   HADOOP_OPTS="$HADOOP_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"
+ fi
+
+ # run it
+ #echo exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@"
+ exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@"
data/bin/hadoop-ruby.sh ADDED
@@ -0,0 +1,30 @@
+ #!/bin/bash
+ BIN_DIR=`dirname "$0"`
+ BASE_DIR=`cd $BIN_DIR/..; pwd`
+
+ # choose hadoop sh
+ HADOOP=$HADOOP_HOME/bin/hadoop
+ if [ ! -f $HADOOP ]; then
+   HADOOP=$BIN_DIR/hadoop
+   #HADOOP_OPTS="--config $BASE_DIR/conf"
+ fi
+
+ # fetch the jruby jar if it does not exist
+ LIB_DIR=$BASE_DIR/lib/java
+ JRUBY_JAR=jruby-complete-1.4.0.jar
+ if [ ! -f "$LIB_DIR/$JRUBY_JAR" ]; then
+   wget http://jruby.kenai.com/downloads/1.4.0/jruby-complete-1.4.0.jar
+   mv $JRUBY_JAR $LIB_DIR/
+ fi
+
+ # construct command line
+ HADOOP_RUBY_LIB_DIR=$BASE_DIR/lib
+ export HADOOP_CLASSPATH=$HADOOP_RUBY_LIB_DIR
+ for x in `ls $HADOOP_RUBY_LIB_DIR`; do
+   DSL_FILES=$HADOOP_RUBY_LIB_DIR/$x,$DSL_FILES
+ done
+ DSL_FILES=$DSL_FILES$1
+
+ # execute hadoop ruby
+ echo running $1...
+ $HADOOP $HADOOP_OPTS jar $LIB_DIR/hadoop-ruby.jar org.apache.hadoop.ruby.JRubyJobRunner -libjars $LIB_DIR/$JRUBY_JAR -files $DSL_FILES $1 $2 $3
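For reference, with the defaults above the script ends up invoking roughly
the following (a sketch assuming HADOOP_HOME is set and lib/ holds the gem's
Ruby files; the -files list is abbreviated):

  $HADOOP_HOME/bin/hadoop jar lib/java/hadoop-ruby.jar \
      org.apache.hadoop.ruby.JRubyJobRunner \
      -libjars lib/java/jruby-complete-1.4.0.jar \
      -files lib/core.rb,lib/init.rb,...,examples/apachelog.rb \
      examples/apachelog.rb inputs outputs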
data/conf/hadoop-site.xml ADDED
@@ -0,0 +1,19 @@
+ <?xml version="1.0"?>
+ <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+ <!-- Put site-specific property overrides in this file. -->
+
+ <configuration>
+   <property>
+     <name>fs.default.name</name>
+     <value>hdfs://localhost:9000/</value>
+   </property>
+   <property>
+     <name>mapred.job.tracker</name>
+     <value>localhost:50040</value>
+   </property>
+   <property>
+     <name>mapred.child.java.opts</name>
+     <value>-Xmx512m</value>
+   </property>
+ </configuration>
data/examples/apachelog-v2-2.rb ADDED
@@ -0,0 +1,18 @@
+ use 'LogAnalysis'
+
+ data 'apache log on test2' do
+   from 'apachelog/inputs'
+   to 'apachelog/outputs'
+
+   # 119.63.199.8 - - [15/Nov/2009:01:18:16 +0900] "GET /ranking/game?page=31 HTTP/1.1" 200 10077 "-" "Baiduspider+(+http://www.baidu.jp/spider/)"
+   # 203.83.243.81 - - [15/Nov/2009:01:18:33 +0900] "GET /dns_zones.txt HTTP/1.1" 404 294 "-" "libwww-perl/5.65"
+
+   each_line do
+     pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*) (.*) "(.*)"/
+     column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes', 'pass', 'ua'
+
+     topic 'ua counts', :label => 'ua' do
+       count_uniq column[:ua]
+     end
+   end
+ end
data/examples/apachelog-v2.rb ADDED
@@ -0,0 +1,25 @@
+ use 'LogAnalysis'
+
+ data 'apache log on test1' do
+   from 'apachlog/inputs'
+   to 'apachlog/outputs'
+
+   each_line do
+     pattern /(.*) (.*) (.*) \[(.*)\] (".*") (\d*) (\d*)/
+     column_name 'remote_host', 'pass', 'user', 'access_date', 'request', 'status', 'bytes' # label each column
+
+     topic 'which users?', :label => 'user' do
+       count_uniq column[:user]
+     end
+
+     # topic 'access date by monthly' do
+     #   select_date column[:access_date], BY_MONTHLY
+     #   count column[:access_date]
+     # end
+     #
+     # topic 'total bytes' do
+     #   select_date column[:access_date], BY_MONTHLY
+     #   sum column[:bytes].to_kilobytes # / 1024
+     # end
+   end
+ end
data/examples/apachelog.rb ADDED
@@ -0,0 +1,15 @@
+ # Apache log analysis
+ #
+ # example target data:
+ # 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326
+ # 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb2.gif HTTP/1.0" 200 2326
+ # 127.0.0.1 - frank2 [10/Oct/2000:13:55:36 -0700] "GET /apache_pb3.gif HTTP/1.0" 404 2326
+
+ use 'LogAnalysis'
+
+ data.pattern /(.*) (.*) (.*) (\[.*\]) (".*") (\d*) (\d*)/
+ column[2].count_uniq
+ column[3].count_uniq
+ column[4].count_uniq
+ column[5].count_uniq
+ column[6].sum
data/examples/hive_like_test.rb ADDED
@@ -0,0 +1,14 @@
+ use 'HiveLike'
+
+ # hive-like/items.txt
+ # apple, 3, 100
+ # banana, 1, 50
+
+ create_table items(item STRING, quantity INT, price INT);
+ load_data "hive-like/items.txt" items;
+
+ select quantity, price, item from items;
+
+ # expect
+ # 0 apple 3 300
+ # 1 banana 1 50
data/examples/word_count_test.rb ADDED
@@ -0,0 +1,7 @@
+ use 'WordCount'
+
+ from 'wc/inputs'
+ to 'wc/outputs'
+
+ count_uniq
+ total :bytes, :words, :lines
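
As a rough illustration of what this WordCount DSL asks for, the plain-Ruby
sketch below counts unique words and accumulates byte/word/line totals (the
input path is a hypothetical local stand-in; this is not the gem's
implementation):

  counts = Hash.new(0)
  totals = { :bytes => 0, :words => 0, :lines => 0 }
  File.foreach('wc/inputs/sample.txt') do |line|   # hypothetical input file
    words = line.split
    words.each { |w| counts[w] += 1 }              # count_uniq
    totals[:bytes] += line.size                    # total :bytes
    totals[:words] += words.size                   # total :words
    totals[:lines] += 1                            # total :lines
  end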
data/hadoop-rubydsl.gemspec ADDED
@@ -0,0 +1,79 @@
+ # Generated by jeweler
+ # DO NOT EDIT THIS FILE DIRECTLY
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
+ # -*- encoding: utf-8 -*-
+
+ Gem::Specification.new do |s|
+   s.name = %q{hadoop-rubydsl}
+   s.version = "0.0.1"
+
+   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+   s.authors = ["Koichi Fujikawa"]
+   s.date = %q{2009-12-26}
+   s.description = %q{Hadoop Ruby DSL}
+   s.email = %q{fujibee@gmail.com}
+   s.executables = ["hadoop", "hadoop-ruby.sh"]
+   s.extra_rdoc_files = [
+     "README",
+     "TODO"
+   ]
+   s.files = [
+     "README",
+     "Rakefile",
+     "TODO",
+     "VERSION",
+     "bin/hadoop",
+     "bin/hadoop-ruby.sh",
+     "conf/hadoop-site.xml",
+     "examples/apachelog-v2-2.rb",
+     "examples/apachelog-v2.rb",
+     "examples/apachelog.rb",
+     "examples/hive_like_test.rb",
+     "examples/word_count_test.rb",
+     "hadoop-rubydsl.gemspec",
+     "lib/core.rb",
+     "lib/hive_like.rb",
+     "lib/init.rb",
+     "lib/java/.gitignore",
+     "lib/java/hadoop-ruby.jar",
+     "lib/log_analysis.rb",
+     "lib/mapred_factory.rb",
+     "lib/util.rb",
+     "lib/word_count.rb"
+   ]
+   s.homepage = %q{http://github.com/fujibee/hadoop-rubydsl}
+   s.rdoc_options = ["--charset=UTF-8"]
+   s.require_paths = ["lib"]
+   s.rubygems_version = %q{1.3.5}
+   s.summary = %q{Hadoop Ruby DSL}
+   s.test_files = [
+     "spec/spec_helper.rb",
+     "spec/core_spec.rb",
+     "spec/util_spec.rb",
+     "spec/mapred_factory_spec.rb",
+     "spec/word_count_spec.rb",
+     "spec/hive_like_spec.rb",
+     "spec/log_analysis_spec.rb",
+     "spec/example_spec.rb",
+     "spec/init_spec.rb",
+     "examples/apachelog-v2.rb",
+     "examples/hive_like_test.rb",
+     "examples/word_count_test.rb",
+     "examples/apachelog-v2-2.rb",
+     "examples/apachelog.rb"
+   ]
+
+   if s.respond_to? :specification_version then
+     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+     s.specification_version = 3
+
+     if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+       s.add_runtime_dependency(%q<jruby-on-hadoop>, [">= 0"])
+     else
+       s.add_dependency(%q<jruby-on-hadoop>, [">= 0"])
+     end
+   else
+     s.add_dependency(%q<jruby-on-hadoop>, [">= 0"])
+   end
+ end
+
data/lib/core.rb ADDED
@@ -0,0 +1,108 @@
+ require 'util'
+ require 'forwardable'
+
+ module HadoopDsl
+   # controller
+   class BaseMapRed
+     extend Forwardable
+
+     attr_reader :emitted
+
+     def initialize(script, model)
+       @script, @model = script, model
+       @model.controller = self
+       @emitted = []
+     end
+
+     def run
+       body = pre_process(read_file(@script))
+       eval(body, binding, @script)
+     end
+
+     def pre_process(body)
+       body # do nothing
+     end
+
+     def emit(hash) @emitted << hash end
+
+     # all DSL statements without a def are handled here
+     def method_missing(method_name, *args) self end
+   end
+
+   class BaseSetup
+     def initialize(script, conf)
+       @script, @conf = script, conf
+       output_format
+     end
+
+     def run
+       body = pre_process(read_file(@script))
+       eval(body, binding, @script)
+     end
+
+     def pre_process(body)
+       body # do nothing
+     end
+
+     # do nothing
+     def output_format; end
+
+     def paths; [@from, @to] end
+
+     def from(path) @from = path end
+     def to(path) @to = path end
+
+     # all DSL statements without a def are handled here
+     def method_missing(method_name, *args) self end
+   end
+
+   class BaseMapper < BaseMapRed
+     def initialize(script, model)
+       super(script, model)
+     end
+   end
+
+   class BaseReducer < BaseMapRed
+     def initialize(script, model)
+       super(script, model)
+     end
+   end
+
+   # model
+   class BaseModel
+     attr_accessor :controller
+
+     # all DSL statements without a def are handled here
+     def method_missing(method_name, *args) self end
+   end
+
+   class BaseMapperModel < BaseModel
+     attr_reader :key, :value
+
+     def initialize(key, value)
+       @key, @value = key, value
+     end
+
+     # common functions
+     def identity
+       @controller.emit(@key => @value)
+     end
+   end
+
+   class BaseReducerModel < BaseModel
+     attr_reader :key, :values
+
+     def initialize(key, values)
+       @key, @values = key, values
+     end
+
+     # common functions
+     def aggregate
+       @controller.emit(@key => @values.inject {|ret, i| ret + i})
+     end
+
+     def identity
+       @values.each {|v| @controller.emit(@key => v)}
+     end
+   end
+ end
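
To make the controller/model wiring above concrete, here is a minimal usage
sketch (illustrative only; the script filename is a placeholder and this is
not code shipped in the gem):

  # the controller wires itself into the model on construction,
  # and model methods emit key/value pairs back through it
  model = HadoopDsl::BaseMapperModel.new('word', 1)
  mapper = HadoopDsl::BaseMapper.new('some_script.rb', model) # sets model.controller
  model.identity                 # emits the key/value pair as-is
  mapper.emitted                 # => [{'word' => 1}]

  red_model = HadoopDsl::BaseReducerModel.new('word', [1, 1, 1])
  reducer = HadoopDsl::BaseReducer.new('some_script.rb', red_model)
  red_model.aggregate            # sums the values via inject
  reducer.emitted                # => [{'word' => 3}]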