elephant-driver 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.md ADDED
@@ -0,0 +1,44 @@
1
+ # About
2
+
3
+ 'elephant-driver' is a Ruby library to communicate with Hadoop daemons.
4
+
5
+ # Notice
6
+
7
+ Currently, only communicating with JobTracker is supported. Other daemons (TaskTracker, NameNode, DataNode) are not supported yet.
8
+
9
+ # Requirements
10
+
11
 + * thrift
12
+ * nokogiri
13
+
14
+ # Setup
15
+
16
+ This library assumes that you're using CDH3 (Cloudera Distribution for Hadoop, version 3).
17
+
18
 + You first need to install 'hue-plugins' package at the JobTracker node. Then, the following settings need to be included in your mapred-site.xml.
19
+
20
+ ```xml
21
+ <!-- Enable Hue plugins -->
22
+ <property>
23
+ <name>mapred.jobtracker.plugins</name>
24
+ <value>org.apache.hadoop.thriftfs.ThriftJobTrackerPlugin</value>
25
+ <description>Comma-separated list of jobtracker plug-ins to be activated.</description>
26
+ </property>
27
+ <property>
28
+ <name>jobtracker.thrift.address</name>
29
+ <value>0.0.0.0:9290</value>
30
+ </property>
31
+ ```
32
+
33
+ # Usage
34
+
35
+ See spec/ directory for the example usage.
36
+
37
+ ```ruby
38
+ @cln = ElephantDriver::Client.new($HOST, $PORT)
39
+ @cln.jobs.each { |j|
40
+ j.tasks.each { |t|
41
+ t.counters
42
+ }
43
+ }
44
+ ```
data/Rakefile ADDED
@@ -0,0 +1,75 @@
1
+ require 'rake'
2
+ require 'rake/testtask'
3
+ require 'rake/clean'
4
+
5
+ begin
6
+ require 'jeweler'
7
+ Jeweler::Tasks.new do |gemspec|
8
+ gemspec.name = "elephant-driver"
9
 + gemspec.summary = "Ruby library for managing Hadoop clusters"
10
+ gemspec.author = "Kazuki Ohta"
11
+ gemspec.email = "kazuki.ohta@gmail.com"
12
+ #gemspec.homepage = "http://.../"
13
+ gemspec.has_rdoc = false
14
+ gemspec.require_paths = ["lib"]
15
+ gemspec.add_dependency "thrift", "~> 0.7.0"
16
+ gemspec.add_dependency "nokogiri", ">= 1.5.0"
17
+ gemspec.test_files = Dir["test/**/*.rb", "test/**/*.sh"]
18
+ gemspec.files = Dir["bin/**/*", "lib/**/*", "test/**/*.rb"]
19
+ gemspec.executables = []
20
+ end
21
+ Jeweler::GemcutterTasks.new
22
+ rescue LoadError
23
+ puts "Jeweler not available. Install it with: gem install jeweler"
24
+ end
25
+
26
+ ############################
27
+
28
+ require 'spec/rake/spectask'
29
+
30
+ desc "Run all specs"
31
+ Spec::Rake::SpecTask.new('spec') do |t|
32
+ t.spec_opts = ['--colour --format progress --loadby mtime --reverse']
33
+ t.spec_files = FileList['spec/*_spec.rb']
34
+ end
35
+
36
+ desc "Run all examples with RCov"
37
+ Spec::Rake::SpecTask.new('rcov') do |t|
38
+ t.spec_files = FileList['spec/*_spec.rb']
39
+ t.rcov = true
40
+ t.rcov_opts = ['--exclude', 'examples']
41
+ end
42
+
43
+ # task :default => :spec
44
+
45
+ ############################
46
+
47
+ task "thrift_gen" do
48
+ system "rm -f common.thrift jobtracker.thrift"
49
+ system "wget https://raw.github.com/cloudera/hue/master/desktop/libs/hadoop/java/if/common.thrift"
50
+ system "wget https://raw.github.com/cloudera/hue/master/desktop/libs/hadoop/java/if/jobtracker.thrift"
51
+ system "mv common.thrift lib/elephant-driver/thrift/"
52
+ system "mv jobtracker.thrift lib/elephant-driver/thrift/"
53
+ system "mkdir -p tmp"
54
+ system "thrift --gen rb -o tmp lib/elephant-driver/thrift/common.thrift"
55
+ system "thrift --gen rb -o tmp lib/elephant-driver/thrift/jobtracker.thrift"
56
+ system "mv tmp/gen-rb/* lib/elephant-driver/thrift"
57
+ system "rm -fR tmp"
58
+ end
59
+
60
+ VERSION_FILE = "lib/elephant-driver/version.rb"
61
+
62
+ file VERSION_FILE => ["VERSION"] do |t|
63
+ version = File.read("VERSION").strip
64
+ File.open(VERSION_FILE, "w") {|f|
65
+ f.write <<EOF
66
+ module ElephantDriver
67
+
68
+ VERSION = '#{version}'
69
+
70
+ end
71
+ EOF
72
+ }
73
+ end
74
+
75
+ task :default => [VERSION_FILE, :build]
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,19 @@
1
+ require 'thrift'
2
+
3
+ # thrift-generated files
4
+ $:.unshift File.join(File.dirname(__FILE__), 'elephant-driver', 'thrift')
5
+ [
6
+ 'common_types',
7
+ 'common_constants',
8
+ 'hadoop_service_base',
9
+ 'jobtracker_types',
10
+ 'jobtracker_constants',
11
+ 'jobtracker',
12
+ ].each { |fn|
13
+ require File.join(File.dirname(__FILE__), 'elephant-driver', 'thrift', fn)
14
+ }
15
+
16
+ # library files
17
+ [ 'client', 'task', 'job', 'tracker' ].each { |fn|
18
+ require File.join(File.dirname(__FILE__), 'elephant-driver', fn)
19
+ }
@@ -0,0 +1,69 @@
1
+ module ElephantDriver
2
+
3
+ class Client
4
+ def initialize(host, port=9290, user='mapred', timeout=30)
5
+ sock = Thrift::Socket.new host, port
6
+ sock.timeout = timeout * 1000
7
+
8
+ @transport = Thrift::BufferedTransport.new sock
9
+ @transport.open
10
+
11
+ # 2011/08/23 Kazuki Ohta <kazuki.ohta@gmail.com>
12
+ # explicitly specify TCP_NODELAY for low-latency communication.
13
+ raw_sock = sock.to_io
14
+ raw_sock.setsockopt Socket::IPPROTO_TCP, Socket::TCP_NODELAY, 1
15
+
16
+ protocol = Thrift::BinaryProtocol.new @transport
17
+ @client = Hadoop::API::Jobtracker::Jobtracker::Client.new protocol
18
+ options = { 'effective_user' => user }
19
+ @ctx = Hadoop::API::RequestContext.new(:confOptions => options)
20
+ end
21
+
22
+ # Jobs
23
+ def jobs(status=:running)
24
+ ret =
25
+ case status
26
+ when :running then call :getRunningJobs
27
+ when :completed then call :getCompletedJobs
28
+ when :failed then call :getFailedJobs
29
+ when :killed then call :getKilledJobs
30
+ else call :getAllJobs
31
+ end
32
+ ret.jobs.collect{ |j| Job.new(self, j) }
33
+ end
34
+
35
+ def get_job(job_id)
36
+ Job.new self, call(:getJob, job_id)
37
+ end
38
+
39
+ # Trackers
40
+ def trackers(status=:active)
41
+ ret =
42
+ case status
43
+ when :active then call :getActiveTrackers
44
+ when :blacklisted then call :getBlacklistedTrackers
45
+ else call :getAllTrackers
46
+ end
47
+ ret.trackers.collect{ |t| Tracker.new(self, t) }
48
+ end
49
+
50
+ def get_tracker(name)
51
+ Tracker.new(self, (call :getTracker, name))
52
+ end
53
+
54
+ # Tasks
55
+ def tasks
56
+ end
57
+
58
+ # Status
59
+ def status
60
+ call :getClusterStatus
61
+ end
62
+
63
+ private
64
+ def call(method, *args)
65
+ @client.send method, @ctx, *args
66
+ end
67
+ end
68
+
69
+ end
@@ -0,0 +1,127 @@
1
+ require 'nokogiri'
2
+
3
+ module ElephantDriver
4
+
5
+ class Job
6
+ STATES = [ :running, :succeeded, :failed, :prep, :killed ]
7
+
8
+ def initialize(cln, thrift_job)
9
+ @cln = cln
10
+ @thrift_job = thrift_job
11
+ end
12
+
13
+ def job_id
14
+ @thrift_job.jobID
15
+ end
16
+
17
+ def user
18
+ @thrift_job.status.user
19
+ end
20
+
21
+ def completed?
22
+ state != :running
23
+ end
24
+
25
+ def start_time
26
+ @thrift_job.startTime
27
+ end
28
+
29
+ def launch_time
30
+ @thrift_job.launchTime
31
+ end
32
+
33
+ def finish_time
34
+ @thrift_job.finishTime
35
+ end
36
+
37
+ def state
38
+ STATES[@thrift_job.status.runState - 1]
39
+ end
40
+
41
+ def map_progress
42
+ @thrift_job.status.mapProgress
43
+ end
44
+
45
+ def reduce_progress
46
+ @thrift_job.status.reduceProgress
47
+ end
48
+
49
+ def cleanup_progress
50
+ @thrift_job.status.cleanupProgress
51
+ end
52
+
53
+ def setup_progress
54
+ @thrift_job.status.setupProgress
55
+ end
56
+
57
+ def progress
58
+ (@thrift_job.status.mapProgress + @thrift_job.status.reduceProgress) / 2.0
59
+ end
60
+
61
+ def config_params
62
+ xml = call :getJobConfXML
63
+ #@parsed_config ||= Nokogiri::XML(xml).xpath("//property").inject({}) { |props, xprop|
64
+ # props[xprop.xpath("./name").text] = xprop.xpath("./value").text
65
+ # props
66
+ #}
67
+ {}
68
+ end
69
+
70
+ def counters
71
+ counters = {}
72
+ ret = call :getJobCounters
73
+ ret.groups.each { |g|
74
+ h = {}
75
+ g.counters.each { |name, c| h[name] = c.value }
76
+ counters[g.name] = h
77
+ }
78
+ counters
79
+ end
80
+
81
+ def tasks
82
+ types = [
83
+ Hadoop::API::Jobtracker::ThriftTaskType::MAP,
84
+ Hadoop::API::Jobtracker::ThriftTaskType::REDUCE,
85
+ Hadoop::API::Jobtracker::ThriftTaskType::JOB_SETUP,
86
+ Hadoop::API::Jobtracker::ThriftTaskType::JOB_CLEANUP,
87
+ Hadoop::API::Jobtracker::ThriftTaskType::TASK_CLEANUP,
88
+ ]
89
+ states = [
90
+ Hadoop::API::Jobtracker::ThriftTaskState::RUNNING,
91
+ Hadoop::API::Jobtracker::ThriftTaskState::SUCCEEDED,
92
+ Hadoop::API::Jobtracker::ThriftTaskState::FAILED,
93
+ Hadoop::API::Jobtracker::ThriftTaskState::UNASSIGNED,
94
+ Hadoop::API::Jobtracker::ThriftTaskState::KILLED,
95
+ Hadoop::API::Jobtracker::ThriftTaskState::COMMIT_PENDING,
96
+ Hadoop::API::Jobtracker::ThriftTaskState::FAILED_UNCLEAN,
97
+ Hadoop::API::Jobtracker::ThriftTaskState::KILLED_UNCLEAN,
98
+ ]
99
+
100
+ tasks = (call :getTaskList, types, states, '', 10000, 0).tasks
101
+ return tasks.collect{ |t| Task.new(self, t) }
102
+ end
103
+
104
+ def set_priority(priority)
105
+ prio =
106
+ case priority
107
+ when :very_high then Hadoop::API::Jobtracker::ThriftJobPriority::VERY_HIGH
108
+ when :high then Hadoop::API::Jobtracker::ThriftJobPriority::HIGH
109
+ when :normal then Hadoop::API::Jobtracker::ThriftJobPriority::NORMAL
110
+ when :low then Hadoop::API::Jobtracker::ThriftJobPriority::LOW
111
+ when :very_low then Hadoop::API::Jobtracker::ThriftJobPriority::VERY_LOW
112
+ else Hadoop::API::Jobtracker::ThriftJobPriority::NORMAL
113
+ end
114
+ call :setJobPriority, prio
115
+ end
116
+
117
+ def kill!
118
+ call :killJob
119
+ end
120
+
121
+ private
122
+ def call(method, *args)
123
+ @cln.send :call, method, @thrift_job.jobID, *args
124
+ end
125
+ end
126
+
127
+ end
@@ -0,0 +1,46 @@
1
+ require 'nokogiri'
2
+
3
+ module ElephantDriver
4
+
5
+ class Task
6
+ def initialize(cln, thrift_task)
7
+ @cln = cln
8
+ @thrift_task = thrift_task
9
+ end
10
+
11
+ def start_time
12
+ @thrift_task.startTime
13
+ end
14
+
15
+ def exec_start_time
16
+ @thrift_task.execStartTime
17
+ end
18
+
19
+ def exec_finish_time
20
+ @thrift_task.execFinishTime
21
+ end
22
+
23
+ def progress
24
+ @thrift_task.progress
25
+ end
26
+
27
+ def failed?
28
+ @thrift_task.failed
29
+ end
30
+
31
+ def completed?
32
+ @thrift_task.complete
33
+ end
34
+
35
+ def counters
36
+ counters = {}
37
+ @thrift_task.counters.groups.each { |g|
38
+ h = {}
39
+ g.counters.each { |name, c| h[name] = c.value }
40
+ counters[g.name] = h
41
+ }
42
+ counters
43
+ end
44
+ end
45
+
46
+ end
@@ -0,0 +1,129 @@
1
+ /*
2
+ * Licensed to Cloudera, Inc. under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. Cloudera, Inc. licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing, software
13
+ * distributed under the License is distributed on an "AS IS" BASIS,
14
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ * See the License for the specific language governing permissions and
16
+ * limitations under the License.
17
+ */
18
+
19
+ /*
20
+ * Common declarations for Hadoop Thrift interfaces
21
+ */
22
+
23
+ /*
24
+ * Namespaces for generated code. The idea is to keep code generated by
25
+ * Thrift under a 'hadoop.api' namespace, so that a higher-level set of
26
+ * functions and classes may be defined under 'hadoop'.
27
+ */
28
+
29
+ namespace cpp hadoop.api
30
+ namespace csharp Hadoop.API
31
+ namespace java org.apache.hadoop.thriftfs.api
32
+ namespace perl Hadoop.API
33
+ namespace php hadoop_api
34
+ namespace py hadoop.api.common
35
+ namespace rb Hadoop.API
36
+
37
+ /** Generic I/O error */
38
+ exception IOException {
39
+ /** Error message. */
40
+ 1: string msg,
41
+
42
+ /** Textual representation of the call stack. */
43
+ 2: string stack
44
+
45
+ /** The Java class of the Exception (may be a subclass) */
46
+ 3: string clazz
47
+ }
48
+
49
+ /**
50
+ * Information about the compilation version of this server
51
+ */
52
+ struct VersionInfo {
53
+ 1: string version
54
+ 2: string revision
55
+ 4: string compileDate
56
+ 5: string compilingUser
57
+ 6: string url
58
+ 7: string buildVersion
59
+ }
60
+
61
+
62
+ /** A single stack frame in a stack dump */
63
+ struct StackTraceElement {
64
+ 1: string className
65
+ 2: string fileName
66
+ 3: i32 lineNumber
67
+ 4: string methodName
68
+ 5: bool isNativeMethod
69
+ 6: string stringRepresentation
70
+ }
71
+
72
+ /** Info about a thread with its corresponding stack trace */
73
+ struct ThreadStackTrace {
74
+ 1: string threadName
75
+ 2: string threadStringRepresentation
76
+ 3: bool isDaemon
77
+
78
+ 4: list<StackTraceElement> stackTrace;
79
+ }
80
+
81
+ /**
82
+ * Memory available via java.lang.Runtime
83
+ */
84
+ struct RuntimeInfo {
85
+ 1:i64 totalMemory
86
+ 2:i64 freeMemory
87
+ 3:i64 maxMemory
88
+ }
89
+
90
+ /**
91
+ * Context options for every request.
92
+ */
93
+ struct RequestContext {
94
+ /**
95
+ * This map turns into a Configuration object in the server and
96
+ * is currently used to construct a UserGroupInformation to
97
+ * authenticate this request.
98
+ */
99
+ 1:map<string, string> confOptions
100
+ }
101
+
102
+ struct MetricsRecord {
103
+ 2: map<string, string> tags
104
+ 3: map<string, i64> metrics
105
+ }
106
+
107
+ struct MetricsContext {
108
+ 1: string name
109
+ 2: bool isMonitoring
110
+ 3: i32 period
111
+
112
+ 4: map<string, list<MetricsRecord>> records
113
+ }
114
+
115
+ struct ThriftDelegationToken {
116
+ 1: binary delegationTokenBytes
117
+ }
118
+
119
+ service HadoopServiceBase {
120
+ /** Return the version information for this server */
121
+ VersionInfo getVersionInfo(10:RequestContext ctx);
122
+ RuntimeInfo getRuntimeInfo(10:RequestContext ctx);
123
+ list<ThreadStackTrace> getThreadDump(10:RequestContext ctx);
124
+ list<MetricsContext> getAllMetrics(10:RequestContext ctx)
125
+ throws (1:IOException err);
126
+ MetricsContext getMetricsContext(10:RequestContext ctx, 1:string contextName)
127
+ throws (1:IOException err);
128
+ }
129
+