spark_toolkit 0.1.0-java
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +50 -0
- data/Rakefile +10 -0
- data/bin/setup +8 -0
- data/docs/HDFS.md +82 -0
- data/docs/Spark.md +29 -0
- data/docs/YARN.md +62 -0
- data/lib/spark_toolkit/hadoop/conf/configuration.rb +38 -0
- data/lib/spark_toolkit/hadoop/hdfs/file_system.rb +63 -0
- data/lib/spark_toolkit/hadoop/hdfs/reader.rb +21 -0
- data/lib/spark_toolkit/hadoop/yarn/application.rb +29 -0
- data/lib/spark_toolkit/hadoop/yarn/attempt.rb +19 -0
- data/lib/spark_toolkit/hadoop/yarn/client.rb +44 -0
- data/lib/spark_toolkit/hadoop/yarn/id.rb +17 -0
- data/lib/spark_toolkit/hadoop/yarn/log.rb +119 -0
- data/lib/spark_toolkit/hadoop/yarn/node.rb +25 -0
- data/lib/spark_toolkit/hadoop.rb +9 -0
- data/lib/spark_toolkit/spark/client.rb +90 -0
- data/lib/spark_toolkit/spark.rb +1 -0
- data/lib/spark_toolkit/version.rb +3 -0
- data/lib/spark_toolkit.rb +7 -0
- data/spark_toolkit.gemspec +29 -0
- metadata +126 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 724f7d68120af23d1127588d5cce846ae9982cc0
  data.tar.gz: 2aec33d6d4aa449a721953e6f4232ae53a75621e
SHA512:
  metadata.gz: 4b7dbfa7811f3b45d5d4c2ab1d02e21aaf354e6e8bcf5b2812ec025b9debf307aa4b1e076a4fd5854249ca3f83e9ef02949bf971ae4a81f903b332dc84856bfa
  data.tar.gz: 25dc646ad44d6967fdc5b2efd957356c26446b883160d13527c07c02457c70fde1ac89996fde64c4bd34155d09b068c9f4adf9fdf0ba815084acc1da7ac0ec4c
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,74 @@
# Contributor Covenant Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, gender identity and expression, level of experience,
nationality, personal appearance, race, religion, or sexual identity and
orientation.

## Our Standards

Examples of behavior that contributes to creating a positive environment
include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery and unwelcome sexual attention or
advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.

## Scope

This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an appointed
representative at an online or offline event. Representation of a project may be
further defined and clarified by project maintainers.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at moyuli@sensetime.com. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at [http://contributor-covenant.org/version/1/4][version]

[homepage]: http://contributor-covenant.org
[version]: http://contributor-covenant.org/version/1/4/
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2017 myl

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,50 @@
# SparkToolkit

SparkToolkit is designed as a Swiss Army knife for interacting with Spark and Hadoop YARN clusters. Whether you need to access HDFS, monitor YARN cluster nodes or job status, or submit and run Spark jobs, SparkToolkit is for you.

## Installation

Add this line to your application's Gemfile:

    gem 'spark_toolkit'

And then execute:

    $ bundle

Or install it yourself as:

    $ gem install spark_toolkit

## Usage

Ensure the `SPARK_HOME` variable is set in your environment:

    $ export SPARK_HOME=/usr/local/spark

First, load all dependency JARs into JRuby:

    Dir.glob("#{ENV['SPARK_HOME']}/jars/*.jar").each { |jar| require jar }

Then require this gem:

    require "spark_toolkit"

For more details, see the documentation under the `docs` directory.
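Putting the pieces together, a minimal JRuby session might look like the sketch below (it assumes a Spark 2.x layout with JARs under `$SPARK_HOME/jars`; the `hdfs-site.xml` path is illustrative):

    # Load every Spark/Hadoop dependency JAR, then the gem itself.
    Dir.glob("#{ENV['SPARK_HOME']}/jars/*.jar").each { |jar| require jar }
    require "spark_toolkit"

    # Build a Hadoop configuration and point it at your cluster settings.
    conf = SparkToolkit::Conf::Configuration.new
    conf.add_resource("hdfs-site.xml")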
## TODO

- Support Spark 1.x
- Support Spark cluster mode
- ~~Add YARN application log analyzer~~

## Contributing

Bug reports and pull requests are welcome on GitHub at https://github.com/myl2821/spark_toolkit. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.

## License

The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
data/Rakefile
ADDED
data/bin/setup
ADDED
data/docs/HDFS.md
ADDED
@@ -0,0 +1,82 @@
# HDFS

This document explains how to get started using SparkToolkit with HDFS.

## Set HDFS Config

Create a new Hadoop config:

```ruby
conf = SparkToolkit::Conf::Configuration.new
```

Set a property; for example, to declare that the HDFS cluster is protected by `kerberos`:

```ruby
conf["hadoop.security.authentication"] = "kerberos"
```

Load settings from an XML file:

```ruby
conf.add_resource("hdfs-site.xml")
```

Load all XML files under a directory:

```ruby
conf.add_config_dir("config-dir")
```

## Get HDFS Instance and Do Operations

```ruby
name_service_url = conf.get('dfs.nameservices')
hdfs = SparkToolkit::HDFS::FileSystem.new(name_service_url, conf)
```

### List Entries

```ruby
hdfs.list '/path'
```

### Copy File From HDFS to Local

```ruby
hdfs.copy_to_local(hdfs_src_path, dst_local_path)
```

### Check if Entry Exists

```ruby
hdfs.exists?(entry_path)
```

### Delete File

```ruby
hdfs.delete(entry_path)
```

### Put File

```ruby
hdfs.put(local_src_path, hdfs_dst_path)
```

### Mkdir

```ruby
hdfs.mkdir(dst_path)
```

### Open and Read File

```ruby
fd = hdfs.open(dfs_path)
line = fd.readline
lines = fd.readlines
chunk = fd.read(4096)
whole_blob = fd.read
```
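As a worked round trip, the calls above compose naturally; this is a sketch only, and every path in it is illustrative:

```ruby
hdfs.mkdir('/tmp/demo')                       # create a working directory
hdfs.put('input.txt', '/tmp/demo/input.txt')  # upload a local file
puts hdfs.list('/tmp/demo')                   # inspect the directory
fd = hdfs.open('/tmp/demo/input.txt')
puts fd.readline                              # read the first line back
hdfs.delete('/tmp/demo', true)                # recursive delete to clean up
```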
data/docs/Spark.md
ADDED
@@ -0,0 +1,29 @@
# Spark

This document explains how to get started using SparkToolkit with Spark.

## Get and Initialize Spark Client Instance

```ruby
spark_client = SparkToolkit::Spark::Client.new(hadoop_conf)
```

## Set Config Metadata

```ruby
spark_conf = spark_client.get_spark_conf
spark_conf.set_app_name "example"
```

## Submit Spark Job

```ruby
# Example: submit the pi Python job
args = ["--class", "org.apache.spark.deploy.PythonRunner",
        "--primary-py-file", "pi.py",
        "--arg", "2"]
spark_client.yarn_deploy_mode(:cluster) # or :client
spark_client.is_python_job(true)
# Submit your job to YARN and get its app_id for later queries
yarn_app_id = spark_client.yarn_submit(args)
# Or run as client, printing all output to the console
spark_client.yarn_run(args)
```
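After a cluster-mode submit, the returned id can be handed straight to the YARN client (see docs/YARN.md); a sketch, assuming `yarn` is an already-started `SparkToolkit::YARN::Client`:

```ruby
app_id = spark_client.yarn_submit(args)
detail = yarn.get_application_report(app_id).get_detail
puts "#{detail[:name]} is #{detail[:state]}, tracking: #{detail[:tracking_url]}"
```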
data/docs/YARN.md
ADDED
@@ -0,0 +1,62 @@
# YARN

This document explains how to get started using SparkToolkit with YARN.

## Get and Initialize YARN Client Instance

```ruby
yarn = SparkToolkit::YARN::Client.new hadoop_conf
yarn.start
```

## YARN Client Operations

The YARN client allows you to monitor the state of all nodes in the cluster, get reports and diagnostics for specific YARN jobs, submit jobs to the cluster, and so on.

### Get Report of Running Nodes

```ruby
node_reports = yarn.get_node_reports
node_reports.each do |report|
  node_state = report.get_node_state
  node_id = report.get_node_id
  num_containers = report.get_num_containers
  node_total_memory = report.get_total_memory
  node_used_memory = report.get_used_memory
  node_vcores = report.get_total_vcores
  node_used_vcores = report.get_used_vcores
end
```

### Get Report of YARN Application

```ruby
app_report = yarn.get_application_report(app_id)
app_report_detail = app_report.get_detail
```

### Get Logs of YARN Application

```ruby
logs = yarn.get_application_logs(app_id) # defaults to :all
formatted_log_stdout = SparkToolkit::YARN::SimpleLogFormatter.format(yarn.get_application_logs(app_id, :stdout))
formatted_log_stderr = SparkToolkit::YARN::SimpleLogFormatter.format(yarn.get_application_logs(app_id, :stderr))
formatted_log_all    = SparkToolkit::YARN::SimpleLogFormatter.format(logs)
```

### Get Reports of YARN Attempts

```ruby
attempts = yarn.get_application_attempts(app_id)
attempts.each do |attempt|
  attempt_id = attempt.get_attempt_id
  attempt_diagnostics = attempt.get_diagnostics
  tracking_url = attempt.get_tracking_url
  attempt_state = attempt.get_state
  am_container_id = attempt.get_am_container_id
end
```

### Kill YARN Job

```ruby
yarn.kill_application(app_id)
```
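Building on `get_application_report`, a small polling loop can wait for an application to terminate; a sketch (the state symbols mirror `YarnApplicationState`, and the five-second interval is an arbitrary choice):

```ruby
loop do
  state = yarn.get_application_report(app_id).get_detail[:state]
  puts "current state: #{state}"
  break if [:FINISHED, :FAILED, :KILLED].include?(state)
  sleep 5
end
```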
data/lib/spark_toolkit/hadoop/conf/configuration.rb
ADDED
@@ -0,0 +1,38 @@
module SparkToolkit
  module Conf
    Configuration = Java::OrgApacheHadoopConf::Configuration
    class Configuration
      java_import org.apache.hadoop.fs.Path

      alias_method :initialise, :initialize
      def initialize(opts={})
        initialise

        default_opts = {
          'fs.hdfs.impl' => 'org.apache.hadoop.hdfs.DistributedFileSystem',
          'fs.file.impl' => 'org.apache.hadoop.fs.LocalFileSystem'
        }

        default_opts.merge(opts).each { |k, v| set(k, v) }
      end

      alias_method :add_resource_java, :add_resource
      def add_resource(f)
        add_resource_java(Path.new(f))
      end

      def []=(k, v)
        set(k, v)
      end

      def [](k)
        get(k)
      end

      # Load *.xml files under input dir
      def add_config_dir(dir)
        Dir.glob("#{dir}/*.xml").each { |f| add_resource(f) }
      end
    end
  end
end
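A brief usage sketch for the patched class above (the property values and directory are illustrative):

```ruby
conf = SparkToolkit::Conf::Configuration.new('dfs.replication' => '2')
conf['hadoop.security.authentication'] = 'kerberos'  # hash-style setter
puts conf['fs.hdfs.impl']  # => "org.apache.hadoop.hdfs.DistributedFileSystem"
conf.add_config_dir('/etc/hadoop/conf')  # loads every *.xml under the directory
```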
data/lib/spark_toolkit/hadoop/hdfs/file_system.rb
ADDED
@@ -0,0 +1,63 @@
module SparkToolkit
  module HDFS
    class FileSystem
      java_import org.apache.hadoop.fs.Path
      java_import org.apache.hadoop.security.UserGroupInformation
      java_import java.net.URI
      java_import org.apache.hadoop.fs.FileUtil

      def initialize(url, conf)
        @url = url
        @hdfs_conf = conf
        UserGroupInformation.set_configuration(@hdfs_conf)
        @hdfs = org.apache.hadoop.fs.FileSystem.get(URI.create(url), @hdfs_conf)
      end

      # ==== Returns
      #
      # * <~HdfsInputStream>
      def open(path)
        @hdfs.open(Path.new(path))
      end

      def list(path, recursively=false)
        if recursively
          paths = []
          dir_itr = @hdfs.listFiles(Path.new(path), true)

          while dir_itr.hasNext
            next_path = dir_itr.next.getPath
            paths << next_path
          end
          paths
        else
          file_status = @hdfs.listStatus(Path.new(path))
          FileUtil.stat2Paths(file_status)
        end
      end
      alias_method :ls, :list

      def copy_to_local(hdfs_src, local_dst)
        @hdfs.copy_to_local_file(false, Path.new(hdfs_src), Path.new(local_dst), true)
      end

      def exists?(path)
        @hdfs.exists(Path.new(path))
      end

      def delete(path, recursively=false)
        @hdfs.delete(Path.new(path), recursively)
      end
      alias_method :rm, :delete

      def put(local_src, hdfs_dst)
        @hdfs.copyFromLocalFile(false, true, Path.new(local_src), Path.new(hdfs_dst))
      end

      def mkdir(path)
        @hdfs.mkdirs(Path.new(path))
      end
      alias_method :mkdir_p, :mkdir
    end
  end
end
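For example, the recursive branch of `list` walks the whole subtree (a sketch; the URL and path are illustrative):

```ruby
hdfs = SparkToolkit::HDFS::FileSystem.new('hdfs://nameservice1', conf)
hdfs.list('/user/demo', true).each { |path| puts path.to_s }
```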
data/lib/spark_toolkit/hadoop/hdfs/reader.rb
ADDED
@@ -0,0 +1,21 @@
class Java::OrgApacheHadoopHdfsClient::HdfsDataInputStream
  def read(*args)
    @io ||= self.to_io
    @io.read(*args)
  end

  def readline
    @io ||= self.to_io
    @io.read_line
  end

  def readlines
    lines = []
    loop do
      line = readline
      line.nil? ? break : lines << line
    end
    lines
  end
end
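With this patch, a chunked copy off HDFS becomes a plain Ruby loop (a sketch; the 4 KiB chunk size and file names are illustrative):

```ruby
fd = hdfs.open('/tmp/demo/input.txt')
File.open('local_copy.txt', 'wb') do |out|
  while (chunk = fd.read(4096))  # read returns nil at EOF
    out.write(chunk)
  end
end
```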
data/lib/spark_toolkit/hadoop/yarn/application.rb
ADDED
@@ -0,0 +1,29 @@
class Java::OrgApacheHadoopYarnApiRecordsImplPb::ApplicationReportPBImpl
  # get_application_id
  # get_application_type
  # get_start_time
  # get_finish_time
  # get_user
  # get_host
  # get_name
  # get_tracking_url
  def get_detail
    {
      id: get_application_id,
      name: get_name,
      user: get_user,
      type: get_application_type,
      host: get_host,
      tracking_url: get_tracking_url,
      start_time: get_start_time,
      finish_time: get_finish_time,
      state: get_yarn_application_state
    }
  end

  # NOTE: For detailed state meanings, view the doc:
  # https://hadoop.apache.org/docs/r2.4.1/api/org/apache/hadoop/yarn/api/records/YarnApplicationState.html
  def get_yarn_application_state
    getYarnApplicationState.to_s.to_sym
  end
end
data/lib/spark_toolkit/hadoop/yarn/attempt.rb
ADDED
@@ -0,0 +1,19 @@
class Java::OrgApacheHadoopYarnApiRecordsImplPb::ApplicationAttemptReportPBImpl
  def get_attempt_id
    getApplicationAttemptId
  end

  # get_diagnostics

  # get_tracking_url

  # get_original_tracking_url

  # get_yarn_application_attempt_state

  # get_am_container_id

  def get_state
    get_yarn_application_attempt_state.to_s.to_sym
  end
end
data/lib/spark_toolkit/hadoop/yarn/client.rb
ADDED
@@ -0,0 +1,44 @@
module SparkToolkit
  module YARN
    Client = Java::OrgApacheHadoopYarnClientApiImpl::YarnClientImpl
    class Client
      alias_method :initalise, :initialize
      def initialize(conf=nil)
        initalise
        @conf = conf
        init conf if conf
      end

      def get_applications
        getApplications.to_a
      end
      # get_application_report(app_id)

      def get_containers(app_id)
        getContainers(app_id).to_a
      end
      # get_container_report(container_id)

      def get_application_attempts(app_id)
        getApplicationAttempts(app_id).to_a
      end
      # get_attempt_report(app_id)

      def get_node_reports
        getNodeReports.to_a
      end

      # Available devs are:
      # - :all
      # - :stdout
      # - :stderr
      def get_application_logs(appid, dev=:all)
        @conf ||= SparkToolkit::Conf::Configuration.new
        @log_accssor ||= SparkToolkit::YARN::LogAccessor.new(@conf)
        @log_accssor.get_logs(appid, dev)
      end

      # kill_application(app_id)
    end
  end
end
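A sketch of browsing a cluster with this client, assuming `hadoop_conf` is a configured `SparkToolkit::Conf::Configuration`:

```ruby
yarn = SparkToolkit::YARN::Client.new(hadoop_conf)
yarn.start

yarn.get_applications.each do |app|
  detail = app.get_detail  # patch from hadoop/yarn/application.rb
  puts "#{detail[:id]}  #{detail[:name]}  #{detail[:state]}"
end
```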
data/lib/spark_toolkit/hadoop/yarn/id.rb
ADDED
@@ -0,0 +1,17 @@
module SparkToolkit
  module YARN
    ID = Java::OrgApacheHadoopYarnApiRecordsImplPb::ApplicationIdPBImpl
    class ID
      java_import org.apache.hadoop.yarn.api.records.ApplicationId
      def self.new_instance(ts, id)
        ApplicationId.new_instance(ts.to_i, id.to_i)
      end

      def get_timestamp
        getClusterTimestamp
      end

      # def get_id
    end
  end
end
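YARN renders ids as `application_<clusterTimestamp>_<sequence>`, so an id string can be rebuilt into a record object (a sketch; the id itself is made up):

```ruby
_, ts, seq = 'application_1485934805623_0042'.split('_')
app_id = SparkToolkit::YARN::ID.new_instance(ts, seq)
puts app_id.get_timestamp  # => 1485934805623
```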
data/lib/spark_toolkit/hadoop/yarn/log.rb
ADDED
@@ -0,0 +1,119 @@
module SparkToolkit
  module YARN
    class LogAccessor
      java_import org.apache.hadoop.security.UserGroupInformation
      java_import org.apache.hadoop.fs.FileContext
      java_import org.apache.hadoop.fs.Path
      java_import org.apache.hadoop.yarn.logaggregation.AggregatedLogFormat

      attr_reader :hconf
      def initialize(hconf)
        @hconf = hconf
        log_root = hconf.get('yarn.nodemanager.remote-app-log-dir')
        username = UserGroupInformation.get_current_user.get_short_user_name
        @log_dir = File.join(log_root, username, 'logs')
        @max_log_len = 1024 * 1024 # 1M log
      end

      def get_logs(appid, dev)
        case dev
        when :stdout
          get_logs_core(appid, ["stdout"])
        when :stderr
          get_logs_core(appid, ["stderr"])
        when :all
          get_logs_core(appid, ["stdout", "stderr"])
        end
      end

      private
      def read_one_log(stream, upload_time, log_type)
        dev = stream.readUTF
        length = stream.readUTF.to_i
        io = stream.to_io
        if log_type.include?(dev.to_s)
          skip_len = [length - @max_log_len, 0].max
          body = io.read(length)
          io.read skip_len
          {
            log_type: dev,
            upload_time: upload_time,
            length: length,
            body: body
          }
        else
          io.read length
          nil
        end
      end

      def read_logs_core(stream, mod_time, log_type)
        res = []
        begin
          loop do
            res << read_one_log(stream, mod_time, log_type)
          end
        rescue Java::JavaIO::EOFException
          res.compact.select { |x| x[:length] != 0 }
        end
      end

      def get_logs_core(appid, log_type)
        path = Path.new("#{@log_dir}/#{appid}")
        qdir = FileContext.getFileContext(hconf).makeQualified(path)
        files = FileContext.getFileContext(qdir.toUri(), hconf).listStatus(path)
        res = []
        while files.has_next
          file = files.next
          reader = AggregatedLogFormat::LogReader.new(hconf, file.get_path)
          key = AggregatedLogFormat::LogKey.new
          stream = reader.next(key)
          until stream.nil?
            parsed = {
              container: key.to_s,
              file: file.get_path.get_name,
              content: read_logs_core(stream, file_mod_time(file), log_type)
            }
            res << parsed
            stream = reader.next(key)
          end
        end
        res.reject { |x| x[:content].empty? }
      end

      def file_mod_time(file)
        Time.at(file.get_modification_time / 1000).to_s
      end
    end

    class LogFormatter
      def self.format(input)
        fail "Called from base class!"
      end
    end

    class SimpleLogFormatter < LogFormatter
      def self.format(formatables)
        formatted = ""
        formatables.each do |formatable|
          formatable[:content].each do |content|
            formatted << 'Container: ' << formatable[:container] << "\n"
            formatted << 'Node: ' << formatable[:file] << "\n"
            formatted << 'Log Type: ' << content[:log_type] << "\n"
            formatted << 'Log Upload Time: ' << content[:upload_time] << "\n"
            formatted << 'Log Length: ' << content[:length].to_s << "\n"
            formatted << '='*80 << "\n"
            formatted << content[:body] << "\n"*4
          end
        end
        formatted
      end
    end

    class HTMLLogFormatter < LogFormatter
      def self.format(formatables)
        fail NotImplementedError
      end
    end
  end
end
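End to end, fetching and pretty-printing an application's aggregated logs might look like this (a sketch that assumes log aggregation is enabled and `yarn.nodemanager.remote-app-log-dir` is set on the cluster):

```ruby
logs = yarn.get_application_logs(app_id, :stderr)  # goes through LogAccessor
puts SparkToolkit::YARN::SimpleLogFormatter.format(logs)
```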
data/lib/spark_toolkit/hadoop/yarn/node.rb
ADDED
@@ -0,0 +1,25 @@
class Java::OrgApacheHadoopYarnApiRecordsImplPb::NodeReportPBImpl
  # get_node_id

  # get_num_containers

  def get_node_state
    getNodeState.to_s.to_sym
  end

  def get_total_memory
    getCapability.getMemory
  end

  def get_used_memory
    getUsed.getMemory
  end

  def get_total_vcores
    getCapability.getVirtualCores
  end

  def get_used_vcores
    getUsed.getVirtualCores
  end
end
data/lib/spark_toolkit/hadoop.rb
ADDED
@@ -0,0 +1,9 @@
require 'spark_toolkit/hadoop/hdfs/file_system'
require 'spark_toolkit/hadoop/hdfs/reader'
require 'spark_toolkit/hadoop/conf/configuration'
require 'spark_toolkit/hadoop/yarn/client'
require 'spark_toolkit/hadoop/yarn/application'
require 'spark_toolkit/hadoop/yarn/attempt'
require 'spark_toolkit/hadoop/yarn/node'
require 'spark_toolkit/hadoop/yarn/id'
require 'spark_toolkit/hadoop/yarn/log'
data/lib/spark_toolkit/spark/client.rb
ADDED
@@ -0,0 +1,90 @@
module SparkToolkit
  module Spark
    class Client
      java_import org.apache.hadoop.security.UserGroupInformation
      java_import org.apache.spark.deploy.SparkHadoopUtil

      def initialize(hconf)
        @hconf = hconf
        UserGroupInformation.set_configuration(@hconf)
        @sconf = org.apache.spark.SparkConf.new
        @sconf.set_spark_home(ENV['SPARK_HOME']) if ENV['SPARK_HOME']
      end

      def get_spark_conf
        @sconf
      end

      def set_app_name s
        @sconf.set_app_name s
      end

      # ==== Returns
      #
      # <~AppID>
      def yarn_submit(args)
        prepare_yarn_propreties
        begin
          cli_args = org.apache.spark.deploy.yarn.ClientArguments.new(args)
        rescue ArgumentError # Spark 1.x
          cli_args = org.apache.spark.deploy.yarn.ClientArguments.new(args, @sconf)
        end
        client = org.apache.spark.deploy.yarn.Client.new(cli_args, @hconf, @sconf)
        client.submit_application
      end

      def yarn_run(args)
        prepare_yarn_propreties
        begin
          cli_args = org.apache.spark.deploy.yarn.ClientArguments.new(args)
        rescue ArgumentError # Spark 1.x
          cli_args = org.apache.spark.deploy.yarn.ClientArguments.new(args, @sconf)
        end
        client = org.apache.spark.deploy.yarn.Client.new(cli_args, @hconf, @sconf)
        client.run
      end

      def is_python_job t
        if t
          @sconf.set('spark.yarn.isPython', 'true')
        else
          @sconf.set('spark.yarn.isPython', 'false')
        end
      end

      def yarn_deploy_mode mode
        case mode
        when :cluster
          @sconf.set('spark.submit.deployMode', 'cluster')
        when :client
          @sconf.set('spark.submit.deployMode', 'client')
        else
          fail "Unsupported deploy mode!"
        end
      end

      def active_kerberos
        prepare_yarn_propreties

        @sconf.set("spark.hadoop.hadoop.security.authentication", "kerberos")
        @sconf.set("spark.hadoop.hadoop.security.authorization", "true")

        UserGroupInformation.set_configuration(SparkHadoopUtil.get.newConfiguration(@sconf))
        credentials = UserGroupInformation.getLoginUser.getCredentials
        SparkHadoopUtil.get.addCurrentUserCredentials(credentials)
      end

      private
      def prepare_yarn_propreties
        @sconf.set_master('yarn')
        begin
          @sconf.get('spark.submit.deployMode')
        rescue
          @sconf.set('spark.submit.deployMode', 'cluster')
        end

        java.lang.System.setProperty("SPARK_YARN_MODE", "true")
      end
    end
  end
end
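On a Kerberized cluster the submit flow gains one call; a sketch, assuming the JVM already holds a Kerberos login (via kinit or a keytab):

```ruby
spark_client = SparkToolkit::Spark::Client.new(hadoop_conf)
spark_client.active_kerberos             # push Kerberos settings into the SparkConf
spark_client.yarn_deploy_mode(:cluster)
app_id = spark_client.yarn_submit(args)  # credentials travel with the submission
```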
data/lib/spark_toolkit/spark.rb
ADDED
@@ -0,0 +1 @@
require 'spark_toolkit/spark/client'
data/spark_toolkit.gemspec
ADDED
@@ -0,0 +1,29 @@
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'spark_toolkit/version'

Gem::Specification.new do |spec|
  spec.name          = "spark_toolkit"
  spec.version       = SparkToolkit::VERSION
  spec.authors       = ["Yuli Mo"]
  spec.email         = ["lizz@lizz.me"]

  spec.summary       = %q{Yet Another Jruby Spark toolkit.}
  spec.description   = %q{Yet Another Jruby Spark toolkit.}
  spec.homepage      = "https://github.com/myl2821/spark_toolkit"
  spec.license       = "MIT"
  spec.platform      = "java"

  spec.files         = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(test|spec|features)/})
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.13"
  spec.add_development_dependency "pry"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "minitest", "~> 5.0"
end
metadata
ADDED
@@ -0,0 +1,126 @@
--- !ruby/object:Gem::Specification
name: spark_toolkit
version: !ruby/object:Gem::Version
  version: 0.1.0
platform: java
authors:
- Yuli Mo
autorequire:
bindir: exe
cert_chain: []
date: 2017-02-04 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.13'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.13'
- !ruby/object:Gem::Dependency
  name: pry
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.0'
- !ruby/object:Gem::Dependency
  name: minitest
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '5.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '5.0'
description: Yet Another Jruby Spark toolkit.
email:
- lizz@lizz.me
executables: []
extensions: []
extra_rdoc_files: []
files:
- ".gitignore"
- ".travis.yml"
- CODE_OF_CONDUCT.md
- Gemfile
- LICENSE.txt
- README.md
- Rakefile
- bin/setup
- docs/HDFS.md
- docs/Spark.md
- docs/YARN.md
- lib/spark_toolkit.rb
- lib/spark_toolkit/hadoop.rb
- lib/spark_toolkit/hadoop/conf/configuration.rb
- lib/spark_toolkit/hadoop/hdfs/file_system.rb
- lib/spark_toolkit/hadoop/hdfs/reader.rb
- lib/spark_toolkit/hadoop/yarn/application.rb
- lib/spark_toolkit/hadoop/yarn/attempt.rb
- lib/spark_toolkit/hadoop/yarn/client.rb
- lib/spark_toolkit/hadoop/yarn/id.rb
- lib/spark_toolkit/hadoop/yarn/log.rb
- lib/spark_toolkit/hadoop/yarn/node.rb
- lib/spark_toolkit/spark.rb
- lib/spark_toolkit/spark/client.rb
- lib/spark_toolkit/version.rb
- spark_toolkit.gemspec
homepage: https://github.com/myl2821/spark_toolkit
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.5.1
signing_key:
specification_version: 4
summary: Yet Another Jruby Spark toolkit.
test_files: []