ruby-spark 1.1.0.1 → 1.2.0
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.travis.yml +15 -0
- data/CHANGELOG.md +8 -0
- data/README.md +184 -57
- data/TODO.md +3 -1
- data/ext/spark/build.sbt +5 -5
- data/ext/spark/src/main/scala/RubyWorker.scala +7 -16
- data/lib/spark.rb +69 -10
- data/lib/spark/accumulator.rb +8 -0
- data/lib/spark/broadcast.rb +7 -0
- data/lib/spark/build.rb +10 -10
- data/lib/spark/cli.rb +68 -76
- data/lib/spark/config.rb +13 -17
- data/lib/spark/context.rb +10 -7
- data/lib/spark/error.rb +4 -0
- data/lib/spark/helper/statistic.rb +5 -1
- data/lib/spark/java_bridge.rb +5 -3
- data/lib/spark/java_bridge/base.rb +15 -15
- data/lib/spark/java_bridge/jruby.rb +3 -1
- data/lib/spark/java_bridge/rjb.rb +2 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +10 -2
- data/lib/spark/mllib/classification/svm.rb +10 -2
- data/lib/spark/mllib/clustering/kmeans.rb +6 -2
- data/lib/spark/mllib/regression/lasso.rb +18 -2
- data/lib/spark/mllib/regression/linear.rb +11 -3
- data/lib/spark/mllib/regression/ridge.rb +18 -2
- data/lib/spark/rdd.rb +11 -2
- data/lib/spark/serializer.rb +1 -1
- data/lib/spark/serializer/auto_batched.rb +7 -0
- data/lib/spark/version.rb +1 -1
- data/ruby-spark.gemspec +4 -5
- data/spec/generator.rb +1 -1
- data/spec/lib/collect_spec.rb +10 -10
- data/spec/lib/config_spec.rb +10 -10
- data/spec/lib/context_spec.rb +116 -115
- data/spec/lib/ext_spec.rb +17 -17
- data/spec/lib/external_apps_spec.rb +1 -1
- data/spec/lib/filter_spec.rb +17 -17
- data/spec/lib/flat_map_spec.rb +22 -19
- data/spec/lib/group_spec.rb +22 -19
- data/spec/lib/helper_spec.rb +60 -12
- data/spec/lib/key_spec.rb +9 -8
- data/spec/lib/manipulation_spec.rb +15 -15
- data/spec/lib/map_partitions_spec.rb +6 -4
- data/spec/lib/map_spec.rb +22 -19
- data/spec/lib/reduce_by_key_spec.rb +19 -19
- data/spec/lib/reduce_spec.rb +22 -20
- data/spec/lib/sample_spec.rb +13 -12
- data/spec/lib/serializer_spec.rb +27 -0
- data/spec/lib/sort_spec.rb +16 -14
- data/spec/lib/statistic_spec.rb +4 -2
- data/spec/lib/whole_text_files_spec.rb +9 -8
- data/spec/spec_helper.rb +3 -3
- metadata +19 -18
data/lib/spark.rb
CHANGED
@@ -1,5 +1,6 @@
 # Gems and libraries
 require 'method_source'
+require 'securerandom'
 require 'forwardable'
 require 'sourcify'
 require 'socket'
@@ -29,6 +30,8 @@ module Spark

   include Helper::System

+  DEFAULT_CONFIG_FILE = File.join(Dir.home, '.ruby-spark.conf')
+
   def self.print_logo(message=nil)
     puts <<-STRING

@@ -107,6 +110,63 @@ module Spark
     !!@context
   end

+
+  # ===============================================================================
+  # Defaults
+
+  # Load default configuration for Spark and RubySpark
+  # By default are values stored at ~/.ruby-spark.conf
+  # File is automatically created
+  def self.load_defaults
+    unless File.exists?(DEFAULT_CONFIG_FILE)
+      save_defaults_to(DEFAULT_CONFIG_FILE)
+    end
+
+    load_defaults_from(DEFAULT_CONFIG_FILE)
+  end
+
+  # Clear prev setting and load new from file
+  def self.load_defaults_from(file_path)
+    # Parse values
+    values = File.readlines(file_path)
+    values.map!(&:strip)
+    values.select!{|value| value.start_with?('gem.')}
+    values.map!{|value| value.split(nil, 2)}
+    values = Hash[values]
+
+    # Clear prev values
+    @target_dir = nil
+    @ruby_spark_jar = nil
+    @spark_home = nil
+
+    # Load new
+    @target_dir = values['gem.target']
+  end
+
+  # Create target dir and new config file
+  def self.save_defaults_to(file_path)
+    dir = File.join(Dir.home, ".ruby-spark.#{SecureRandom.uuid}")
+
+    if Dir.exist?(dir)
+      save_defaults_to(file_path)
+    else
+      Dir.mkdir(dir, 0700)
+      file = File.open(file_path, 'w')
+      file.puts "# Directory where will be Spark saved"
+      file.puts "gem.target #{dir}"
+      file.puts ""
+      file.puts "# You can also defined spark properties"
+      file.puts "# spark.master spark://master:7077"
+      file.puts "# spark.ruby.serializer marshal"
+      file.puts "# spark.ruby.serializer.batch_size 2048"
+      file.close
+    end
+  end
+
+
+  # ===============================================================================
+  # Global settings and variables
+
   def self.logger
     @logger ||= Spark::Logger.new
   end
@@ -116,10 +176,6 @@ module Spark
     @root ||= File.expand_path('..', File.dirname(__FILE__))
   end

-  def self.home
-    root
-  end
-
   # Default directory for java extensions
   def self.target_dir
     @target_dir ||= File.join(root, 'target')
@@ -146,17 +202,16 @@ module Spark
   # Cannot load before CLI::install
   #
   # == Parameters:
-  #
+  # target::
   #   path to directory where are located sparks .jar files or single Spark jar
   #
-  def self.load_lib(
+  def self.load_lib(target=nil)
     return if @java_bridge

-
+    target ||= Spark.target_dir

-
-    @java_bridge
-    @java_bridge.load
+    @java_bridge = JavaBridge.init(target)
+    @java_bridge.import_all
     nil
   end

@@ -169,6 +224,7 @@ module Spark
   class << self
     alias_method :sc, :context
     alias_method :jb, :java_bridge
+    alias_method :home, :root
   end

 end
@@ -189,6 +245,9 @@ require 'spark/ext/io'
 require 'spark/version'
 require 'spark/error'

+# Load default settings for gem and Spark
+Spark.load_defaults
+
 # Make sure that Spark be always stopped
 Kernel.at_exit do
   begin
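Pieced together from `save_defaults_to` above, a freshly generated `~/.ruby-spark.conf` looks like this (`<uuid>` stands for the random `SecureRandom.uuid` suffix, which varies per machine):

  # Directory where will be Spark saved
  gem.target /home/user/.ruby-spark.<uuid>

  # You can also defined spark properties
  # spark.master spark://master:7077
  # spark.ruby.serializer marshal
  # spark.ruby.serializer.batch_size 2048

`load_defaults_from` consumes only the `gem.*` keys, splitting each line on the first run of whitespace; uncommented `spark.*` lines take effect separately, because `Spark::Config#initialize` now reads the same file via `from_file` (see the config.rb diff below).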
data/lib/spark/accumulator.rb
CHANGED
@@ -77,6 +77,14 @@ module Spark
       @@instances[@id] = self
     end

+    def inspect
+      result  = %{#<#{self.class.name}:0x#{object_id}\n}
+      result << %{ ID: #{@id}\n}
+      result << %{ Zero: #{@zero_value.to_s[0, 10]}\n}
+      result << %{Value: #{@value.to_s[0, 10]}>}
+      result
+    end
+
     def self.changed
       @@changed
     end
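Given the format strings above, an accumulator with ID 0, zero value 0 and current value 5 renders roughly as follows (object id illustrative, padding approximate):

  #<Spark::Accumulator:0x70312861741060
   ID: 0
   Zero: 0
  Value: 5>

Both `Zero` and `Value` are cut to their first 10 characters, so `inspect` stays readable even when the accumulated value is large. `Spark::Broadcast#inspect` below follows the same pattern, minus the `Zero` line.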
data/lib/spark/broadcast.rb
CHANGED
@@ -61,6 +61,13 @@ module Spark
       ObjectSpace.define_finalizer(self, proc { File.unlink(@path) })
     end

+    def inspect
+      result  = %{#<#{self.class.name}:0x#{object_id}\n}
+      result << %{ ID: #{@id}\n}
+      result << %{Value: #{@value.to_s[0, 10]}>}
+      result
+    end
+
     def self.register(id, path)
       @@registered[id] = path
     end
data/lib/spark/build.rb
CHANGED
@@ -3,7 +3,7 @@ module Spark

   DEFAULT_SCALA_VERSION = '2.10.4'
   DEFAULT_CORE_VERSION = '2.10'
-  DEFAULT_SPARK_VERSION = '1.
+  DEFAULT_SPARK_VERSION = '1.4.0'
   DEFAULT_HADOOP_VERSION = '1.0.4'

   SBT = 'sbt/sbt'
@@ -11,20 +11,20 @@ module Spark
   SBT_EXT = 'package'
   SBT_CLEAN = 'clean'

-  def self.build(options)
-
-
-
-
-
-    only_ext
+  def self.build(options={})
+    scala_version      = options[:scala_version]      || DEFAULT_SCALA_VERSION
+    spark_core_version = options[:spark_core_version] || DEFAULT_CORE_VERSION
+    spark_version      = options[:spark_version]      || DEFAULT_SPARK_VERSION
+    hadoop_version     = options[:hadoop_version]     || DEFAULT_HADOOP_VERSION
+    target             = options[:target]             || Spark.target_dir
+    only_ext           = options[:only_ext]           || false

     env = {
       'SCALA_VERSION' => scala_version,
       'SPARK_VERSION' => spark_version,
-      'SPARK_CORE_VERSION' =>
+      'SPARK_CORE_VERSION' => spark_core_version,
       'HADOOP_VERSION' => hadoop_version,
-      '
+      'TARGET_DIR' => target
     }

     cmd = [SBT]
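Because `build` now pulls every setting from an options hash with fallbacks to the defaults above, it can be driven straight from Ruby. A minimal sketch, assuming the gem is installed and `sbt` is available (all keys optional; `:target` falls back to `Spark.target_dir`):

  require 'ruby-spark'  # gem entry point

  # Build Spark 1.4.0 and the gem extension into a custom directory.
  Spark::Build.build(
    spark_version:  '1.4.0',
    hadoop_version: '1.0.4',
    target:         '/opt/ruby-spark'
  )

  # Rebuild only the RubySpark extension:
  Spark::Build.build(only_ext: true)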
data/lib/spark/cli.rb
CHANGED
@@ -13,8 +13,8 @@ module Spark
   class CLI
     include Commander::Methods

-    IRB_HISTORY_FILE = File.join(Dir.home, '.irb_spark_history')
-    IRB_HISTORY_SIZE = 100
+    # IRB_HISTORY_FILE = File.join(Dir.home, '.irb_spark_history')
+    # IRB_HISTORY_SIZE = 100

     def run
       program :name, 'RubySpark'
@@ -29,22 +29,15 @@ module Spark
       command :build do |c|
         c.syntax = 'build [options]'
         c.description = 'Build spark and gem extensions'
-        c.option '--hadoop-version STRING', String, 'Version of hadoop which will
-        c.option '--spark-
-        c.option '--spark-
-        c.option '--spark-version STRING', String, 'Version of SPARK'
+        c.option '--hadoop-version STRING', String, 'Version of hadoop which will assembled with the Spark'
+        c.option '--spark-core-version STRING', String, 'Version of Spark core'
+        c.option '--spark-version STRING', String, 'Version of Spark'
         c.option '--scala-version STRING', String, 'Version of Scala'
-        c.option '--
+        c.option '--target STRING', String, 'Directory where Spark will be stored'
+        c.option '--only-ext', 'Build only extension for RubySpark'

         c.action do |args, options|
-
-          spark_home: Spark.target_dir,
-          spark_core: Spark::Build::DEFAULT_CORE_VERSION,
-          spark_version: Spark::Build::DEFAULT_SPARK_VERSION,
-          scala_version: Spark::Build::DEFAULT_SCALA_VERSION,
-          only_ext: false
-
-          Spark::Build.build(options)
+          Spark::Build.build(options.__hash__)
           puts
           puts 'Everything is OK'
         end
@@ -52,23 +45,23 @@ module Spark
       alias_command :install, :build


-      #
-      command :
-        c.syntax = '
+      # Shell -----------------------------------------------------------------
+      command :shell do |c|
+        c.syntax = 'shell [options]'
         c.description = 'Start ruby shell for spark'
-        c.option '--
+        c.option '--target STRING', String, 'Directory where Spark is stored'
         c.option '--properties-file STRING', String, 'Path to a file from which to load extra properties'
-        c.option '--[no-]start', 'Start
+        c.option '--[no-]start', 'Start Spark immediately'
         c.option '--[no-]logger', 'Enable/disable logger (default: enable)'

         c.action do |args, options|
           options.default start: true, logger: true

-          Spark.load_lib(options.
-          Spark
+          Spark.load_lib(options.target)
+          Spark.logger.disable unless options.logger

           Spark.config do
-            set_app_name '
+            set_app_name 'RubySpark'
           end

           Spark.config.from_file(options.properties_file)
@@ -88,61 +81,60 @@ module Spark
           Pry.start
         end
       end
-      alias_command :shell, :pry


-      # IRB -------------------------------------------------------------------
-      command :irb do |c|
-        c.syntax = 'irb [options]'
-        c.description = 'Start ruby shell for spark'
-        c.option '--spark-home STRING', String, 'Directory where Spark is stored'
-        c.option '--[no-]start', 'Start Spark immediately'
-        c.option '--[no-]logger', 'Enable/disable logger (default: enable)'
-
-        c.action do |args, options|
-          options.default start: true, logger: true
-
-          Spark.load_lib(options.spark_home)
-          Spark::Logger.disable unless options.logger
-
-          Spark.config do
-            set_app_name 'Pry RubySpark'
-          end
-
-          if options.start
-            # Load Java and Spark
-            Spark.start
-            $sc = Spark.context
-
-            Spark.print_logo('Spark context is loaded as $sc')
-          else
-            Spark.print_logo('You can start Spark with Spark.start')
-          end
-
-          # Load IRB
-          require 'irb'
-          require 'irb/completion'
-          require 'irb/ext/save-history'
-
-          begin
-            file = File.expand_path(IRB_HISTORY_FILE)
-            if File.exists?(file)
-              lines = IO.readlines(file).collect { |line| line.chomp }
-              Readline::HISTORY.push(*lines)
-            end
-            Kernel.at_exit do
-              lines = Readline::HISTORY.to_a.reverse.uniq.reverse
-              lines = lines[-IRB_HISTORY_SIZE, IRB_HISTORY_SIZE] if lines.nitems > IRB_HISTORY_SIZE
-              File.open(IRB_HISTORY_FILE, File::WRONLY | File::CREAT | File::TRUNC) { |io| io.puts lines.join("\n") }
-            end
-          rescue
-          end
-
-          ARGV.clear # Clear Thor ARGV, otherwise IRB will parse it
-          ARGV.concat ['--readline', '--prompt-mode', 'simple']
-          IRB.start
-        end
-      end
+      # # IRB -------------------------------------------------------------------
+      # command :irb do |c|
+      #   c.syntax = 'irb [options]'
+      #   c.description = 'Start ruby shell for spark'
+      #   c.option '--spark-home STRING', String, 'Directory where Spark is stored'
+      #   c.option '--[no-]start', 'Start Spark immediately'
+      #   c.option '--[no-]logger', 'Enable/disable logger (default: enable)'
+      #
+      #   c.action do |args, options|
+      #     options.default start: true, logger: true
+      #
+      #     Spark.load_lib(options.spark_home)
+      #     Spark::Logger.disable unless options.logger
+      #
+      #     Spark.config do
+      #       set_app_name 'Pry RubySpark'
+      #     end
+      #
+      #     if options.start
+      #       # Load Java and Spark
+      #       Spark.start
+      #       $sc = Spark.context
+      #
+      #       Spark.print_logo('Spark context is loaded as $sc')
+      #     else
+      #       Spark.print_logo('You can start Spark with Spark.start')
+      #     end
+      #
+      #     # Load IRB
+      #     require 'irb'
+      #     require 'irb/completion'
+      #     require 'irb/ext/save-history'
+      #
+      #     begin
+      #       file = File.expand_path(IRB_HISTORY_FILE)
+      #       if File.exists?(file)
+      #         lines = IO.readlines(file).collect { |line| line.chomp }
+      #         Readline::HISTORY.push(*lines)
+      #       end
+      #       Kernel.at_exit do
+      #         lines = Readline::HISTORY.to_a.reverse.uniq.reverse
+      #         lines = lines[-IRB_HISTORY_SIZE, IRB_HISTORY_SIZE] if lines.nitems > IRB_HISTORY_SIZE
+      #         File.open(IRB_HISTORY_FILE, File::WRONLY | File::CREAT | File::TRUNC) { |io| io.puts lines.join("\n") }
+      #       end
+      #     rescue
+      #     end
+      #
+      #     ARGV.clear # Clear Thor ARGV, otherwise IRB will parse it
+      #     ARGV.concat ['--readline', '--prompt-mode', 'simple']
+      #     IRB.start
+      #   end
+      # end


       # Home ------------------------------------------------------------------
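Taken together, the reworked `build` options and the `pry`-to-`shell` rename translate into invocations like these (sketch assuming the gem's `ruby-spark` executable; `install` remains an alias for `build`):

  # Build Spark plus the gem extension into a custom directory
  $ ruby-spark build --spark-version 1.4.0 --target ~/spark

  # Start the Pry-based shell against that directory,
  # without starting Spark immediately
  $ ruby-spark shell --target ~/spark --no-start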
data/lib/spark/config.rb
CHANGED
@@ -16,6 +16,7 @@ module Spark
     def initialize
       @spark_conf = SparkConf.new(true)
       set_default
+      from_file(Spark::DEFAULT_CONFIG_FILE)
     end

     def from_file(file)
@@ -140,11 +141,11 @@ module Spark
       set('spark.ruby.serializer', default_serializer)
       set('spark.ruby.serializer.compress', default_serializer_compress)
       set('spark.ruby.serializer.batch_size', default_serializer_batch_size)
-      set('spark.ruby.executor.uri', default_executor_uri)
       set('spark.ruby.executor.command', default_executor_command)
       set('spark.ruby.executor.options', default_executor_options)
       set('spark.ruby.worker.type', default_worker_type)
       load_executor_envs
+      # set('spark.ruby.executor.install', default_executor_install)
     end

     def default_serializer
@@ -159,21 +160,6 @@ module Spark
       ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE
     end

-    # Ruby executor.
-    #
-    # == Options:
-    # nil::
-    #   System's gem is loaded (ruby-spark).
-    #
-    # other::
-    #   Path of library which will be used.
-    #   Current ruby-spark gem is used.
-    #   (default)
-    #
-    def default_executor_uri
-      ENV['SPARK_RUBY_EXECUTOR_URI'] || ''
-    end
-
     # Command template which is applied when scala want create a ruby
     # process (e.g. master, home request). Command is represented by '%s'.
     #
@@ -186,13 +172,23 @@ module Spark

     # Options for every worker.
     #
-    # ==
+    # == Example:
     #   -J-Xmx512m
     #
     def default_executor_options
       ENV['SPARK_RUBY_EXECUTOR_OPTIONS'] || ''
     end

+    # # Install command which is triggered before on start.
+    # # This command using executor command template.
+    # #
+    # # == Example:
+    # #   gem install ruby-spark -v 1.2.0
+    # #
+    # def default_executor_install
+    #   ENV['SPARK_RUBY_EXECUTOR_INSTALL'] || ''
+    # end
+
     # Type of worker.
     #
     # == Options: