ruby-spark 1.1.0.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.travis.yml +15 -0
- data/CHANGELOG.md +8 -0
- data/README.md +184 -57
- data/TODO.md +3 -1
- data/ext/spark/build.sbt +5 -5
- data/ext/spark/src/main/scala/RubyWorker.scala +7 -16
- data/lib/spark.rb +69 -10
- data/lib/spark/accumulator.rb +8 -0
- data/lib/spark/broadcast.rb +7 -0
- data/lib/spark/build.rb +10 -10
- data/lib/spark/cli.rb +68 -76
- data/lib/spark/config.rb +13 -17
- data/lib/spark/context.rb +10 -7
- data/lib/spark/error.rb +4 -0
- data/lib/spark/helper/statistic.rb +5 -1
- data/lib/spark/java_bridge.rb +5 -3
- data/lib/spark/java_bridge/base.rb +15 -15
- data/lib/spark/java_bridge/jruby.rb +3 -1
- data/lib/spark/java_bridge/rjb.rb +2 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +10 -2
- data/lib/spark/mllib/classification/svm.rb +10 -2
- data/lib/spark/mllib/clustering/kmeans.rb +6 -2
- data/lib/spark/mllib/regression/lasso.rb +18 -2
- data/lib/spark/mllib/regression/linear.rb +11 -3
- data/lib/spark/mllib/regression/ridge.rb +18 -2
- data/lib/spark/rdd.rb +11 -2
- data/lib/spark/serializer.rb +1 -1
- data/lib/spark/serializer/auto_batched.rb +7 -0
- data/lib/spark/version.rb +1 -1
- data/ruby-spark.gemspec +4 -5
- data/spec/generator.rb +1 -1
- data/spec/lib/collect_spec.rb +10 -10
- data/spec/lib/config_spec.rb +10 -10
- data/spec/lib/context_spec.rb +116 -115
- data/spec/lib/ext_spec.rb +17 -17
- data/spec/lib/external_apps_spec.rb +1 -1
- data/spec/lib/filter_spec.rb +17 -17
- data/spec/lib/flat_map_spec.rb +22 -19
- data/spec/lib/group_spec.rb +22 -19
- data/spec/lib/helper_spec.rb +60 -12
- data/spec/lib/key_spec.rb +9 -8
- data/spec/lib/manipulation_spec.rb +15 -15
- data/spec/lib/map_partitions_spec.rb +6 -4
- data/spec/lib/map_spec.rb +22 -19
- data/spec/lib/reduce_by_key_spec.rb +19 -19
- data/spec/lib/reduce_spec.rb +22 -20
- data/spec/lib/sample_spec.rb +13 -12
- data/spec/lib/serializer_spec.rb +27 -0
- data/spec/lib/sort_spec.rb +16 -14
- data/spec/lib/statistic_spec.rb +4 -2
- data/spec/lib/whole_text_files_spec.rb +9 -8
- data/spec/spec_helper.rb +3 -3
- metadata +19 -18
data/lib/spark.rb CHANGED

@@ -1,5 +1,6 @@
 # Gems and libraries
 require 'method_source'
+require 'securerandom'
 require 'forwardable'
 require 'sourcify'
 require 'socket'
@@ -29,6 +30,8 @@ module Spark

   include Helper::System

+  DEFAULT_CONFIG_FILE = File.join(Dir.home, '.ruby-spark.conf')
+
   def self.print_logo(message=nil)
     puts <<-STRING

@@ -107,6 +110,63 @@ module Spark
     !!@context
   end

+
+  # ===============================================================================
+  # Defaults
+
+  # Load default configuration for Spark and RubySpark
+  # By default are values stored at ~/.ruby-spark.conf
+  # File is automatically created
+  def self.load_defaults
+    unless File.exists?(DEFAULT_CONFIG_FILE)
+      save_defaults_to(DEFAULT_CONFIG_FILE)
+    end
+
+    load_defaults_from(DEFAULT_CONFIG_FILE)
+  end
+
+  # Clear prev setting and load new from file
+  def self.load_defaults_from(file_path)
+    # Parse values
+    values = File.readlines(file_path)
+    values.map!(&:strip)
+    values.select!{|value| value.start_with?('gem.')}
+    values.map!{|value| value.split(nil, 2)}
+    values = Hash[values]
+
+    # Clear prev values
+    @target_dir = nil
+    @ruby_spark_jar = nil
+    @spark_home = nil
+
+    # Load new
+    @target_dir = values['gem.target']
+  end
+
+  # Create target dir and new config file
+  def self.save_defaults_to(file_path)
+    dir = File.join(Dir.home, ".ruby-spark.#{SecureRandom.uuid}")
+
+    if Dir.exist?(dir)
+      save_defaults_to(file_path)
+    else
+      Dir.mkdir(dir, 0700)
+      file = File.open(file_path, 'w')
+      file.puts "# Directory where will be Spark saved"
+      file.puts "gem.target #{dir}"
+      file.puts ""
+      file.puts "# You can also defined spark properties"
+      file.puts "# spark.master spark://master:7077"
+      file.puts "# spark.ruby.serializer marshal"
+      file.puts "# spark.ruby.serializer.batch_size 2048"
+      file.close
+    end
+  end
+
+
+  # ===============================================================================
+  # Global settings and variables
+
   def self.logger
     @logger ||= Spark::Logger.new
   end
@@ -116,10 +176,6 @@ module Spark
     @root ||= File.expand_path('..', File.dirname(__FILE__))
   end

-  def self.home
-    root
-  end
-
   # Default directory for java extensions
   def self.target_dir
     @target_dir ||= File.join(root, 'target')
@@ -146,17 +202,16 @@ module Spark
   # Cannot load before CLI::install
   #
   # == Parameters:
-  #
+  # target::
   #   path to directory where are located sparks .jar files or single Spark jar
   #
-  def self.load_lib(
+  def self.load_lib(target=nil)
     return if @java_bridge

-
+    target ||= Spark.target_dir

-
-    @java_bridge
-    @java_bridge.load
+    @java_bridge = JavaBridge.init(target)
+    @java_bridge.import_all
     nil
   end

@@ -169,6 +224,7 @@ module Spark
   class << self
     alias_method :sc, :context
     alias_method :jb, :java_bridge
+    alias_method :home, :root
   end

 end
@@ -189,6 +245,9 @@ require 'spark/ext/io'
 require 'spark/version'
 require 'spark/error'

+# Load default settings for gem and Spark
+Spark.load_defaults
+
 # Make sure that Spark be always stopped
 Kernel.at_exit do
   begin
data/lib/spark/accumulator.rb CHANGED

@@ -77,6 +77,14 @@ module Spark
       @@instances[@id] = self
     end

+    def inspect
+      result  = %{#<#{self.class.name}:0x#{object_id}\n}
+      result << %{   ID: #{@id}\n}
+      result << %{ Zero: #{@zero_value.to_s[0, 10]}\n}
+      result << %{Value: #{@value.to_s[0, 10]}>}
+      result
+    end
+
     def self.changed
       @@changed
     end
data/lib/spark/broadcast.rb CHANGED

@@ -61,6 +61,13 @@ module Spark
       ObjectSpace.define_finalizer(self, proc { File.unlink(@path) })
     end

+    def inspect
+      result  = %{#<#{self.class.name}:0x#{object_id}\n}
+      result << %{   ID: #{@id}\n}
+      result << %{Value: #{@value.to_s[0, 10]}>}
+      result
+    end
+
     def self.register(id, path)
       @@registered[id] = path
     end
data/lib/spark/build.rb CHANGED

@@ -3,7 +3,7 @@ module Spark

   DEFAULT_SCALA_VERSION  = '2.10.4'
   DEFAULT_CORE_VERSION   = '2.10'
-  DEFAULT_SPARK_VERSION  = '1.
+  DEFAULT_SPARK_VERSION  = '1.4.0'
   DEFAULT_HADOOP_VERSION = '1.0.4'

   SBT = 'sbt/sbt'
@@ -11,20 +11,20 @@ module Spark
   SBT_EXT   = 'package'
   SBT_CLEAN = 'clean'

-  def self.build(options)
-
-
-
-
-
-    only_ext
+  def self.build(options={})
+    scala_version      = options[:scala_version]      || DEFAULT_SCALA_VERSION
+    spark_core_version = options[:spark_core_version] || DEFAULT_CORE_VERSION
+    spark_version      = options[:spark_version]      || DEFAULT_SPARK_VERSION
+    hadoop_version     = options[:hadoop_version]     || DEFAULT_HADOOP_VERSION
+    target             = options[:target]             || Spark.target_dir
+    only_ext           = options[:only_ext]           || false

     env = {
       'SCALA_VERSION' => scala_version,
       'SPARK_VERSION' => spark_version,
-      'SPARK_CORE_VERSION' =>
+      'SPARK_CORE_VERSION' => spark_core_version,
       'HADOOP_VERSION' => hadoop_version,
-      '
+      'TARGET_DIR' => target
     }

     cmd = [SBT]
data/lib/spark/cli.rb CHANGED

@@ -13,8 +13,8 @@ module Spark
   class CLI
     include Commander::Methods

-    IRB_HISTORY_FILE = File.join(Dir.home, '.irb_spark_history')
-    IRB_HISTORY_SIZE = 100
+    # IRB_HISTORY_FILE = File.join(Dir.home, '.irb_spark_history')
+    # IRB_HISTORY_SIZE = 100

     def run
       program :name, 'RubySpark'
@@ -29,22 +29,15 @@ module Spark
       command :build do |c|
         c.syntax = 'build [options]'
         c.description = 'Build spark and gem extensions'
-        c.option '--hadoop-version STRING', String, 'Version of hadoop which will
-        c.option '--spark-
-        c.option '--spark-
-        c.option '--spark-version STRING', String, 'Version of SPARK'
+        c.option '--hadoop-version STRING', String, 'Version of hadoop which will assembled with the Spark'
+        c.option '--spark-core-version STRING', String, 'Version of Spark core'
+        c.option '--spark-version STRING', String, 'Version of Spark'
         c.option '--scala-version STRING', String, 'Version of Scala'
-        c.option '--
+        c.option '--target STRING', String, 'Directory where Spark will be stored'
+        c.option '--only-ext', 'Build only extension for RubySpark'

         c.action do |args, options|
-
-          spark_home: Spark.target_dir,
-          spark_core: Spark::Build::DEFAULT_CORE_VERSION,
-          spark_version: Spark::Build::DEFAULT_SPARK_VERSION,
-          scala_version: Spark::Build::DEFAULT_SCALA_VERSION,
-          only_ext: false
-
-          Spark::Build.build(options)
+          Spark::Build.build(options.__hash__)
           puts
           puts 'Everything is OK'
         end
@@ -52,23 +45,23 @@ module Spark
       alias_command :install, :build


-      #
-      command :
-        c.syntax = '
+      # Shell -----------------------------------------------------------------
+      command :shell do |c|
+        c.syntax = 'shell [options]'
         c.description = 'Start ruby shell for spark'
-        c.option '--
+        c.option '--target STRING', String, 'Directory where Spark is stored'
         c.option '--properties-file STRING', String, 'Path to a file from which to load extra properties'
-        c.option '--[no-]start', 'Start
+        c.option '--[no-]start', 'Start Spark immediately'
         c.option '--[no-]logger', 'Enable/disable logger (default: enable)'

         c.action do |args, options|
           options.default start: true, logger: true

-          Spark.load_lib(options.
-          Spark
+          Spark.load_lib(options.target)
+          Spark.logger.disable unless options.logger

           Spark.config do
-            set_app_name '
+            set_app_name 'RubySpark'
           end

           Spark.config.from_file(options.properties_file)
@@ -88,61 +81,60 @@ module Spark
           Pry.start
         end
       end
-      alias_command :shell, :pry


-      # IRB -------------------------------------------------------------------
-      command :irb do |c|
-        c.syntax = 'irb [options]'
-        c.description = 'Start ruby shell for spark'
-        c.option '--spark-home STRING', String, 'Directory where Spark is stored'
-        c.option '--[no-]start', 'Start Spark immediately'
-        c.option '--[no-]logger', 'Enable/disable logger (default: enable)'
-
-        c.action do |args, options|
-          options.default start: true, logger: true
-
-          Spark.load_lib(options.spark_home)
-          Spark::Logger.disable unless options.logger
-
-          Spark.config do
-            set_app_name 'Pry RubySpark'
-          end
-
-          if options.start
-            # Load Java and Spark
-            Spark.start
-            $sc = Spark.context
-
-            Spark.print_logo('Spark context is loaded as $sc')
-          else
-            Spark.print_logo('You can start Spark with Spark.start')
-          end
-
-          # Load IRB
-          require 'irb'
-          require 'irb/completion'
-          require 'irb/ext/save-history'
-
-          begin
-            file = File.expand_path(IRB_HISTORY_FILE)
-            if File.exists?(file)
-              lines = IO.readlines(file).collect { |line| line.chomp }
-              Readline::HISTORY.push(*lines)
-            end
-            Kernel.at_exit do
-              lines = Readline::HISTORY.to_a.reverse.uniq.reverse
-              lines = lines[-IRB_HISTORY_SIZE, IRB_HISTORY_SIZE] if lines.nitems > IRB_HISTORY_SIZE
-              File.open(IRB_HISTORY_FILE, File::WRONLY | File::CREAT | File::TRUNC) { |io| io.puts lines.join("\n") }
-            end
-          rescue
-          end
-
-          ARGV.clear # Clear Thor ARGV, otherwise IRB will parse it
-          ARGV.concat ['--readline', '--prompt-mode', 'simple']
-          IRB.start
-        end
-      end
+      # # IRB -------------------------------------------------------------------
+      # command :irb do |c|
+      #   c.syntax = 'irb [options]'
+      #   c.description = 'Start ruby shell for spark'
+      #   c.option '--spark-home STRING', String, 'Directory where Spark is stored'
+      #   c.option '--[no-]start', 'Start Spark immediately'
+      #   c.option '--[no-]logger', 'Enable/disable logger (default: enable)'
+      #
+      #   c.action do |args, options|
+      #     options.default start: true, logger: true
+      #
+      #     Spark.load_lib(options.spark_home)
+      #     Spark::Logger.disable unless options.logger
+      #
+      #     Spark.config do
+      #       set_app_name 'Pry RubySpark'
+      #     end
+      #
+      #     if options.start
+      #       # Load Java and Spark
+      #       Spark.start
+      #       $sc = Spark.context
+      #
+      #       Spark.print_logo('Spark context is loaded as $sc')
+      #     else
+      #       Spark.print_logo('You can start Spark with Spark.start')
+      #     end
+      #
+      #     # Load IRB
+      #     require 'irb'
+      #     require 'irb/completion'
+      #     require 'irb/ext/save-history'
+      #
+      #     begin
+      #       file = File.expand_path(IRB_HISTORY_FILE)
+      #       if File.exists?(file)
+      #         lines = IO.readlines(file).collect { |line| line.chomp }
+      #         Readline::HISTORY.push(*lines)
+      #       end
+      #       Kernel.at_exit do
+      #         lines = Readline::HISTORY.to_a.reverse.uniq.reverse
+      #         lines = lines[-IRB_HISTORY_SIZE, IRB_HISTORY_SIZE] if lines.nitems > IRB_HISTORY_SIZE
+      #         File.open(IRB_HISTORY_FILE, File::WRONLY | File::CREAT | File::TRUNC) { |io| io.puts lines.join("\n") }
+      #       end
+      #     rescue
+      #     end
+      #
+      #     ARGV.clear # Clear Thor ARGV, otherwise IRB will parse it
+      #     ARGV.concat ['--readline', '--prompt-mode', 'simple']
+      #     IRB.start
+      #   end
+      # end


       # Home ------------------------------------------------------------------
data/lib/spark/config.rb CHANGED

@@ -16,6 +16,7 @@ module Spark
     def initialize
       @spark_conf = SparkConf.new(true)
       set_default
+      from_file(Spark::DEFAULT_CONFIG_FILE)
     end

     def from_file(file)
@@ -140,11 +141,11 @@ module Spark
       set('spark.ruby.serializer', default_serializer)
       set('spark.ruby.serializer.compress', default_serializer_compress)
       set('spark.ruby.serializer.batch_size', default_serializer_batch_size)
-      set('spark.ruby.executor.uri', default_executor_uri)
       set('spark.ruby.executor.command', default_executor_command)
       set('spark.ruby.executor.options', default_executor_options)
       set('spark.ruby.worker.type', default_worker_type)
       load_executor_envs
+      # set('spark.ruby.executor.install', default_executor_install)
     end

     def default_serializer
@@ -159,21 +160,6 @@ module Spark
       ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE
     end

-    # Ruby executor.
-    #
-    # == Options:
-    # nil::
-    #   System's gem is loaded (ruby-spark).
-    #
-    # other::
-    #   Path of library which will be used.
-    #   Current ruby-spark gem is used.
-    #   (default)
-    #
-    def default_executor_uri
-      ENV['SPARK_RUBY_EXECUTOR_URI'] || ''
-    end
-
     # Command template which is applied when scala want create a ruby
     # process (e.g. master, home request). Command is represented by '%s'.
     #
@@ -186,13 +172,23 @@ module Spark

     # Options for every worker.
     #
-    # ==
+    # == Example:
     #   -J-Xmx512m
     #
     def default_executor_options
       ENV['SPARK_RUBY_EXECUTOR_OPTIONS'] || ''
     end

+    # # Install command which is triggered before on start.
+    # # This command using executor command template.
+    # #
+    # # == Example:
+    # #   gem install ruby-spark -v 1.2.0
+    # #
+    # def default_executor_install
+    #   ENV['SPARK_RUBY_EXECUTOR_INSTALL'] || ''
+    # end
+
     # Type of worker.
     #
     # == Options: