ruby-spark 1.1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/.travis.yml +15 -0
  4. data/CHANGELOG.md +8 -0
  5. data/README.md +184 -57
  6. data/TODO.md +3 -1
  7. data/ext/spark/build.sbt +5 -5
  8. data/ext/spark/src/main/scala/RubyWorker.scala +7 -16
  9. data/lib/spark.rb +69 -10
  10. data/lib/spark/accumulator.rb +8 -0
  11. data/lib/spark/broadcast.rb +7 -0
  12. data/lib/spark/build.rb +10 -10
  13. data/lib/spark/cli.rb +68 -76
  14. data/lib/spark/config.rb +13 -17
  15. data/lib/spark/context.rb +10 -7
  16. data/lib/spark/error.rb +4 -0
  17. data/lib/spark/helper/statistic.rb +5 -1
  18. data/lib/spark/java_bridge.rb +5 -3
  19. data/lib/spark/java_bridge/base.rb +15 -15
  20. data/lib/spark/java_bridge/jruby.rb +3 -1
  21. data/lib/spark/java_bridge/rjb.rb +2 -0
  22. data/lib/spark/mllib/classification/logistic_regression.rb +10 -2
  23. data/lib/spark/mllib/classification/svm.rb +10 -2
  24. data/lib/spark/mllib/clustering/kmeans.rb +6 -2
  25. data/lib/spark/mllib/regression/lasso.rb +18 -2
  26. data/lib/spark/mllib/regression/linear.rb +11 -3
  27. data/lib/spark/mllib/regression/ridge.rb +18 -2
  28. data/lib/spark/rdd.rb +11 -2
  29. data/lib/spark/serializer.rb +1 -1
  30. data/lib/spark/serializer/auto_batched.rb +7 -0
  31. data/lib/spark/version.rb +1 -1
  32. data/ruby-spark.gemspec +4 -5
  33. data/spec/generator.rb +1 -1
  34. data/spec/lib/collect_spec.rb +10 -10
  35. data/spec/lib/config_spec.rb +10 -10
  36. data/spec/lib/context_spec.rb +116 -115
  37. data/spec/lib/ext_spec.rb +17 -17
  38. data/spec/lib/external_apps_spec.rb +1 -1
  39. data/spec/lib/filter_spec.rb +17 -17
  40. data/spec/lib/flat_map_spec.rb +22 -19
  41. data/spec/lib/group_spec.rb +22 -19
  42. data/spec/lib/helper_spec.rb +60 -12
  43. data/spec/lib/key_spec.rb +9 -8
  44. data/spec/lib/manipulation_spec.rb +15 -15
  45. data/spec/lib/map_partitions_spec.rb +6 -4
  46. data/spec/lib/map_spec.rb +22 -19
  47. data/spec/lib/reduce_by_key_spec.rb +19 -19
  48. data/spec/lib/reduce_spec.rb +22 -20
  49. data/spec/lib/sample_spec.rb +13 -12
  50. data/spec/lib/serializer_spec.rb +27 -0
  51. data/spec/lib/sort_spec.rb +16 -14
  52. data/spec/lib/statistic_spec.rb +4 -2
  53. data/spec/lib/whole_text_files_spec.rb +9 -8
  54. data/spec/spec_helper.rb +3 -3
  55. metadata +19 -18
data/lib/spark.rb

@@ -1,5 +1,6 @@
 # Gems and libraries
 require 'method_source'
+require 'securerandom'
 require 'forwardable'
 require 'sourcify'
 require 'socket'

@@ -29,6 +30,8 @@ module Spark

   include Helper::System

+  DEFAULT_CONFIG_FILE = File.join(Dir.home, '.ruby-spark.conf')
+
   def self.print_logo(message=nil)
     puts <<-STRING

@@ -107,6 +110,63 @@ module Spark
     !!@context
   end

+
+  # ===============================================================================
+  # Defaults
+
+  # Load default configuration for Spark and RubySpark
+  # By default are values stored at ~/.ruby-spark.conf
+  # File is automatically created
+  def self.load_defaults
+    unless File.exists?(DEFAULT_CONFIG_FILE)
+      save_defaults_to(DEFAULT_CONFIG_FILE)
+    end
+
+    load_defaults_from(DEFAULT_CONFIG_FILE)
+  end
+
+  # Clear prev setting and load new from file
+  def self.load_defaults_from(file_path)
+    # Parse values
+    values = File.readlines(file_path)
+    values.map!(&:strip)
+    values.select!{|value| value.start_with?('gem.')}
+    values.map!{|value| value.split(nil, 2)}
+    values = Hash[values]
+
+    # Clear prev values
+    @target_dir = nil
+    @ruby_spark_jar = nil
+    @spark_home = nil
+
+    # Load new
+    @target_dir = values['gem.target']
+  end
+
+  # Create target dir and new config file
+  def self.save_defaults_to(file_path)
+    dir = File.join(Dir.home, ".ruby-spark.#{SecureRandom.uuid}")
+
+    if Dir.exist?(dir)
+      save_defaults_to(file_path)
+    else
+      Dir.mkdir(dir, 0700)
+      file = File.open(file_path, 'w')
+      file.puts "# Directory where will be Spark saved"
+      file.puts "gem.target #{dir}"
+      file.puts ""
+      file.puts "# You can also defined spark properties"
+      file.puts "# spark.master spark://master:7077"
+      file.puts "# spark.ruby.serializer marshal"
+      file.puts "# spark.ruby.serializer.batch_size 2048"
+      file.close
+    end
+  end
+
+
+  # ===============================================================================
+  # Global settings and variables
+
   def self.logger
     @logger ||= Spark::Logger.new
   end
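The new defaults loader is simple line-oriented parsing: `load_defaults_from` keeps only the `gem.*` lines from `~/.ruby-spark.conf` and splits each one on the first run of whitespace. A standalone sketch of that parsing step, using made-up file contents and a Tempfile instead of the real config path:

```ruby
require 'tempfile'

# Illustrative contents in the same format save_defaults_to writes out.
sample = <<~CONF
  # Directory where will be Spark saved
  gem.target /home/user/.ruby-spark.example
  # spark.master spark://master:7077
CONF

file = Tempfile.new('ruby-spark-conf')
file.write(sample)
file.rewind

# Same pipeline as Spark.load_defaults_from: strip each line, keep 'gem.' keys,
# split into [key, value] pairs on the first whitespace, build a Hash.
values = file.readlines
values.map!(&:strip)
values.select! { |line| line.start_with?('gem.') }
values.map! { |line| line.split(nil, 2) }
values = Hash[values]

puts values['gem.target'] # prints /home/user/.ruby-spark.example
```

Comment lines and the commented-out `spark.*` examples are dropped by the `gem.` filter, so only `gem.target` ends up overriding `Spark.target_dir`.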
@@ -116,10 +176,6 @@ module Spark
     @root ||= File.expand_path('..', File.dirname(__FILE__))
   end

-  def self.home
-    root
-  end
-
   # Default directory for java extensions
   def self.target_dir
     @target_dir ||= File.join(root, 'target')

@@ -146,17 +202,16 @@ module Spark
   # Cannot load before CLI::install
   #
   # == Parameters:
-  # spark_home::
+  # target::
   #   path to directory where are located sparks .jar files or single Spark jar
   #
-  def self.load_lib(spark_home=nil)
+  def self.load_lib(target=nil)
     return if @java_bridge

-    spark_home ||= Spark.target_dir
+    target ||= Spark.target_dir

-    bridge = JavaBridge.get
-    @java_bridge = bridge.new(spark_home)
-    @java_bridge.load
+    @java_bridge = JavaBridge.init(target)
+    @java_bridge.import_all
     nil
   end

@@ -169,6 +224,7 @@ module Spark
   class << self
     alias_method :sc, :context
     alias_method :jb, :java_bridge
+    alias_method :home, :root
   end

 end

@@ -189,6 +245,9 @@ require 'spark/ext/io'
 require 'spark/version'
 require 'spark/error'

+# Load default settings for gem and Spark
+Spark.load_defaults
+
 # Make sure that Spark be always stopped
 Kernel.at_exit do
   begin
data/lib/spark/accumulator.rb

@@ -77,6 +77,14 @@ module Spark
       @@instances[@id] = self
     end

+    def inspect
+      result = %{#<#{self.class.name}:0x#{object_id}\n}
+      result << %{ ID: #{@id}\n}
+      result << %{ Zero: #{@zero_value.to_s[0, 10]}\n}
+      result << %{Value: #{@value.to_s[0, 10]}>}
+      result
+    end
+
     def self.changed
       @@changed
     end
data/lib/spark/broadcast.rb

@@ -61,6 +61,13 @@ module Spark
       ObjectSpace.define_finalizer(self, proc { File.unlink(@path) })
     end

+    def inspect
+      result = %{#<#{self.class.name}:0x#{object_id}\n}
+      result << %{ ID: #{@id}\n}
+      result << %{Value: #{@value.to_s[0, 10]}>}
+      result
+    end
+
     def self.register(id, path)
       @@registered[id] = path
     end
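Both new `inspect` methods truncate the value preview with `to_s[0, 10]`, so large accumulators and broadcasts do not dump their whole payload into the console. A quick standalone illustration of that truncation (the sample value is made up):

```ruby
value = { 'word' => 123_456_789 }

# Mirrors the `@value.to_s[0, 10]` preview used by the new #inspect methods.
preview = value.to_s[0, 10]
puts preview        # prints {"word"=>1
puts preview.length # prints 10
```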
data/lib/spark/build.rb

@@ -3,7 +3,7 @@ module Spark

     DEFAULT_SCALA_VERSION = '2.10.4'
     DEFAULT_CORE_VERSION = '2.10'
-    DEFAULT_SPARK_VERSION = '1.3.0'
+    DEFAULT_SPARK_VERSION = '1.4.0'
     DEFAULT_HADOOP_VERSION = '1.0.4'

     SBT = 'sbt/sbt'

@@ -11,20 +11,20 @@ module Spark
     SBT_EXT = 'package'
     SBT_CLEAN = 'clean'

-    def self.build(options)
-      spark_home = options.spark_home || Spark.target_dir
-      scala_version = options.scala_version || DEFAULT_SCALA_VERSION
-      spark_core = options.spark_core || DEFAULT_CORE_VERSION
-      spark_version = options.spark_version || DEFAULT_SPARK_VERSION
-      hadoop_version = options.hadoop_version || DEFAULT_HADOOP_VERSION
-      only_ext = options.only_ext
+    def self.build(options={})
+      scala_version = options[:scala_version] || DEFAULT_SCALA_VERSION
+      spark_core_version = options[:spark_core_version] || DEFAULT_CORE_VERSION
+      spark_version = options[:spark_version] || DEFAULT_SPARK_VERSION
+      hadoop_version = options[:hadoop_version] || DEFAULT_HADOOP_VERSION
+      target = options[:target] || Spark.target_dir
+      only_ext = options[:only_ext] || false

       env = {
         'SCALA_VERSION' => scala_version,
         'SPARK_VERSION' => spark_version,
-        'SPARK_CORE_VERSION' => spark_core,
+        'SPARK_CORE_VERSION' => spark_core_version,
         'HADOOP_VERSION' => hadoop_version,
-        'SPARK_HOME' => spark_home
+        'TARGET_DIR' => target
       }

       cmd = [SBT]
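With the options-hash signature, `Spark::Build.build` can be called directly; every omitted key falls back to a `DEFAULT_*` constant or `Spark.target_dir`. A hedged sketch of such a call (the target path is illustrative, `require 'ruby-spark'` is assumed to be the gem's entry point, and the call still shells out to the bundled sbt):

```ruby
require 'ruby-spark'

# Hypothetical invocation of the new options-hash API; omitted keys use the
# DEFAULT_* constants (Spark 1.4.0, Scala 2.10.4, Hadoop 1.0.4) or Spark.target_dir.
Spark::Build.build(
  spark_version: '1.4.0',
  target:        '/tmp/ruby-spark-target',
  only_ext:      false
)
```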
data/lib/spark/cli.rb

@@ -13,8 +13,8 @@ module Spark
   class CLI
     include Commander::Methods

-    IRB_HISTORY_FILE = File.join(Dir.home, '.irb_spark_history')
-    IRB_HISTORY_SIZE = 100
+    # IRB_HISTORY_FILE = File.join(Dir.home, '.irb_spark_history')
+    # IRB_HISTORY_SIZE = 100

     def run
       program :name, 'RubySpark'

@@ -29,22 +29,15 @@ module Spark
      command :build do |c|
        c.syntax = 'build [options]'
        c.description = 'Build spark and gem extensions'
-       c.option '--hadoop-version STRING', String, 'Version of hadoop which will stored with the SPARK'
-       c.option '--spark-home STRING', String, 'Directory where SPARK will be stored'
-       c.option '--spark-core STRING', String, 'Version of SPARK core'
-       c.option '--spark-version STRING', String, 'Version of SPARK'
+       c.option '--hadoop-version STRING', String, 'Version of hadoop which will assembled with the Spark'
+       c.option '--spark-core-version STRING', String, 'Version of Spark core'
+       c.option '--spark-version STRING', String, 'Version of Spark'
        c.option '--scala-version STRING', String, 'Version of Scala'
-       c.option '--only-ext', 'Start SPARK immediately'
+       c.option '--target STRING', String, 'Directory where Spark will be stored'
+       c.option '--only-ext', 'Build only extension for RubySpark'

        c.action do |args, options|
-         options.default hadoop_version: Spark::Build::DEFAULT_HADOOP_VERSION,
-                         spark_home: Spark.target_dir,
-                         spark_core: Spark::Build::DEFAULT_CORE_VERSION,
-                         spark_version: Spark::Build::DEFAULT_SPARK_VERSION,
-                         scala_version: Spark::Build::DEFAULT_SCALA_VERSION,
-                         only_ext: false
-
-         Spark::Build.build(options)
+         Spark::Build.build(options.__hash__)
          puts
          puts 'Everything is OK'
        end
@@ -52,23 +45,23 @@ module Spark
      alias_command :install, :build


-      # Pry -------------------------------------------------------------------
-      command :pry do |c|
-        c.syntax = 'pry [options]'
+      # Shell -----------------------------------------------------------------
+      command :shell do |c|
+        c.syntax = 'shell [options]'
         c.description = 'Start ruby shell for spark'
-        c.option '--spark-home STRING', String, 'Directory where SPARK is stored'
+        c.option '--target STRING', String, 'Directory where Spark is stored'
         c.option '--properties-file STRING', String, 'Path to a file from which to load extra properties'
-        c.option '--[no-]start', 'Start SPARK immediately'
+        c.option '--[no-]start', 'Start Spark immediately'
         c.option '--[no-]logger', 'Enable/disable logger (default: enable)'

         c.action do |args, options|
           options.default start: true, logger: true

-          Spark.load_lib(options.spark_home)
-          Spark::Logger.disable unless options.logger
+          Spark.load_lib(options.target)
+          Spark.logger.disable unless options.logger

           Spark.config do
-            set_app_name 'Pry RubySpark'
+            set_app_name 'RubySpark'
           end

           Spark.config.from_file(options.properties_file)

@@ -88,61 +81,60 @@ module Spark
           Pry.start
         end
       end
-      alias_command :shell, :pry


-      # IRB -------------------------------------------------------------------
-      command :irb do |c|
-        c.syntax = 'irb [options]'
-        c.description = 'Start ruby shell for spark'
-        c.option '--spark-home STRING', String, 'Directory where SPARK is stored'
-        c.option '--[no-]start', 'Start SPARK immediately'
-        c.option '--[no-]logger', 'Enable/disable logger (default: enable)'
-
-        c.action do |args, options|
-          options.default start: true, logger: true
-
-          Spark.load_lib(options.spark_home)
-          Spark::Logger.disable unless options.logger
-
-          Spark.config do
-            set_app_name 'Pry RubySpark'
-          end
-
-          if options.start
-            # Load Java and Spark
-            Spark.start
-            $sc = Spark.context
-
-            Spark.print_logo('Spark context is loaded as $sc')
-          else
-            Spark.print_logo('You can start Spark with Spark.start')
-          end
-
-          # Load IRB
-          require 'irb'
-          require 'irb/completion'
-          require 'irb/ext/save-history'
-
-          begin
-            file = File.expand_path(IRB_HISTORY_FILE)
-            if File.exists?(file)
-              lines = IO.readlines(file).collect { |line| line.chomp }
-              Readline::HISTORY.push(*lines)
-            end
-            Kernel.at_exit do
-              lines = Readline::HISTORY.to_a.reverse.uniq.reverse
-              lines = lines[-IRB_HISTORY_SIZE, IRB_HISTORY_SIZE] if lines.nitems > IRB_HISTORY_SIZE
-              File.open(IRB_HISTORY_FILE, File::WRONLY | File::CREAT | File::TRUNC) { |io| io.puts lines.join("\n") }
-            end
-          rescue
-          end
-
-          ARGV.clear # Clear Thor ARGV, otherwise IRB will parse it
-          ARGV.concat ['--readline', '--prompt-mode', 'simple']
-          IRB.start
-        end
-      end
+      # # IRB -------------------------------------------------------------------
+      # command :irb do |c|
+      #   c.syntax = 'irb [options]'
+      #   c.description = 'Start ruby shell for spark'
+      #   c.option '--spark-home STRING', String, 'Directory where Spark is stored'
+      #   c.option '--[no-]start', 'Start Spark immediately'
+      #   c.option '--[no-]logger', 'Enable/disable logger (default: enable)'
+      #
+      #   c.action do |args, options|
+      #     options.default start: true, logger: true
+      #
+      #     Spark.load_lib(options.spark_home)
+      #     Spark::Logger.disable unless options.logger
+      #
+      #     Spark.config do
+      #       set_app_name 'Pry RubySpark'
+      #     end
+      #
+      #     if options.start
+      #       # Load Java and Spark
+      #       Spark.start
+      #       $sc = Spark.context
+      #
+      #       Spark.print_logo('Spark context is loaded as $sc')
+      #     else
+      #       Spark.print_logo('You can start Spark with Spark.start')
+      #     end
+      #
+      #     # Load IRB
+      #     require 'irb'
+      #     require 'irb/completion'
+      #     require 'irb/ext/save-history'
+      #
+      #     begin
+      #       file = File.expand_path(IRB_HISTORY_FILE)
+      #       if File.exists?(file)
+      #         lines = IO.readlines(file).collect { |line| line.chomp }
+      #         Readline::HISTORY.push(*lines)
+      #       end
+      #       Kernel.at_exit do
+      #         lines = Readline::HISTORY.to_a.reverse.uniq.reverse
+      #         lines = lines[-IRB_HISTORY_SIZE, IRB_HISTORY_SIZE] if lines.nitems > IRB_HISTORY_SIZE
+      #         File.open(IRB_HISTORY_FILE, File::WRONLY | File::CREAT | File::TRUNC) { |io| io.puts lines.join("\n") }
+      #       end
+      #     rescue
+      #     end
+      #
+      #     ARGV.clear # Clear Thor ARGV, otherwise IRB will parse it
+      #     ARGV.concat ['--readline', '--prompt-mode', 'simple']
+      #     IRB.start
+      #   end
+      # end


      # Home ------------------------------------------------------------------
data/lib/spark/config.rb

@@ -16,6 +16,7 @@ module Spark
    def initialize
      @spark_conf = SparkConf.new(true)
      set_default
+     from_file(Spark::DEFAULT_CONFIG_FILE)
    end

    def from_file(file)

@@ -140,11 +141,11 @@ module Spark
      set('spark.ruby.serializer', default_serializer)
      set('spark.ruby.serializer.compress', default_serializer_compress)
      set('spark.ruby.serializer.batch_size', default_serializer_batch_size)
-     set('spark.ruby.executor.uri', default_executor_uri)
      set('spark.ruby.executor.command', default_executor_command)
      set('spark.ruby.executor.options', default_executor_options)
      set('spark.ruby.worker.type', default_worker_type)
      load_executor_envs
+     # set('spark.ruby.executor.install', default_executor_install)
    end

    def default_serializer

@@ -159,21 +160,6 @@ module Spark
      ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE
    end

-    # Ruby executor.
-    #
-    # == Options:
-    # nil::
-    #   System's gem is loaded (ruby-spark).
-    #
-    # other::
-    #   Path of library which will be used.
-    #   Current ruby-spark gem is used.
-    #   (default)
-    #
-    def default_executor_uri
-      ENV['SPARK_RUBY_EXECUTOR_URI'] || ''
-    end
-
    # Command template which is applied when scala want create a ruby
    # process (e.g. master, home request). Command is represented by '%s'.
    #

@@ -186,13 +172,23 @@ module Spark

    # Options for every worker.
    #
-    # == Examples:
+    # == Example:
    #   -J-Xmx512m
    #
    def default_executor_options
      ENV['SPARK_RUBY_EXECUTOR_OPTIONS'] || ''
    end

+    # # Install command which is triggered before on start.
+    # # This command using executor command template.
+    # #
+    # # == Example:
+    # #   gem install ruby-spark -v 1.2.0
+    # #
+    # def default_executor_install
+    #   ENV['SPARK_RUBY_EXECUTOR_INSTALL'] || ''
+    # end
+
    # Type of worker.
    #
    # == Options:
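Since `Config#initialize` now also reads `Spark::DEFAULT_CONFIG_FILE`, the `spark.*` keys that `save_defaults_to` writes out as comments can simply be uncommented in `~/.ruby-spark.conf`, while the remaining worker knobs stay environment-driven. A hedged end-to-end sketch (values are examples only, and `require 'ruby-spark'` is assumed to be the gem's entry point):

```ruby
# Environment-backed defaults shown in the diff (read by Spark::Config).
ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] = '2048'
ENV['SPARK_RUBY_EXECUTOR_OPTIONS']      = '-J-Xmx512m'

require 'ruby-spark' # Spark.load_defaults runs here and reads ~/.ruby-spark.conf

Spark.config do
  set_app_name 'RubySpark' # same call the CLI's shell command now makes
end
```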