ruby-spark 1.1.0.1 → 1.2.0

Files changed (55)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/.travis.yml +15 -0
  4. data/CHANGELOG.md +8 -0
  5. data/README.md +184 -57
  6. data/TODO.md +3 -1
  7. data/ext/spark/build.sbt +5 -5
  8. data/ext/spark/src/main/scala/RubyWorker.scala +7 -16
  9. data/lib/spark.rb +69 -10
  10. data/lib/spark/accumulator.rb +8 -0
  11. data/lib/spark/broadcast.rb +7 -0
  12. data/lib/spark/build.rb +10 -10
  13. data/lib/spark/cli.rb +68 -76
  14. data/lib/spark/config.rb +13 -17
  15. data/lib/spark/context.rb +10 -7
  16. data/lib/spark/error.rb +4 -0
  17. data/lib/spark/helper/statistic.rb +5 -1
  18. data/lib/spark/java_bridge.rb +5 -3
  19. data/lib/spark/java_bridge/base.rb +15 -15
  20. data/lib/spark/java_bridge/jruby.rb +3 -1
  21. data/lib/spark/java_bridge/rjb.rb +2 -0
  22. data/lib/spark/mllib/classification/logistic_regression.rb +10 -2
  23. data/lib/spark/mllib/classification/svm.rb +10 -2
  24. data/lib/spark/mllib/clustering/kmeans.rb +6 -2
  25. data/lib/spark/mllib/regression/lasso.rb +18 -2
  26. data/lib/spark/mllib/regression/linear.rb +11 -3
  27. data/lib/spark/mllib/regression/ridge.rb +18 -2
  28. data/lib/spark/rdd.rb +11 -2
  29. data/lib/spark/serializer.rb +1 -1
  30. data/lib/spark/serializer/auto_batched.rb +7 -0
  31. data/lib/spark/version.rb +1 -1
  32. data/ruby-spark.gemspec +4 -5
  33. data/spec/generator.rb +1 -1
  34. data/spec/lib/collect_spec.rb +10 -10
  35. data/spec/lib/config_spec.rb +10 -10
  36. data/spec/lib/context_spec.rb +116 -115
  37. data/spec/lib/ext_spec.rb +17 -17
  38. data/spec/lib/external_apps_spec.rb +1 -1
  39. data/spec/lib/filter_spec.rb +17 -17
  40. data/spec/lib/flat_map_spec.rb +22 -19
  41. data/spec/lib/group_spec.rb +22 -19
  42. data/spec/lib/helper_spec.rb +60 -12
  43. data/spec/lib/key_spec.rb +9 -8
  44. data/spec/lib/manipulation_spec.rb +15 -15
  45. data/spec/lib/map_partitions_spec.rb +6 -4
  46. data/spec/lib/map_spec.rb +22 -19
  47. data/spec/lib/reduce_by_key_spec.rb +19 -19
  48. data/spec/lib/reduce_spec.rb +22 -20
  49. data/spec/lib/sample_spec.rb +13 -12
  50. data/spec/lib/serializer_spec.rb +27 -0
  51. data/spec/lib/sort_spec.rb +16 -14
  52. data/spec/lib/statistic_spec.rb +4 -2
  53. data/spec/lib/whole_text_files_spec.rb +9 -8
  54. data/spec/spec_helper.rb +3 -3
  55. metadata +19 -18

data/lib/spark.rb

@@ -1,5 +1,6 @@
 # Gems and libraries
 require 'method_source'
+require 'securerandom'
 require 'forwardable'
 require 'sourcify'
 require 'socket'
@@ -29,6 +30,8 @@ module Spark
 
   include Helper::System
 
+  DEFAULT_CONFIG_FILE = File.join(Dir.home, '.ruby-spark.conf')
+
   def self.print_logo(message=nil)
     puts <<-STRING
 
@@ -107,6 +110,63 @@ module Spark
     !!@context
   end
 
+
+  # ===============================================================================
+  # Defaults
+
+  # Load default configuration for Spark and RubySpark
+  # By default are values stored at ~/.ruby-spark.conf
+  # File is automatically created
+  def self.load_defaults
+    unless File.exists?(DEFAULT_CONFIG_FILE)
+      save_defaults_to(DEFAULT_CONFIG_FILE)
+    end
+
+    load_defaults_from(DEFAULT_CONFIG_FILE)
+  end
+
+  # Clear prev setting and load new from file
+  def self.load_defaults_from(file_path)
+    # Parse values
+    values = File.readlines(file_path)
+    values.map!(&:strip)
+    values.select!{|value| value.start_with?('gem.')}
+    values.map!{|value| value.split(nil, 2)}
+    values = Hash[values]
+
+    # Clear prev values
+    @target_dir = nil
+    @ruby_spark_jar = nil
+    @spark_home = nil
+
+    # Load new
+    @target_dir = values['gem.target']
+  end
+
+  # Create target dir and new config file
+  def self.save_defaults_to(file_path)
+    dir = File.join(Dir.home, ".ruby-spark.#{SecureRandom.uuid}")
+
+    if Dir.exist?(dir)
+      save_defaults_to(file_path)
+    else
+      Dir.mkdir(dir, 0700)
+      file = File.open(file_path, 'w')
+      file.puts "# Directory where will be Spark saved"
+      file.puts "gem.target #{dir}"
+      file.puts ""
+      file.puts "# You can also defined spark properties"
+      file.puts "# spark.master spark://master:7077"
+      file.puts "# spark.ruby.serializer marshal"
+      file.puts "# spark.ruby.serializer.batch_size 2048"
+      file.close
+    end
+  end
+
+
+  # ===============================================================================
+  # Global settings and variables
+
   def self.logger
     @logger ||= Spark::Logger.new
   end
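
For orientation, the first call to Spark.load_defaults writes a ~/.ruby-spark.conf like the one below (content taken from save_defaults_to above; the home directory and UUID suffix are illustrative placeholders). load_defaults_from then reads back only the lines starting with 'gem.' and stores gem.target as the new Spark.target_dir:

# Directory where will be Spark saved
gem.target /home/user/.ruby-spark.0f6cbe42-...

# You can also defined spark properties
# spark.master spark://master:7077
# spark.ruby.serializer marshal
# spark.ruby.serializer.batch_size 2048
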
@@ -116,10 +176,6 @@ module Spark
     @root ||= File.expand_path('..', File.dirname(__FILE__))
   end
 
-  def self.home
-    root
-  end
-
   # Default directory for java extensions
   def self.target_dir
     @target_dir ||= File.join(root, 'target')
@@ -146,17 +202,16 @@ module Spark
   # Cannot load before CLI::install
   #
   # == Parameters:
-  # spark_home::
+  # target::
   #   path to directory where are located sparks .jar files or single Spark jar
   #
-  def self.load_lib(spark_home=nil)
+  def self.load_lib(target=nil)
     return if @java_bridge
 
-    spark_home ||= Spark.target_dir
+    target ||= Spark.target_dir
 
-    bridge = JavaBridge.get
-    @java_bridge = bridge.new(spark_home)
-    @java_bridge.load
+    @java_bridge = JavaBridge.init(target)
+    @java_bridge.import_all
     nil
   end
 
@@ -169,6 +224,7 @@ module Spark
   class << self
     alias_method :sc, :context
     alias_method :jb, :java_bridge
+    alias_method :home, :root
   end
 
 end
@@ -189,6 +245,9 @@ require 'spark/ext/io'
 require 'spark/version'
 require 'spark/error'
 
+# Load default settings for gem and Spark
+Spark.load_defaults
+
 # Make sure that Spark be always stopped
 Kernel.at_exit do
   begin

data/lib/spark/accumulator.rb

@@ -77,6 +77,14 @@ module Spark
       @@instances[@id] = self
     end
 
+    def inspect
+      result = %{#<#{self.class.name}:0x#{object_id}\n}
+      result << %{ ID: #{@id}\n}
+      result << %{ Zero: #{@zero_value.to_s[0, 10]}\n}
+      result << %{Value: #{@value.to_s[0, 10]}>}
+      result
+    end
+
     def self.changed
       @@changed
     end
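
For a sense of the new output format only: assuming the class is Spark::Accumulator and an instance has id 0, zero value 0 and current value 42 (how accumulators are created is outside this diff), the added #inspect renders roughly the text below. The object id after 0x is an illustrative placeholder, and long zero/current values are truncated to their first 10 characters by to_s[0, 10]:

#<Spark::Accumulator:0x70235387969240
 ID: 0
 Zero: 0
Value: 42>
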

data/lib/spark/broadcast.rb

@@ -61,6 +61,13 @@ module Spark
       ObjectSpace.define_finalizer(self, proc { File.unlink(@path) })
     end
 
+    def inspect
+      result = %{#<#{self.class.name}:0x#{object_id}\n}
+      result << %{ ID: #{@id}\n}
+      result << %{Value: #{@value.to_s[0, 10]}>}
+      result
+    end
+
     def self.register(id, path)
       @@registered[id] = path
     end

data/lib/spark/build.rb

@@ -3,7 +3,7 @@ module Spark
 
     DEFAULT_SCALA_VERSION = '2.10.4'
     DEFAULT_CORE_VERSION = '2.10'
-    DEFAULT_SPARK_VERSION = '1.3.0'
+    DEFAULT_SPARK_VERSION = '1.4.0'
     DEFAULT_HADOOP_VERSION = '1.0.4'
 
     SBT = 'sbt/sbt'
@@ -11,20 +11,20 @@ module Spark
     SBT_EXT = 'package'
     SBT_CLEAN = 'clean'
 
-    def self.build(options)
-      spark_home = options.spark_home || Spark.target_dir
-      scala_version = options.scala_version || DEFAULT_SCALA_VERSION
-      spark_core = options.spark_core || DEFAULT_CORE_VERSION
-      spark_version = options.spark_version || DEFAULT_SPARK_VERSION
-      hadoop_version = options.hadoop_version || DEFAULT_HADOOP_VERSION
-      only_ext = options.only_ext
+    def self.build(options={})
+      scala_version = options[:scala_version] || DEFAULT_SCALA_VERSION
+      spark_core_version = options[:spark_core_version] || DEFAULT_CORE_VERSION
+      spark_version = options[:spark_version] || DEFAULT_SPARK_VERSION
+      hadoop_version = options[:hadoop_version] || DEFAULT_HADOOP_VERSION
+      target = options[:target] || Spark.target_dir
+      only_ext = options[:only_ext] || false
 
       env = {
         'SCALA_VERSION' => scala_version,
         'SPARK_VERSION' => spark_version,
-        'SPARK_CORE_VERSION' => spark_core,
+        'SPARK_CORE_VERSION' => spark_core_version,
         'HADOOP_VERSION' => hadoop_version,
-        'SPARK_HOME' => spark_home
+        'TARGET_DIR' => target
       }
 
       cmd = [SBT]
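
Since Spark::Build.build now takes an options hash with symbol keys (the CLI simply forwards Commander's options.__hash__, as shown below in cli.rb), a programmatic build call would look roughly like this sketch. The values shown are the defaults defined in this file; the target path is an illustrative placeholder, and require 'ruby-spark' is assumed to be the gem's entry point:

require 'ruby-spark'   # assumed entry point of the gem

# Build Spark 1.4.0 plus the gem's Java extension into a custom target dir.
Spark::Build.build(
  spark_version:      '1.4.0',    # DEFAULT_SPARK_VERSION
  spark_core_version: '2.10',     # DEFAULT_CORE_VERSION
  scala_version:      '2.10.4',   # DEFAULT_SCALA_VERSION
  hadoop_version:     '1.0.4',    # DEFAULT_HADOOP_VERSION
  target:             '/tmp/ruby-spark-target',  # illustrative; defaults to Spark.target_dir
  only_ext:           false       # true would build only the RubySpark extension
)
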

data/lib/spark/cli.rb

@@ -13,8 +13,8 @@ module Spark
   class CLI
     include Commander::Methods
 
-    IRB_HISTORY_FILE = File.join(Dir.home, '.irb_spark_history')
-    IRB_HISTORY_SIZE = 100
+    # IRB_HISTORY_FILE = File.join(Dir.home, '.irb_spark_history')
+    # IRB_HISTORY_SIZE = 100
 
     def run
       program :name, 'RubySpark'
@@ -29,22 +29,15 @@ module Spark
       command :build do |c|
         c.syntax = 'build [options]'
         c.description = 'Build spark and gem extensions'
-        c.option '--hadoop-version STRING', String, 'Version of hadoop which will stored with the SPARK'
-        c.option '--spark-home STRING', String, 'Directory where SPARK will be stored'
-        c.option '--spark-core STRING', String, 'Version of SPARK core'
-        c.option '--spark-version STRING', String, 'Version of SPARK'
+        c.option '--hadoop-version STRING', String, 'Version of hadoop which will assembled with the Spark'
+        c.option '--spark-core-version STRING', String, 'Version of Spark core'
+        c.option '--spark-version STRING', String, 'Version of Spark'
         c.option '--scala-version STRING', String, 'Version of Scala'
-        c.option '--only-ext', 'Start SPARK immediately'
+        c.option '--target STRING', String, 'Directory where Spark will be stored'
+        c.option '--only-ext', 'Build only extension for RubySpark'
 
         c.action do |args, options|
-          options.default hadoop_version: Spark::Build::DEFAULT_HADOOP_VERSION,
-                          spark_home: Spark.target_dir,
-                          spark_core: Spark::Build::DEFAULT_CORE_VERSION,
-                          spark_version: Spark::Build::DEFAULT_SPARK_VERSION,
-                          scala_version: Spark::Build::DEFAULT_SCALA_VERSION,
-                          only_ext: false
-
-          Spark::Build.build(options)
+          Spark::Build.build(options.__hash__)
           puts
           puts 'Everything is OK'
         end
@@ -52,23 +45,23 @@ module Spark
       alias_command :install, :build
 
 
-      # Pry -------------------------------------------------------------------
-      command :pry do |c|
-        c.syntax = 'pry [options]'
+      # Shell -----------------------------------------------------------------
+      command :shell do |c|
+        c.syntax = 'shell [options]'
         c.description = 'Start ruby shell for spark'
-        c.option '--spark-home STRING', String, 'Directory where SPARK is stored'
+        c.option '--target STRING', String, 'Directory where Spark is stored'
         c.option '--properties-file STRING', String, 'Path to a file from which to load extra properties'
-        c.option '--[no-]start', 'Start SPARK immediately'
+        c.option '--[no-]start', 'Start Spark immediately'
         c.option '--[no-]logger', 'Enable/disable logger (default: enable)'
 
         c.action do |args, options|
           options.default start: true, logger: true
 
-          Spark.load_lib(options.spark_home)
-          Spark::Logger.disable unless options.logger
+          Spark.load_lib(options.target)
+          Spark.logger.disable unless options.logger
 
           Spark.config do
-            set_app_name 'Pry RubySpark'
+            set_app_name 'RubySpark'
           end
 
           Spark.config.from_file(options.properties_file)
@@ -88,61 +81,60 @@ module Spark
           Pry.start
         end
       end
-      alias_command :shell, :pry
 
 
-      # IRB -------------------------------------------------------------------
-      command :irb do |c|
-        c.syntax = 'irb [options]'
-        c.description = 'Start ruby shell for spark'
-        c.option '--spark-home STRING', String, 'Directory where SPARK is stored'
-        c.option '--[no-]start', 'Start SPARK immediately'
-        c.option '--[no-]logger', 'Enable/disable logger (default: enable)'
-
-        c.action do |args, options|
-          options.default start: true, logger: true
-
-          Spark.load_lib(options.spark_home)
-          Spark::Logger.disable unless options.logger
-
-          Spark.config do
-            set_app_name 'Pry RubySpark'
-          end
-
-          if options.start
-            # Load Java and Spark
-            Spark.start
-            $sc = Spark.context
-
-            Spark.print_logo('Spark context is loaded as $sc')
-          else
-            Spark.print_logo('You can start Spark with Spark.start')
-          end
-
-          # Load IRB
-          require 'irb'
-          require 'irb/completion'
-          require 'irb/ext/save-history'
-
-          begin
-            file = File.expand_path(IRB_HISTORY_FILE)
-            if File.exists?(file)
-              lines = IO.readlines(file).collect { |line| line.chomp }
-              Readline::HISTORY.push(*lines)
-            end
-            Kernel.at_exit do
-              lines = Readline::HISTORY.to_a.reverse.uniq.reverse
-              lines = lines[-IRB_HISTORY_SIZE, IRB_HISTORY_SIZE] if lines.nitems > IRB_HISTORY_SIZE
-              File.open(IRB_HISTORY_FILE, File::WRONLY | File::CREAT | File::TRUNC) { |io| io.puts lines.join("\n") }
-            end
-          rescue
-          end
-
-          ARGV.clear # Clear Thor ARGV, otherwise IRB will parse it
-          ARGV.concat ['--readline', '--prompt-mode', 'simple']
-          IRB.start
-        end
-      end
+      # # IRB -------------------------------------------------------------------
+      # command :irb do |c|
+      #   c.syntax = 'irb [options]'
+      #   c.description = 'Start ruby shell for spark'
+      #   c.option '--spark-home STRING', String, 'Directory where Spark is stored'
+      #   c.option '--[no-]start', 'Start Spark immediately'
+      #   c.option '--[no-]logger', 'Enable/disable logger (default: enable)'
+      #
+      #   c.action do |args, options|
+      #     options.default start: true, logger: true
+      #
+      #     Spark.load_lib(options.spark_home)
+      #     Spark::Logger.disable unless options.logger
+      #
+      #     Spark.config do
+      #       set_app_name 'Pry RubySpark'
+      #     end
+      #
+      #     if options.start
+      #       # Load Java and Spark
+      #       Spark.start
+      #       $sc = Spark.context
+      #
+      #       Spark.print_logo('Spark context is loaded as $sc')
+      #     else
+      #       Spark.print_logo('You can start Spark with Spark.start')
+      #     end
+      #
+      #     # Load IRB
+      #     require 'irb'
+      #     require 'irb/completion'
+      #     require 'irb/ext/save-history'
+      #
+      #     begin
+      #       file = File.expand_path(IRB_HISTORY_FILE)
+      #       if File.exists?(file)
+      #         lines = IO.readlines(file).collect { |line| line.chomp }
+      #         Readline::HISTORY.push(*lines)
+      #       end
+      #       Kernel.at_exit do
+      #         lines = Readline::HISTORY.to_a.reverse.uniq.reverse
+      #         lines = lines[-IRB_HISTORY_SIZE, IRB_HISTORY_SIZE] if lines.nitems > IRB_HISTORY_SIZE
+      #         File.open(IRB_HISTORY_FILE, File::WRONLY | File::CREAT | File::TRUNC) { |io| io.puts lines.join("\n") }
+      #       end
+      #     rescue
+      #     end
+      #
+      #     ARGV.clear # Clear Thor ARGV, otherwise IRB will parse it
+      #     ARGV.concat ['--readline', '--prompt-mode', 'simple']
+      #     IRB.start
+      #   end
+      # end
 
 
       # Home ------------------------------------------------------------------

data/lib/spark/config.rb

@@ -16,6 +16,7 @@ module Spark
     def initialize
       @spark_conf = SparkConf.new(true)
       set_default
+      from_file(Spark::DEFAULT_CONFIG_FILE)
     end
 
     def from_file(file)
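
Because Config#initialize now calls from_file(Spark::DEFAULT_CONFIG_FILE) right after set_default, any uncommented spark.* lines in ~/.ruby-spark.conf become the baseline configuration, and values set afterwards override them. A minimal sketch of how that plays out (method names taken from this diff; require 'ruby-spark' is assumed to be the gem's entry point):

require 'ruby-spark'   # assumed entry point; loading it runs Spark.load_defaults

# Suppose ~/.ruby-spark.conf contains (uncommented):
#   spark.master                     spark://master:7077
#   spark.ruby.serializer            marshal
#   spark.ruby.serializer.batch_size 2048

Spark.config do
  set_app_name 'ConfigExample'   # set on top of the file-provided defaults
end

Spark.start
$sc = Spark.context
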
@@ -140,11 +141,11 @@ module Spark
       set('spark.ruby.serializer', default_serializer)
       set('spark.ruby.serializer.compress', default_serializer_compress)
       set('spark.ruby.serializer.batch_size', default_serializer_batch_size)
-      set('spark.ruby.executor.uri', default_executor_uri)
       set('spark.ruby.executor.command', default_executor_command)
       set('spark.ruby.executor.options', default_executor_options)
       set('spark.ruby.worker.type', default_worker_type)
       load_executor_envs
+      # set('spark.ruby.executor.install', default_executor_install)
     end
 
     def default_serializer
@@ -159,21 +160,6 @@ module Spark
       ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE
     end
 
-    # Ruby executor.
-    #
-    # == Options:
-    # nil::
-    #   System's gem is loaded (ruby-spark).
-    #
-    # other::
-    #   Path of library which will be used.
-    #   Current ruby-spark gem is used.
-    #   (default)
-    #
-    def default_executor_uri
-      ENV['SPARK_RUBY_EXECUTOR_URI'] || ''
-    end
-
     # Command template which is applied when scala want create a ruby
     # process (e.g. master, home request). Command is represented by '%s'.
     #
@@ -186,13 +172,23 @@ module Spark
 
     # Options for every worker.
     #
-    # == Examples:
+    # == Example:
     #   -J-Xmx512m
     #
     def default_executor_options
       ENV['SPARK_RUBY_EXECUTOR_OPTIONS'] || ''
     end
 
+    # # Install command which is triggered before on start.
+    # # This command using executor command template.
+    # #
+    # # == Example:
+    # #   gem install ruby-spark -v 1.2.0
+    # #
+    # def default_executor_install
+    #   ENV['SPARK_RUBY_EXECUTOR_INSTALL'] || ''
+    # end
+
     # Type of worker.
     #
     # == Options: