ruby-spark 1.1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/.travis.yml +15 -0
  4. data/CHANGELOG.md +8 -0
  5. data/README.md +184 -57
  6. data/TODO.md +3 -1
  7. data/ext/spark/build.sbt +5 -5
  8. data/ext/spark/src/main/scala/RubyWorker.scala +7 -16
  9. data/lib/spark.rb +69 -10
  10. data/lib/spark/accumulator.rb +8 -0
  11. data/lib/spark/broadcast.rb +7 -0
  12. data/lib/spark/build.rb +10 -10
  13. data/lib/spark/cli.rb +68 -76
  14. data/lib/spark/config.rb +13 -17
  15. data/lib/spark/context.rb +10 -7
  16. data/lib/spark/error.rb +4 -0
  17. data/lib/spark/helper/statistic.rb +5 -1
  18. data/lib/spark/java_bridge.rb +5 -3
  19. data/lib/spark/java_bridge/base.rb +15 -15
  20. data/lib/spark/java_bridge/jruby.rb +3 -1
  21. data/lib/spark/java_bridge/rjb.rb +2 -0
  22. data/lib/spark/mllib/classification/logistic_regression.rb +10 -2
  23. data/lib/spark/mllib/classification/svm.rb +10 -2
  24. data/lib/spark/mllib/clustering/kmeans.rb +6 -2
  25. data/lib/spark/mllib/regression/lasso.rb +18 -2
  26. data/lib/spark/mllib/regression/linear.rb +11 -3
  27. data/lib/spark/mllib/regression/ridge.rb +18 -2
  28. data/lib/spark/rdd.rb +11 -2
  29. data/lib/spark/serializer.rb +1 -1
  30. data/lib/spark/serializer/auto_batched.rb +7 -0
  31. data/lib/spark/version.rb +1 -1
  32. data/ruby-spark.gemspec +4 -5
  33. data/spec/generator.rb +1 -1
  34. data/spec/lib/collect_spec.rb +10 -10
  35. data/spec/lib/config_spec.rb +10 -10
  36. data/spec/lib/context_spec.rb +116 -115
  37. data/spec/lib/ext_spec.rb +17 -17
  38. data/spec/lib/external_apps_spec.rb +1 -1
  39. data/spec/lib/filter_spec.rb +17 -17
  40. data/spec/lib/flat_map_spec.rb +22 -19
  41. data/spec/lib/group_spec.rb +22 -19
  42. data/spec/lib/helper_spec.rb +60 -12
  43. data/spec/lib/key_spec.rb +9 -8
  44. data/spec/lib/manipulation_spec.rb +15 -15
  45. data/spec/lib/map_partitions_spec.rb +6 -4
  46. data/spec/lib/map_spec.rb +22 -19
  47. data/spec/lib/reduce_by_key_spec.rb +19 -19
  48. data/spec/lib/reduce_spec.rb +22 -20
  49. data/spec/lib/sample_spec.rb +13 -12
  50. data/spec/lib/serializer_spec.rb +27 -0
  51. data/spec/lib/sort_spec.rb +16 -14
  52. data/spec/lib/statistic_spec.rb +4 -2
  53. data/spec/lib/whole_text_files_spec.rb +9 -8
  54. data/spec/spec_helper.rb +3 -3
  55. metadata +19 -18

data/lib/spark/context.rb

@@ -38,6 +38,12 @@ module Spark
       set_call_site('Ruby') # description of stage
     end
 
+    def inspect
+      result = %{#<#{self.class.name}:0x#{object_id}\n}
+      result << %{Tempdir: "#{temp_dir}">}
+      result
+    end
+
     def stop
       Spark::Accumulator::Server.stop
       log_info('Ruby accumulator server was stopped')
@@ -108,14 +114,11 @@ module Spark
     # Support function for API backtraces.
     #
     def set_call_site(site)
-      set_local_property('externalCallSite', site)
+      jcontext.setCallSite(site)
     end
 
-    # Capture the current user callsite and return a formatted version for printing. If the user
-    # has overridden the call site, this will return the user's version.
-    #
-    def get_call_site
-      jcontext.getCallSite
+    def clear_call_site
+      jcontext.clearCallSite
     end
 
     # Return a copy of this SparkContext's configuration. The configuration *cannot*
@@ -313,7 +316,7 @@ module Spark
     alias_method :setLocalProperty, :set_local_property
     alias_method :getLocalProperty, :get_local_property
     alias_method :setCallSite, :set_call_site
-    alias_method :getCallSite, :get_call_site
+    alias_method :clearCallSite, :clear_call_site
     alias_method :runJob, :run_job
     alias_method :runJobWithCommand, :run_job_with_command
     alias_method :addFile, :add_file
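
In 1.2.0 call-site handling is delegated straight to the JVM context: set_call_site now calls jcontext.setCallSite, and the old get_call_site reader is replaced by clear_call_site (aliased clearCallSite). A minimal usage sketch, assuming a context started via Spark.start as in the README:

    Spark.start
    sc = Spark.sc

    sc.set_call_site('word-count')       # shown as the stage description in the Spark UI
    counts = sc.parallelize(0..10).collect
    sc.clear_call_site                   # new in 1.2.0; get_call_site is gone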

data/lib/spark/error.rb

@@ -47,4 +47,8 @@ module Spark
   # Wrong instances
   class MllibError < StandardError
   end
+
+  # Missing Java class
+  class JavaBridgeError < StandardError
+  end
 end
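
The new Spark::JavaBridgeError is raised when a required Java class cannot be imported, instead of the import failing silently. A hedged sketch of handling it, assuming the error surfaces while starting Spark before the jars have been assembled with `ruby-spark build`:

    begin
      Spark.start
    rescue Spark::JavaBridgeError => e
      warn e.message   # "Class ... is missing. Make sure that Spark and RubySpark is assembled."
      exit 1
    end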

data/lib/spark/helper/statistic.rb

@@ -77,9 +77,13 @@ module Spark
     # == Example:
     #   data = [0,1,2,3,4,5,6,7,8,9,10]
     #   determine_bounds(data, 3)
-    #   # => [2, 5, 8]
+    #   # => [3, 7]
     #
     def determine_bounds(data, num_partitions)
+      if num_partitions > data.size
+        return data
+      end
+
       bounds = []
       count = data.size
       (0...(num_partitions-1)).each do |index|
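
Besides correcting the documented result ([3, 7] rather than [2, 5, 8]), the new guard makes determine_bounds return the data itself when more partitions than elements are requested. Illustration, mixing the helper into a throwaway class (module name assumed to be Spark::Helper::Statistic, matching the file path):

    class BoundsDemo
      include Spark::Helper::Statistic
    end

    demo = BoundsDemo.new
    demo.determine_bounds([0,1,2,3,4,5,6,7,8,9,10], 3)  # => [3, 7]
    demo.determine_bounds([1, 2], 5)                     # => [1, 2] (num_partitions > data.size)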

data/lib/spark/java_bridge.rb

@@ -7,12 +7,14 @@ module Spark
 
     include Spark::Helper::System
 
-    def self.get
+    def self.init(*args)
       if jruby?
-        JRuby
+        klass = JRuby
       else
-        RJB
+        klass = RJB
       end
+
+      klass.new(*args)
     end
 
   end
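
JavaBridge.get, which returned the bridge class, becomes JavaBridge.init, which picks JRuby or RJB by platform and returns a constructed instance. A sketch of the new call pattern; the jars directory below is a placeholder for wherever the assembled jars live:

    jars_dir = File.expand_path('~/.ruby-spark/target')   # placeholder path

    bridge = Spark::JavaBridge.init(jars_dir)   # Spark::JavaBridge::JRuby or ::RJB instance
    bridge.import_all                           # renamed from `load` (see base.rb below)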

data/lib/spark/java_bridge/base.rb

@@ -41,31 +41,31 @@ module Spark
 
       RUBY_TO_JAVA_SKIP = [Fixnum, Integer]
 
-      def initialize(spark_home)
-        @spark_home = spark_home
+      def initialize(target)
+        @target = target
       end
 
       # Import all important classes into Objects
-      def load
-        return if @loaded
+      def import_all
+        return if @imported
 
         java_objects.each do |name, klass|
           import(name, klass)
         end
 
-        @loaded = true
+        @imported = true
         nil
       end
 
       # Import classes for testing
-      def load_test
-        return if @loaded_test
+      def import_all_test
+        return if @imported_test
 
         java_test_objects.each do |name, klass|
           import(name, klass)
         end
 
-        @loaded_test = true
+        @imported_test = true
         nil
       end
 
@@ -168,13 +168,9 @@ module Spark
       private
 
       def jars
-        result = []
-        if File.file?(@spark_home)
-          result << @spark_home
-        else
-          result << Dir.glob(File.join(@spark_home, '*.jar'))
-        end
-        result.flatten
+        result = Dir.glob(File.join(@target, '*.jar'))
+        result.flatten!
+        result
       end
 
       def objects_with_names(objects)
@@ -198,6 +194,10 @@ module Spark
         objects_with_names(JAVA_TEST_OBJECTS)
       end
 
+      def raise_missing_class(klass)
+        raise Spark::JavaBridgeError, "Class #{klass} is missing. Make sure that Spark and RubySpark is assembled."
+      end
+
     end
   end
 end

data/lib/spark/java_bridge/jruby.rb

@@ -11,7 +11,9 @@ module Spark
 
       def import(name, klass)
         klass = "Java::#{klass}"
-        Object.const_set(name, eval(klass)) rescue nil
+        Object.const_set(name, eval(klass))
+      rescue NameError
+        raise_missing_class(klass)
       end
 
       def java_object?(object)

data/lib/spark/java_bridge/rjb.rb

@@ -16,6 +16,8 @@ module Spark
 
       def import(name, klass)
         Object.const_set(name, silence_warnings { Rjb.import(klass) })
+      rescue NoClassDefFoundError
+        raise_missing_class(klass)
       end
 
       def java_object?(object)

data/lib/spark/mllib/classification/logistic_regression.rb

@@ -97,7 +97,8 @@ module Spark
         initial_weights: nil,
         reg_param: 0.01,
         reg_type: 'l2',
-        intercept: false
+        intercept: false,
+        validate: true
       }
 
       # Train a logistic regression model on the given data.
@@ -134,6 +135,12 @@ module Spark
       #   or not of the augmented representation for
       #   training data (i.e. whether bias features
      #   are activated or not).
+      #   (default: false)
+      #
+      # validate::
+      #   Boolean parameter which indicates if the
+      #   algorithm should validate data before training.
+      #   (default: true)
       #
       def self.train(rdd, options={})
         super
@@ -145,7 +152,8 @@ module Spark
                 options[:initial_weights],
                 options[:reg_param].to_f,
                 options[:reg_type],
-                options[:intercept])
+                options[:intercept],
+                options[:validate])
 
         LogisticRegressionModel.new(weights, intercept)
       end
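
Every SGD-based trainer in this release (logistic regression, SVM, and the lasso, ridge and linear regression trainers below) gains a validate option, true by default, which is forwarded to MLlib's input validation. A hedged sketch, assuming the LabeledPoint and LogisticRegressionWithSGD names exposed by the gem's Mllib module:

    Spark::Mllib.import

    data = [
      LabeledPoint.new(0.0, [0.0, 1.0]),
      LabeledPoint.new(1.0, [1.0, 0.0])
    ]

    # validate: false skips MLlib's data checks before training (default is true)
    model = LogisticRegressionWithSGD.train(Spark.sc.parallelize(data), validate: false)
    model.predict([1.0, 0.0])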

data/lib/spark/mllib/classification/svm.rb

@@ -78,7 +78,8 @@ module Spark
         mini_batch_fraction: 1.0,
         initial_weights: nil,
         reg_type: 'l2',
-        intercept: false
+        intercept: false,
+        validate: true
       }
 
       # Train a support vector machine on the given data.
@@ -114,6 +115,12 @@ module Spark
       #   or not of the augmented representation for
       #   training data (i.e. whether bias features
       #   are activated or not).
+      #   (default: false)
+      #
+      # validateData::
+      #   Boolean parameter which indicates if the
+      #   algorithm should validate data before training.
+      #   (default: true)
       #
       def self.train(rdd, options={})
         super
@@ -125,7 +132,8 @@ module Spark
                 options[:mini_batch_fraction].to_f,
                 options[:initial_weights],
                 options[:reg_type],
-                options[:intercept])
+                options[:intercept],
+                options[:validate])
 
         SVMModel.new(weights, intercept)
       end

data/lib/spark/mllib/clustering/kmeans.rb

@@ -107,10 +107,14 @@ module Spark
       # seed::
       #   Random seed value for cluster initialization.
       #
-      def self.train(rdd, k, max_iterations: 100, runs: 1, initialization_mode: 'k-means||', seed: nil)
+      # epsilon::
+      #   The distance threshold within which we've consider centers to have converged.
+      #
+      def self.train(rdd, k, max_iterations: 100, runs: 1, initialization_mode: 'k-means||', seed: nil,
+                     initialization_steps: 5, epsilon: 0.0001)
         # Call returns KMeansModel
         Spark.jb.call(RubyMLLibAPI.new, 'trainKMeansModel', rdd,
-                      k, max_iterations, runs, initialization_mode, Spark.jb.to_long(seed))
+                      k, max_iterations, runs, initialization_mode, Spark.jb.to_long(seed), initialization_steps, epsilon)
       end
 
     end
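
KMeans.train now exposes initialization_steps and epsilon and forwards them to MLlib. A sketch (KMeans and the model's predict method assumed from the gem's Mllib module):

    Spark::Mllib.import

    points = Spark.sc.parallelize([[0.0, 0.0], [0.1, 0.1], [9.0, 9.0], [9.1, 9.1]])

    model = KMeans.train(points, 2,
                         max_iterations: 20,
                         initialization_steps: 5,   # new in 1.2.0
                         epsilon: 0.0001)           # convergence threshold, new in 1.2.0
    model.predict([9.0, 9.0])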

data/lib/spark/mllib/regression/lasso.rb

@@ -58,7 +58,9 @@ module Spark
         step: 1.0,
         reg_param: 0.01,
         mini_batch_fraction: 1.0,
-        initial_weights: nil
+        initial_weights: nil,
+        intercept: false,
+        validate: true
       }
 
       # Train a Lasso regression model on the given data.
@@ -82,6 +84,18 @@ module Spark
       # initial_weights::
       #   The initial weights (default: nil).
       #
+      # intercept::
+      #   Boolean parameter which indicates the use
+      #   or not of the augmented representation for
+      #   training data (i.e. whether bias features
+      #   are activated or not).
+      #   (default: false)
+      #
+      # validate::
+      #   Boolean parameter which indicates if the
+      #   algorithm should validate data before training.
+      #   (default: true)
+      #
       def self.train(rdd, options={})
         super
 
@@ -90,7 +104,9 @@ module Spark
                 options[:step].to_f,
                 options[:reg_param].to_f,
                 options[:mini_batch_fraction].to_f,
-                options[:initial_weights])
+                options[:initial_weights],
+                options[:intercept],
+                options[:validate])
 
         LassoModel.new(weights, intercept)
       end
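
The regression trainers (lasso here, ridge and linear regression below) additionally gain the intercept option. A minimal hedged sketch, with the LassoWithSGD class name assumed from the gem's Mllib module:

    Spark::Mllib.import

    data = [
      LabeledPoint.new(1.0, [1.0]),
      LabeledPoint.new(2.0, [2.0]),
      LabeledPoint.new(3.0, [3.0])
    ]

    model = LassoWithSGD.train(Spark.sc.parallelize(data),
                               intercept: true,   # fit a bias term, new option
                               validate: true)    # default
    model.predict([4.0])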

data/lib/spark/mllib/regression/linear.rb

@@ -66,7 +66,8 @@ module Spark
         initial_weights: nil,
         reg_param: 0.0,
         reg_type: nil,
-        intercept: false
+        intercept: false,
+        validate: true
       }
 
       # Train a linear regression model on the given data.
@@ -102,7 +103,13 @@ module Spark
       #   Boolean parameter which indicates the use
       #   or not of the augmented representation for
       #   training data (i.e. whether bias features
-      #   are activated or not). (default: False)
+      #   are activated or not).
+      #   (default: false)
+      #
+      # validate::
+      #   Boolean parameter which indicates if the
+      #   algorithm should validate data before training.
+      #   (default: true)
       #
       def self.train(rdd, options={})
         super
@@ -114,7 +121,8 @@ module Spark
                 options[:initial_weights],
                 options[:reg_param].to_f,
                 options[:reg_type],
-                options[:intercept])
+                options[:intercept],
+                options[:validate])
 
         LinearRegressionModel.new(weights, intercept)
       end

data/lib/spark/mllib/regression/ridge.rb

@@ -55,7 +55,9 @@ module Spark
         step: 1.0,
         reg_param: 0.01,
         mini_batch_fraction: 1.0,
-        initial_weights: nil
+        initial_weights: nil,
+        intercept: false,
+        validate: true
       }
 
       # Train a ridge regression model on the given data.
@@ -79,6 +81,18 @@ module Spark
       # initial_weights::
       #   The initial weights (default: nil).
       #
+      # intercept::
+      #   Boolean parameter which indicates the use
+      #   or not of the augmented representation for
+      #   training data (i.e. whether bias features
+      #   are activated or not).
+      #   (default: false)
+      #
+      # validate::
+      #   Boolean parameter which indicates if the
+      #   algorithm should validate data before training.
+      #   (default: true)
+      #
       def self.train(rdd, options={})
         super
 
@@ -87,7 +101,9 @@ module Spark
                 options[:step].to_f,
                 options[:reg_param].to_f,
                 options[:mini_batch_fraction].to_f,
-                options[:initial_weights])
+                options[:initial_weights],
+                options[:intercept],
+                options[:validate])
 
         RidgeRegressionModel.new(weights, intercept)
       end

data/lib/spark/rdd.rb

@@ -39,6 +39,7 @@ module Spark
 
       result = %{#<#{self.class.name}:0x#{object_id}}
       result << %{ (#{comms})} unless comms.empty?
+      result << %{ (cached)} if cached?
       result << %{\n}
       result << %{ Serializer: "#{serializer}"\n}
       result << %{Deserializer: "#{deserializer}"}
@@ -166,8 +167,13 @@ module Spark
 
     # Assign a name to this RDD.
    #
-    def set_name(name)
-      jrdd.setName(name)
+    def set_name(value)
+      jrdd.setName(value)
+      value
+    end
+
+    def name=(value)
+      set_name(value)
     end
 
     def to_java
@@ -193,11 +199,14 @@ module Spark
     def collect(as_enum=false)
       file = Tempfile.new('collect', context.temp_dir)
 
+      context.set_call_site(caller.first)
       RubyRDD.writeRDDToFile(jrdd.rdd, file.path)
 
       collect_from_file(file, as_enum)
     rescue => e
       raise Spark::RDDError, e.message
+    ensure
+      context.clear_call_site
     end
 
     def collect_from_file(file, as_enum=false)
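
On the RDD side, set_name now returns the assigned value and gains a name= alias, inspect marks cached RDDs, and collect wraps the job in a call site taken from the Ruby caller (cleared in an ensure block) so failed jobs point back at user code. A small usage sketch:

    rdd = Spark.sc.parallelize(0..10)
    rdd.name = 'numbers'   # equivalent to rdd.set_name('numbers')
    rdd.cache
    rdd.inspect            # now includes "(cached)"
    rdd.collect            # call site is set from caller.first and cleared afterwards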

data/lib/spark/serializer.rb

@@ -55,7 +55,7 @@ module Spark
       if block_given?
         class_eval(&block)
       else
-        class_eval(text.to_s)
+        class_eval(text.to_s.downcase)
       end
     end
 

data/lib/spark/serializer/auto_batched.rb

@@ -16,6 +16,13 @@ module Spark
       error('Batch size must be greater than 1') if @best_size < 2
     end
 
+    def batched?
+      true
+    end
+
+    def unbatch!
+    end
+
     def name
       "AutoBatched(#{@best_size})"
     end

data/lib/spark/version.rb

@@ -1,3 +1,3 @@
 module Spark
-  VERSION = '1.1.0.1'
+  VERSION = '1.2.0'
 end

data/ruby-spark.gemspec

@@ -21,9 +21,13 @@ Gem::Specification.new do |spec|
   spec.require_paths = ['lib']
 
   if RUBY_PLATFORM =~ /java/
+    spec.platform = 'java'
+
     extensions = ['ext/ruby_java/extconf.rb']
   else
     extensions = ['ext/ruby_c/extconf.rb']
+
+    spec.add_dependency 'rjb'
   end
 
   spec.extensions = extensions
@@ -38,11 +42,6 @@ Gem::Specification.new do |spec|
   spec.add_dependency 'nio4r'
   spec.add_dependency 'distribution'
 
-  if RUBY_PLATFORM =~ /java/
-  else
-    spec.add_dependency 'rjb'
-  end
-
   spec.add_development_dependency 'bundler', '~> 1.6'
   spec.add_development_dependency 'rake'
 end