ruby-spark 1.1.0.1 → 1.2.0

Files changed (55)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/.travis.yml +15 -0
  4. data/CHANGELOG.md +8 -0
  5. data/README.md +184 -57
  6. data/TODO.md +3 -1
  7. data/ext/spark/build.sbt +5 -5
  8. data/ext/spark/src/main/scala/RubyWorker.scala +7 -16
  9. data/lib/spark.rb +69 -10
  10. data/lib/spark/accumulator.rb +8 -0
  11. data/lib/spark/broadcast.rb +7 -0
  12. data/lib/spark/build.rb +10 -10
  13. data/lib/spark/cli.rb +68 -76
  14. data/lib/spark/config.rb +13 -17
  15. data/lib/spark/context.rb +10 -7
  16. data/lib/spark/error.rb +4 -0
  17. data/lib/spark/helper/statistic.rb +5 -1
  18. data/lib/spark/java_bridge.rb +5 -3
  19. data/lib/spark/java_bridge/base.rb +15 -15
  20. data/lib/spark/java_bridge/jruby.rb +3 -1
  21. data/lib/spark/java_bridge/rjb.rb +2 -0
  22. data/lib/spark/mllib/classification/logistic_regression.rb +10 -2
  23. data/lib/spark/mllib/classification/svm.rb +10 -2
  24. data/lib/spark/mllib/clustering/kmeans.rb +6 -2
  25. data/lib/spark/mllib/regression/lasso.rb +18 -2
  26. data/lib/spark/mllib/regression/linear.rb +11 -3
  27. data/lib/spark/mllib/regression/ridge.rb +18 -2
  28. data/lib/spark/rdd.rb +11 -2
  29. data/lib/spark/serializer.rb +1 -1
  30. data/lib/spark/serializer/auto_batched.rb +7 -0
  31. data/lib/spark/version.rb +1 -1
  32. data/ruby-spark.gemspec +4 -5
  33. data/spec/generator.rb +1 -1
  34. data/spec/lib/collect_spec.rb +10 -10
  35. data/spec/lib/config_spec.rb +10 -10
  36. data/spec/lib/context_spec.rb +116 -115
  37. data/spec/lib/ext_spec.rb +17 -17
  38. data/spec/lib/external_apps_spec.rb +1 -1
  39. data/spec/lib/filter_spec.rb +17 -17
  40. data/spec/lib/flat_map_spec.rb +22 -19
  41. data/spec/lib/group_spec.rb +22 -19
  42. data/spec/lib/helper_spec.rb +60 -12
  43. data/spec/lib/key_spec.rb +9 -8
  44. data/spec/lib/manipulation_spec.rb +15 -15
  45. data/spec/lib/map_partitions_spec.rb +6 -4
  46. data/spec/lib/map_spec.rb +22 -19
  47. data/spec/lib/reduce_by_key_spec.rb +19 -19
  48. data/spec/lib/reduce_spec.rb +22 -20
  49. data/spec/lib/sample_spec.rb +13 -12
  50. data/spec/lib/serializer_spec.rb +27 -0
  51. data/spec/lib/sort_spec.rb +16 -14
  52. data/spec/lib/statistic_spec.rb +4 -2
  53. data/spec/lib/whole_text_files_spec.rb +9 -8
  54. data/spec/spec_helper.rb +3 -3
  55. metadata +19 -18
data/lib/spark/context.rb

@@ -38,6 +38,12 @@ module Spark
       set_call_site('Ruby') # description of stage
     end
 
+    def inspect
+      result  = %{#<#{self.class.name}:0x#{object_id}\n}
+      result << %{Tempdir: "#{temp_dir}">}
+      result
+    end
+
     def stop
       Spark::Accumulator::Server.stop
       log_info('Ruby accumulator server was stopped')
@@ -108,14 +114,11 @@ module Spark
     # Support function for API backtraces.
     #
     def set_call_site(site)
-      set_local_property('externalCallSite', site)
+      jcontext.setCallSite(site)
     end
 
-    # Capture the current user callsite and return a formatted version for printing. If the user
-    # has overridden the call site, this will return the user's version.
-    #
-    def get_call_site
-      jcontext.getCallSite
+    def clear_call_site
+      jcontext.clearCallSite
     end
 
     # Return a copy of this SparkContext's configuration. The configuration *cannot*
@@ -313,7 +316,7 @@ module Spark
     alias_method :setLocalProperty, :set_local_property
     alias_method :getLocalProperty, :get_local_property
     alias_method :setCallSite, :set_call_site
-    alias_method :getCallSite, :get_call_site
+    alias_method :clearCallSite, :clear_call_site
     alias_method :runJob, :run_job
     alias_method :runJobWithCommand, :run_job_with_command
     alias_method :addFile, :add_file
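Call sites are now managed on the JVM context directly. A minimal sketch of how the new pair might be used (assuming a running context in `sc`; the label string is illustrative):

    # Tag the stages triggered in this block in the Spark UI, then restore the default.
    sc.set_call_site('nightly aggregation')   # alias: sc.setCallSite
    begin
      # ... run jobs here ...
    ensure
      sc.clear_call_site                      # alias: sc.clearCallSite
    end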
data/lib/spark/error.rb

@@ -47,4 +47,8 @@ module Spark
   # Wrong instances
   class MllibError < StandardError
   end
+
+  # Missing Java class
+  class JavaBridgeError < StandardError
+  end
 end
data/lib/spark/helper/statistic.rb

@@ -77,9 +77,13 @@ module Spark
     # == Example:
     #   data = [0,1,2,3,4,5,6,7,8,9,10]
     #   determine_bounds(data, 3)
-    #   # => [2, 5, 8]
+    #   # => [3, 7]
     #
     def determine_bounds(data, num_partitions)
+      if num_partitions > data.size
+        return data
+      end
+
       bounds = []
       count = data.size
       (0...(num_partitions-1)).each do |index|
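The corrected doc example reflects that the helper returns num_partitions - 1 boundary values, and the new guard returns the data itself when there are more partitions than elements. A quick illustration (the mixin path is assumed from the file location):

    include Spark::Helper::Statistic   # assumed mixin for this helper

    determine_bounds([0,1,2,3,4,5,6,7,8,9,10], 3)  # => [3, 7]   two bounds split 11 items into 3 partitions
    determine_bounds([1, 2, 3], 10)                # => [1, 2, 3] new early return: more partitions than items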
data/lib/spark/java_bridge.rb

@@ -7,12 +7,14 @@ module Spark
 
   include Spark::Helper::System
 
-  def self.get
+  def self.init(*args)
     if jruby?
-      JRuby
+      klass = JRuby
     else
-      RJB
+      klass = RJB
     end
+
+    klass.new(*args)
   end
 
 end
data/lib/spark/java_bridge/base.rb

@@ -41,31 +41,31 @@ module Spark
 
     RUBY_TO_JAVA_SKIP = [Fixnum, Integer]
 
-    def initialize(spark_home)
-      @spark_home = spark_home
+    def initialize(target)
+      @target = target
     end
 
     # Import all important classes into Objects
-    def load
-      return if @loaded
+    def import_all
+      return if @imported
 
       java_objects.each do |name, klass|
         import(name, klass)
       end
 
-      @loaded = true
+      @imported = true
       nil
     end
 
     # Import classes for testing
-    def load_test
-      return if @loaded_test
+    def import_all_test
+      return if @imported_test
 
       java_test_objects.each do |name, klass|
         import(name, klass)
       end
 
-      @loaded_test = true
+      @imported_test = true
       nil
     end
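Taken together, `Spark::JavaBridge.init` now hands back a configured bridge instance rather than a class, and the old `load`/`load_test` become `import_all`/`import_all_test`. A minimal sketch of the new flow (the jar directory path is illustrative):

    # Build the right bridge (JRuby or RJB) for the current interpreter
    # and load the Spark classes into the Object namespace.
    bridge = Spark::JavaBridge.init('/usr/local/spark/assembly/target')  # path is an example
    bridge.import_all          # formerly `load`
    bridge.import_all_test     # formerly `load_test`, only needed for the spec suite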
 
@@ -168,13 +168,9 @@ module Spark
     private
 
       def jars
-        result = []
-        if File.file?(@spark_home)
-          result << @spark_home
-        else
-          result << Dir.glob(File.join(@spark_home, '*.jar'))
-        end
-        result.flatten
+        result = Dir.glob(File.join(@target, '*.jar'))
+        result.flatten!
+        result
       end
 
       def objects_with_names(objects)
@@ -198,6 +194,10 @@ module Spark
        objects_with_names(JAVA_TEST_OBJECTS)
       end
 
+      def raise_missing_class(klass)
+        raise Spark::JavaBridgeError, "Class #{klass} is missing. Make sure that Spark and RubySpark is assembled."
+      end
+
   end
 end
 end
data/lib/spark/java_bridge/jruby.rb

@@ -11,7 +11,9 @@ module Spark
 
     def import(name, klass)
       klass = "Java::#{klass}"
-      Object.const_set(name, eval(klass)) rescue nil
+      Object.const_set(name, eval(klass))
+    rescue NameError
+      raise_missing_class(klass)
     end
 
     def java_object?(object)
data/lib/spark/java_bridge/rjb.rb

@@ -16,6 +16,8 @@ module Spark
 
     def import(name, klass)
       Object.const_set(name, silence_warnings { Rjb.import(klass) })
+    rescue NoClassDefFoundError
+      raise_missing_class(klass)
     end
 
     def java_object?(object)
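A missing or unassembled Spark jar now surfaces as `Spark::JavaBridgeError` instead of silently defining nothing. A hedged sketch of how a caller might react (reusing the `bridge` from the sketch above):

    begin
      bridge.import_all
    rescue Spark::JavaBridgeError => e
      # e.message names the missing Java class, e.g. when the assembly jar was never built
      abort "ruby-spark cannot find its Java classes: #{e.message}"
    end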
data/lib/spark/mllib/classification/logistic_regression.rb

@@ -97,7 +97,8 @@ module Spark
       initial_weights: nil,
       reg_param: 0.01,
       reg_type: 'l2',
-      intercept: false
+      intercept: false,
+      validate: true
     }
 
     # Train a logistic regression model on the given data.
@@ -134,6 +135,12 @@ module Spark
     #   or not of the augmented representation for
     #   training data (i.e. whether bias features
     #   are activated or not).
+    #   (default: false)
+    #
+    # validate::
+    #   Boolean parameter which indicates if the
+    #   algorithm should validate data before training.
+    #   (default: true)
     #
     def self.train(rdd, options={})
       super
@@ -145,7 +152,8 @@ module Spark
           options[:initial_weights],
           options[:reg_param].to_f,
           options[:reg_type],
-          options[:intercept])
+          options[:intercept],
+          options[:validate])
 
       LogisticRegressionModel.new(weights, intercept)
     end
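All SGD-based trainers in this release gain a `validate` option (pre-training data validation, on by default); the same flag appears on the SVM and regression trainers below. A small usage sketch, assuming a running context in `sc` and PySpark-style class names under `Spark::Mllib` (treat the exact class names as assumptions):

    data = [
      Spark::Mllib::LabeledPoint.new(0.0, [0.0, 1.0]),
      Spark::Mllib::LabeledPoint.new(1.0, [1.0, 0.0])
    ]

    # Skip the pre-training sanity checks on an already-clean dataset.
    model = Spark::Mllib::LogisticRegressionWithSGD.train(sc.parallelize(data), validate: false)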
data/lib/spark/mllib/classification/svm.rb

@@ -78,7 +78,8 @@ module Spark
       mini_batch_fraction: 1.0,
       initial_weights: nil,
       reg_type: 'l2',
-      intercept: false
+      intercept: false,
+      validate: true
     }
 
     # Train a support vector machine on the given data.
@@ -114,6 +115,12 @@ module Spark
     #   or not of the augmented representation for
     #   training data (i.e. whether bias features
     #   are activated or not).
+    #   (default: false)
+    #
+    # validate::
+    #   Boolean parameter which indicates if the
+    #   algorithm should validate data before training.
+    #   (default: true)
     #
     def self.train(rdd, options={})
       super
@@ -125,7 +132,8 @@ module Spark
           options[:mini_batch_fraction].to_f,
           options[:initial_weights],
           options[:reg_type],
-          options[:intercept])
+          options[:intercept],
+          options[:validate])
 
       SVMModel.new(weights, intercept)
     end
data/lib/spark/mllib/clustering/kmeans.rb

@@ -107,10 +107,14 @@ module Spark
     # seed::
     #   Random seed value for cluster initialization.
     #
-    def self.train(rdd, k, max_iterations: 100, runs: 1, initialization_mode: 'k-means||', seed: nil)
+    # epsilon::
+    #   The distance threshold within which we consider centers to have converged.
+    #
+    def self.train(rdd, k, max_iterations: 100, runs: 1, initialization_mode: 'k-means||', seed: nil,
+                   initialization_steps: 5, epsilon: 0.0001)
       # Call returns KMeansModel
       Spark.jb.call(RubyMLLibAPI.new, 'trainKMeansModel', rdd,
-                    k, max_iterations, runs, initialization_mode, Spark.jb.to_long(seed))
+                    k, max_iterations, runs, initialization_mode, Spark.jb.to_long(seed), initialization_steps, epsilon)
     end
 
   end
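KMeans training now exposes `initialization_steps` and `epsilon`. A hedged usage sketch (assuming a running context in `sc` and the KMeans class under `Spark::Mllib`):

    rdd = sc.parallelize([[0.0, 0.0], [1.0, 1.0], [9.0, 8.0], [8.0, 9.0]])

    # Two clusters; refinement stops once centers move less than epsilon.
    model = Spark::Mllib::KMeans.train(rdd, 2,
                                       max_iterations: 50,
                                       initialization_steps: 5,
                                       epsilon: 1e-4)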
data/lib/spark/mllib/regression/lasso.rb

@@ -58,7 +58,9 @@ module Spark
       step: 1.0,
       reg_param: 0.01,
       mini_batch_fraction: 1.0,
-      initial_weights: nil
+      initial_weights: nil,
+      intercept: false,
+      validate: true
     }
 
     # Train a Lasso regression model on the given data.
@@ -82,6 +84,18 @@ module Spark
     # initial_weights::
     #   The initial weights (default: nil).
     #
+    # intercept::
+    #   Boolean parameter which indicates the use
+    #   or not of the augmented representation for
+    #   training data (i.e. whether bias features
+    #   are activated or not).
+    #   (default: false)
+    #
+    # validate::
+    #   Boolean parameter which indicates if the
+    #   algorithm should validate data before training.
+    #   (default: true)
+    #
     def self.train(rdd, options={})
       super
 
@@ -90,7 +104,9 @@ module Spark
           options[:step].to_f,
           options[:reg_param].to_f,
           options[:mini_batch_fraction].to_f,
-          options[:initial_weights])
+          options[:initial_weights],
+          options[:intercept],
+          options[:validate])
 
       LassoModel.new(weights, intercept)
     end
data/lib/spark/mllib/regression/linear.rb

@@ -66,7 +66,8 @@ module Spark
       initial_weights: nil,
       reg_param: 0.0,
       reg_type: nil,
-      intercept: false
+      intercept: false,
+      validate: true
     }
 
     # Train a linear regression model on the given data.
@@ -102,7 +103,13 @@ module Spark
     #   Boolean parameter which indicates the use
     #   or not of the augmented representation for
     #   training data (i.e. whether bias features
-    #   are activated or not). (default: False)
+    #   are activated or not).
+    #   (default: false)
+    #
+    # validate::
+    #   Boolean parameter which indicates if the
+    #   algorithm should validate data before training.
+    #   (default: true)
     #
     def self.train(rdd, options={})
       super
@@ -114,7 +121,8 @@ module Spark
           options[:initial_weights],
           options[:reg_param].to_f,
           options[:reg_type],
-          options[:intercept])
+          options[:intercept],
+          options[:validate])
 
       LinearRegressionModel.new(weights, intercept)
     end
data/lib/spark/mllib/regression/ridge.rb

@@ -55,7 +55,9 @@ module Spark
       step: 1.0,
       reg_param: 0.01,
       mini_batch_fraction: 1.0,
-      initial_weights: nil
+      initial_weights: nil,
+      intercept: false,
+      validate: true
     }
 
     # Train a ridge regression model on the given data.
@@ -79,6 +81,18 @@ module Spark
     # initial_weights::
     #   The initial weights (default: nil).
     #
+    # intercept::
+    #   Boolean parameter which indicates the use
+    #   or not of the augmented representation for
+    #   training data (i.e. whether bias features
+    #   are activated or not).
+    #   (default: false)
+    #
+    # validate::
+    #   Boolean parameter which indicates if the
+    #   algorithm should validate data before training.
+    #   (default: true)
+    #
     def self.train(rdd, options={})
       super
 
@@ -87,7 +101,9 @@ module Spark
           options[:step].to_f,
           options[:reg_param].to_f,
           options[:mini_batch_fraction].to_f,
-          options[:initial_weights])
+          options[:initial_weights],
+          options[:intercept],
+          options[:validate])
 
       RidgeRegressionModel.new(weights, intercept)
     end
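Linear, Lasso and Ridge regression pick up the same `intercept` and `validate` options. A brief sketch with Lasso (class names assumed to mirror the PySpark-style naming above):

    points = [
      Spark::Mllib::LabeledPoint.new(1.0, [1.0]),
      Spark::Mllib::LabeledPoint.new(2.0, [2.0]),
      Spark::Mllib::LabeledPoint.new(3.0, [3.0])
    ]

    # Fit a bias term; leave the default pre-training validation on.
    model = Spark::Mllib::LassoWithSGD.train(sc.parallelize(points), intercept: true)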
data/lib/spark/rdd.rb

@@ -39,6 +39,7 @@ module Spark
 
     result  = %{#<#{self.class.name}:0x#{object_id}}
     result << %{ (#{comms})} unless comms.empty?
+    result << %{ (cached)} if cached?
     result << %{\n}
     result << %{ Serializer: "#{serializer}"\n}
     result << %{Deserializer: "#{deserializer}"}
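The inspect output now flags cached RDDs. A quick illustration (assuming a running context in `sc`, and that `cache` marks the RDD so `cached?` returns true):

    rdd = sc.parallelize(0..5)
    rdd.cache
    puts rdd.inspect   # the header line now carries "(cached)"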
@@ -166,8 +167,13 @@ module Spark
 
     # Assign a name to this RDD.
     #
-    def set_name(name)
-      jrdd.setName(name)
+    def set_name(value)
+      jrdd.setName(value)
+      value
+    end
+
+    def name=(value)
+      set_name(value)
     end
 
     def to_java
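`set_name` now returns the assigned value and gains a conventional Ruby writer. A tiny sketch (again assuming a running context in `sc`):

    rdd = sc.parallelize(1..100)
    rdd.name = 'numbers 1..100'   # new writer, delegates to set_name
    rdd.set_name('numbers')       # now returns "numbers" instead of the underlying Java object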
@@ -193,11 +199,14 @@ module Spark
     def collect(as_enum=false)
       file = Tempfile.new('collect', context.temp_dir)
 
+      context.set_call_site(caller.first)
       RubyRDD.writeRDDToFile(jrdd.rdd, file.path)
 
       collect_from_file(file, as_enum)
     rescue => e
       raise Spark::RDDError, e.message
+    ensure
+      context.clear_call_site
     end
 
     def collect_from_file(file, as_enum=false)
data/lib/spark/serializer.rb

@@ -55,7 +55,7 @@ module Spark
       if block_given?
         class_eval(&block)
       else
-        class_eval(text.to_s)
+        class_eval(text.to_s.downcase)
       end
     end
 
data/lib/spark/serializer/auto_batched.rb

@@ -16,6 +16,13 @@ module Spark
       error('Batch size must be greater than 1') if @best_size < 2
     end
 
+    def batched?
+      true
+    end
+
+    def unbatch!
+    end
+
     def name
       "AutoBatched(#{@best_size})"
     end
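`AutoBatched` now answers the common serializer predicates, so calling code can treat every serializer uniformly. A hypothetical helper (not part of the gem) illustrating the duck-typed interface:

    # Collapse a serializer to its unbatched form when possible;
    # AutoBatched#unbatch! is a no-op, so it simply stays auto-batched.
    def ensure_unbatched(serializer)
      serializer.unbatch! if serializer.batched?
      serializer
    end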
data/lib/spark/version.rb

@@ -1,3 +1,3 @@
 module Spark
-  VERSION = '1.1.0.1'
+  VERSION = '1.2.0'
 end
data/ruby-spark.gemspec

@@ -21,9 +21,13 @@ Gem::Specification.new do |spec|
   spec.require_paths = ['lib']
 
   if RUBY_PLATFORM =~ /java/
+    spec.platform = 'java'
+
     extensions = ['ext/ruby_java/extconf.rb']
   else
     extensions = ['ext/ruby_c/extconf.rb']
+
+    spec.add_dependency 'rjb'
   end
 
   spec.extensions = extensions
@@ -38,11 +42,6 @@ Gem::Specification.new do |spec|
   spec.add_dependency 'nio4r'
   spec.add_dependency 'distribution'
 
-  if RUBY_PLATFORM =~ /java/
-  else
-    spec.add_dependency 'rjb'
-  end
-
   spec.add_development_dependency 'bundler', '~> 1.6'
   spec.add_development_dependency 'rake'
 end