ruby-spark 1.1.0.1 → 1.2.0
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.travis.yml +15 -0
- data/CHANGELOG.md +8 -0
- data/README.md +184 -57
- data/TODO.md +3 -1
- data/ext/spark/build.sbt +5 -5
- data/ext/spark/src/main/scala/RubyWorker.scala +7 -16
- data/lib/spark.rb +69 -10
- data/lib/spark/accumulator.rb +8 -0
- data/lib/spark/broadcast.rb +7 -0
- data/lib/spark/build.rb +10 -10
- data/lib/spark/cli.rb +68 -76
- data/lib/spark/config.rb +13 -17
- data/lib/spark/context.rb +10 -7
- data/lib/spark/error.rb +4 -0
- data/lib/spark/helper/statistic.rb +5 -1
- data/lib/spark/java_bridge.rb +5 -3
- data/lib/spark/java_bridge/base.rb +15 -15
- data/lib/spark/java_bridge/jruby.rb +3 -1
- data/lib/spark/java_bridge/rjb.rb +2 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +10 -2
- data/lib/spark/mllib/classification/svm.rb +10 -2
- data/lib/spark/mllib/clustering/kmeans.rb +6 -2
- data/lib/spark/mllib/regression/lasso.rb +18 -2
- data/lib/spark/mllib/regression/linear.rb +11 -3
- data/lib/spark/mllib/regression/ridge.rb +18 -2
- data/lib/spark/rdd.rb +11 -2
- data/lib/spark/serializer.rb +1 -1
- data/lib/spark/serializer/auto_batched.rb +7 -0
- data/lib/spark/version.rb +1 -1
- data/ruby-spark.gemspec +4 -5
- data/spec/generator.rb +1 -1
- data/spec/lib/collect_spec.rb +10 -10
- data/spec/lib/config_spec.rb +10 -10
- data/spec/lib/context_spec.rb +116 -115
- data/spec/lib/ext_spec.rb +17 -17
- data/spec/lib/external_apps_spec.rb +1 -1
- data/spec/lib/filter_spec.rb +17 -17
- data/spec/lib/flat_map_spec.rb +22 -19
- data/spec/lib/group_spec.rb +22 -19
- data/spec/lib/helper_spec.rb +60 -12
- data/spec/lib/key_spec.rb +9 -8
- data/spec/lib/manipulation_spec.rb +15 -15
- data/spec/lib/map_partitions_spec.rb +6 -4
- data/spec/lib/map_spec.rb +22 -19
- data/spec/lib/reduce_by_key_spec.rb +19 -19
- data/spec/lib/reduce_spec.rb +22 -20
- data/spec/lib/sample_spec.rb +13 -12
- data/spec/lib/serializer_spec.rb +27 -0
- data/spec/lib/sort_spec.rb +16 -14
- data/spec/lib/statistic_spec.rb +4 -2
- data/spec/lib/whole_text_files_spec.rb +9 -8
- data/spec/spec_helper.rb +3 -3
- metadata +19 -18
data/lib/spark/context.rb
CHANGED
@@ -38,6 +38,12 @@ module Spark
       set_call_site('Ruby') # description of stage
     end

+    def inspect
+      result = %{#<#{self.class.name}:0x#{object_id}\n}
+      result << %{Tempdir: "#{temp_dir}">}
+      result
+    end
+
     def stop
       Spark::Accumulator::Server.stop
       log_info('Ruby accumulator server was stopped')
@@ -108,14 +114,11 @@ module Spark
     # Support function for API backtraces.
     #
     def set_call_site(site)
-
+      jcontext.setCallSite(site)
     end

-
-
-    #
-    def get_call_site
-      jcontext.getCallSite
+    def clear_call_site
+      jcontext.clearCallSite
     end

     # Return a copy of this SparkContext's configuration. The configuration *cannot*
@@ -313,7 +316,7 @@ module Spark
     alias_method :setLocalProperty, :set_local_property
     alias_method :getLocalProperty, :get_local_property
     alias_method :setCallSite, :set_call_site
-    alias_method :getCallSite, :get_call_site
+    alias_method :clearCallSite, :clear_call_site
     alias_method :runJob, :run_job
     alias_method :runJobWithCommand, :run_job_with_command
     alias_method :addFile, :add_file
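Taken together, the context now prints a readable summary and exposes a matching pair of call-site setters. A minimal usage sketch, assuming the `Spark.start`/`Spark.sc` bootstrap from the gem's README:

    require 'ruby-spark'

    Spark.start
    sc = Spark.sc

    puts sc.inspect             # => #<Spark::Context:0x... Tempdir: "...">

    sc.set_call_site('my job')  # stage description for backtraces and the UI
    # ... run some jobs ...
    sc.clear_call_site          # replaces the removed get_call_site reader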
data/lib/spark/error.rb
CHANGED
data/lib/spark/helper/statistic.rb
CHANGED
@@ -77,9 +77,13 @@ module Spark
     # == Example:
     #   data = [0,1,2,3,4,5,6,7,8,9,10]
     #   determine_bounds(data, 3)
-    #   # => [
+    #   # => [3, 7]
     #
     def determine_bounds(data, num_partitions)
+      if num_partitions > data.size
+        return data
+      end
+
       bounds = []
       count = data.size
       (0...(num_partitions-1)).each do |index|
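The guard keeps `determine_bounds` from indexing past the end of small datasets. A worked example, assuming the helper can be mixed in directly:

    include Spark::Helper::Statistic

    determine_bounds([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 3) # => [3, 7]

    # With more partitions than elements, the data itself is now returned
    # instead of computing out-of-range bounds:
    determine_bounds([1, 2], 5)                             # => [1, 2]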
data/lib/spark/java_bridge.rb
CHANGED
data/lib/spark/java_bridge/base.rb
CHANGED
@@ -41,31 +41,31 @@ module Spark

     RUBY_TO_JAVA_SKIP = [Fixnum, Integer]

-    def initialize(spark_home)
-      @spark_home = spark_home
+    def initialize(target)
+      @target = target
     end

     # Import all important classes into Objects
-    def …
-      return if @…
+    def import_all
+      return if @imported

       java_objects.each do |name, klass|
         import(name, klass)
       end

-      @…
+      @imported = true
       nil
     end

     # Import classes for testing
-    def …
-      return if @…
+    def import_all_test
+      return if @imported_test

       java_test_objects.each do |name, klass|
         import(name, klass)
       end

-      @…
+      @imported_test = true
       nil
     end

@@ -168,13 +168,9 @@ module Spark
     private

     def jars
-      result = …
-      …
-      …
-      else
-        result << Dir.glob(File.join(@spark_home, '*.jar'))
-      end
-      result.flatten
+      result = Dir.glob(File.join(@target, '*.jar'))
+      result.flatten!
+      result
     end

     def objects_with_names(objects)
@@ -198,6 +194,10 @@ module Spark
       objects_with_names(JAVA_TEST_OBJECTS)
     end

+    def raise_missing_class(klass)
+      raise Spark::JavaBridgeError, "Class #{klass} is missing. Make sure that Spark and RubySpark is assembled."
+    end
+
     end
   end
 end
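For callers, the visible change is the constructor: the bridge now receives the directory holding the assembled jars and globs it directly, and a missing class raises `Spark::JavaBridgeError` with a hint. A hypothetical sketch (the RJB subclass comes from the file list above; the path is illustrative):

    bridge = Spark::JavaBridge::RJB.new('/path/to/ruby-spark/target')
    bridge.import_all  # imports java_objects once, memoized via @imported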
data/lib/spark/mllib/classification/logistic_regression.rb
CHANGED
@@ -97,7 +97,8 @@ module Spark
       initial_weights: nil,
       reg_param: 0.01,
       reg_type: 'l2',
-      intercept: false
+      intercept: false,
+      validate: true
     }

     # Train a logistic regression model on the given data.
@@ -134,6 +135,12 @@ module Spark
     #   or not of the augmented representation for
     #   training data (i.e. whether bias features
     #   are activated or not).
+    #   (default: false)
+    #
+    # validate::
+    #   Boolean parameter which indicates if the
+    #   algorithm should validate data before training.
+    #   (default: true)
     #
     def self.train(rdd, options={})
       super
@@ -145,7 +152,8 @@ module Spark
           options[:initial_weights],
           options[:reg_param].to_f,
           options[:reg_type],
-          options[:intercept]
+          options[:intercept],
+          options[:validate])

       LogisticRegressionModel.new(weights, intercept)
     end
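A hedged usage sketch of the new `validate` option; the `LogisticRegressionWithSGD` and `LabeledPoint` names follow the gem's PySpark-style MLlib API and are assumptions here:

    require 'ruby-spark'
    Spark.start

    data = [
      Spark::Mllib::LabeledPoint.new(0.0, [0.0, 1.0]),
      Spark::Mllib::LabeledPoint.new(1.0, [1.0, 0.0])
    ]
    rdd = Spark.sc.parallelize(data)

    # validate: true is the new default; pass false to skip the
    # pre-training data check:
    model = Spark::Mllib::LogisticRegressionWithSGD.train(rdd, validate: false)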
data/lib/spark/mllib/classification/svm.rb
CHANGED
@@ -78,7 +78,8 @@ module Spark
       mini_batch_fraction: 1.0,
       initial_weights: nil,
       reg_type: 'l2',
-      intercept: false
+      intercept: false,
+      validate: true
     }

     # Train a support vector machine on the given data.
@@ -114,6 +115,12 @@ module Spark
     #   or not of the augmented representation for
     #   training data (i.e. whether bias features
     #   are activated or not).
+    #   (default: false)
+    #
+    # validateData::
+    #   Boolean parameter which indicates if the
+    #   algorithm should validate data before training.
+    #   (default: true)
     #
     def self.train(rdd, options={})
       super
@@ -125,7 +132,8 @@ module Spark
           options[:mini_batch_fraction].to_f,
           options[:initial_weights],
           options[:reg_type],
-          options[:intercept]
+          options[:intercept],
+          options[:validate])

       SVMModel.new(weights, intercept)
     end
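The same pair of options applies to SVM training; `SVMWithSGD` is the assumed class name, and `rdd` is a LabeledPoint RDD as in the sketch above:

    model = Spark::Mllib::SVMWithSGD.train(rdd, intercept: true, validate: true)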
data/lib/spark/mllib/clustering/kmeans.rb
CHANGED
@@ -107,10 +107,14 @@ module Spark
     # seed::
     #   Random seed value for cluster initialization.
     #
-    def self.train(rdd, k, max_iterations: 100, runs: 1, initialization_mode: 'k-means||', seed: nil)
+    # epsilon::
+    #   The distance threshold within which we've consider centers to have converged.
+    #
+    def self.train(rdd, k, max_iterations: 100, runs: 1, initialization_mode: 'k-means||', seed: nil,
+                   initialization_steps: 5, epsilon: 0.0001)
       # Call returns KMeansModel
       Spark.jb.call(RubyMLLibAPI.new, 'trainKMeansModel', rdd,
-                    k, max_iterations, runs, initialization_mode, Spark.jb.to_long(seed))
+                    k, max_iterations, runs, initialization_mode, Spark.jb.to_long(seed), initialization_steps, epsilon)
     end

   end
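A sketch of the two new keyword arguments, which the diff shows are forwarded straight to `trainKMeansModel`; the vector input format is an assumption:

    vectors = Spark.sc.parallelize([[0.0, 0.0], [1.0, 1.0], [9.0, 8.0], [8.0, 9.0]])
    model = Spark::Mllib::KMeans.train(vectors, 2,
                                       initialization_steps: 5, # k-means|| rounds
                                       epsilon: 0.0001)         # convergence threshold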
data/lib/spark/mllib/regression/lasso.rb
CHANGED
@@ -58,7 +58,9 @@ module Spark
       step: 1.0,
       reg_param: 0.01,
       mini_batch_fraction: 1.0,
-      initial_weights: nil
+      initial_weights: nil,
+      intercept: false,
+      validate: true
     }

     # Train a Lasso regression model on the given data.
@@ -82,6 +84,18 @@ module Spark
     # initial_weights::
     #   The initial weights (default: nil).
     #
+    # intercept::
+    #   Boolean parameter which indicates the use
+    #   or not of the augmented representation for
+    #   training data (i.e. whether bias features
+    #   are activated or not).
+    #   (default: false)
+    #
+    # validate::
+    #   Boolean parameter which indicates if the
+    #   algorithm should validate data before training.
+    #   (default: true)
+    #
     def self.train(rdd, options={})
       super

@@ -90,7 +104,9 @@ module Spark
           options[:step].to_f,
           options[:reg_param].to_f,
           options[:mini_batch_fraction].to_f,
-          options[:initial_weights]
+          options[:initial_weights],
+          options[:intercept],
+          options[:validate])

       LassoModel.new(weights, intercept)
     end
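As with the classifiers, both new options can be given at the call site; `LassoWithSGD` is the assumed class name:

    model = Spark::Mllib::LassoWithSGD.train(rdd, intercept: true, validate: true)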
data/lib/spark/mllib/regression/linear.rb
CHANGED
@@ -66,7 +66,8 @@ module Spark
       initial_weights: nil,
       reg_param: 0.0,
       reg_type: nil,
-      intercept: false
+      intercept: false,
+      validate: true
     }

     # Train a linear regression model on the given data.
@@ -102,7 +103,13 @@ module Spark
     #   Boolean parameter which indicates the use
     #   or not of the augmented representation for
     #   training data (i.e. whether bias features
-    #   are activated or not).
+    #   are activated or not).
+    #   (default: false)
+    #
+    # validate::
+    #   Boolean parameter which indicates if the
+    #   algorithm should validate data before training.
+    #   (default: true)
     #
     def self.train(rdd, options={})
       super
@@ -114,7 +121,8 @@ module Spark
           options[:initial_weights],
           options[:reg_param].to_f,
           options[:reg_type],
-          options[:intercept]
+          options[:intercept],
+          options[:validate])

       LinearRegressionModel.new(weights, intercept)
     end
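Linear regression gains the same switch; a one-line sketch with the assumed `LinearRegressionWithSGD` name:

    # Skip validation when the input is already known to be well-formed:
    model = Spark::Mllib::LinearRegressionWithSGD.train(rdd, validate: false)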
data/lib/spark/mllib/regression/ridge.rb
CHANGED
@@ -55,7 +55,9 @@ module Spark
       step: 1.0,
       reg_param: 0.01,
       mini_batch_fraction: 1.0,
-      initial_weights: nil
+      initial_weights: nil,
+      intercept: false,
+      validate: true
     }

     # Train a ridge regression model on the given data.
@@ -79,6 +81,18 @@ module Spark
     # initial_weights::
     #   The initial weights (default: nil).
     #
+    # intercept::
+    #   Boolean parameter which indicates the use
+    #   or not of the augmented representation for
+    #   training data (i.e. whether bias features
+    #   are activated or not).
+    #   (default: false)
+    #
+    # validate::
+    #   Boolean parameter which indicates if the
+    #   algorithm should validate data before training.
+    #   (default: true)
+    #
     def self.train(rdd, options={})
       super

@@ -87,7 +101,9 @@ module Spark
           options[:step].to_f,
           options[:reg_param].to_f,
           options[:mini_batch_fraction].to_f,
-          options[:initial_weights]
+          options[:initial_weights],
+          options[:intercept],
+          options[:validate])

       RidgeRegressionModel.new(weights, intercept)
     end
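Ridge regression follows suit; spelling the new defaults out explicitly (assumed `RidgeRegressionWithSGD` name):

    model = Spark::Mllib::RidgeRegressionWithSGD.train(rdd, intercept: false, validate: true)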
data/lib/spark/rdd.rb
CHANGED
@@ -39,6 +39,7 @@ module Spark

     result = %{#<#{self.class.name}:0x#{object_id}}
     result << %{ (#{comms})} unless comms.empty?
+    result << %{ (cached)} if cached?
     result << %{\n}
     result << %{ Serializer: "#{serializer}"\n}
     result << %{Deserializer: "#{deserializer}"}
@@ -166,8 +167,13 @@ module Spark

     # Assign a name to this RDD.
     #
-    def set_name(…)
-      jrdd.setName(…)
+    def set_name(value)
+      jrdd.setName(value)
+      value
+    end
+
+    def name=(value)
+      set_name(value)
     end

     def to_java
@@ -193,11 +199,14 @@ module Spark
     def collect(as_enum=false)
       file = Tempfile.new('collect', context.temp_dir)

+      context.set_call_site(caller.first)
       RubyRDD.writeRDDToFile(jrdd.rdd, file.path)

       collect_from_file(file, as_enum)
     rescue => e
       raise Spark::RDDError, e.message
+    ensure
+      context.clear_call_site
     end

     def collect_from_file(file, as_enum=false)
data/lib/spark/serializer.rb
CHANGED
data/lib/spark/version.rb
CHANGED
data/ruby-spark.gemspec
CHANGED
@@ -21,9 +21,13 @@ Gem::Specification.new do |spec|
   spec.require_paths = ['lib']

   if RUBY_PLATFORM =~ /java/
+    spec.platform = 'java'
+
     extensions = ['ext/ruby_java/extconf.rb']
   else
     extensions = ['ext/ruby_c/extconf.rb']
+
+    spec.add_dependency 'rjb'
   end

   spec.extensions = extensions
@@ -38,11 +42,6 @@ Gem::Specification.new do |spec|
   spec.add_dependency 'nio4r'
   spec.add_dependency 'distribution'

-  if RUBY_PLATFORM =~ /java/
-  else
-    spec.add_dependency 'rjb'
-  end
-
   spec.add_development_dependency 'bundler', '~> 1.6'
   spec.add_development_dependency 'rake'
 end
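With the dependency moved into the platform branch, an MRI install pulls in `rjb` while the published java-platform gem skips it. A hypothetical Gemfile illustration:

    # Gemfile: the platform-specific gem brings the right Java bridge along
    gem 'ruby-spark', '1.2.0' # depends on rjb on MRI; uses JRuby's bridge on java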