ruby-spark 1.1.0.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.travis.yml +15 -0
- data/CHANGELOG.md +8 -0
- data/README.md +184 -57
- data/TODO.md +3 -1
- data/ext/spark/build.sbt +5 -5
- data/ext/spark/src/main/scala/RubyWorker.scala +7 -16
- data/lib/spark.rb +69 -10
- data/lib/spark/accumulator.rb +8 -0
- data/lib/spark/broadcast.rb +7 -0
- data/lib/spark/build.rb +10 -10
- data/lib/spark/cli.rb +68 -76
- data/lib/spark/config.rb +13 -17
- data/lib/spark/context.rb +10 -7
- data/lib/spark/error.rb +4 -0
- data/lib/spark/helper/statistic.rb +5 -1
- data/lib/spark/java_bridge.rb +5 -3
- data/lib/spark/java_bridge/base.rb +15 -15
- data/lib/spark/java_bridge/jruby.rb +3 -1
- data/lib/spark/java_bridge/rjb.rb +2 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +10 -2
- data/lib/spark/mllib/classification/svm.rb +10 -2
- data/lib/spark/mllib/clustering/kmeans.rb +6 -2
- data/lib/spark/mllib/regression/lasso.rb +18 -2
- data/lib/spark/mllib/regression/linear.rb +11 -3
- data/lib/spark/mllib/regression/ridge.rb +18 -2
- data/lib/spark/rdd.rb +11 -2
- data/lib/spark/serializer.rb +1 -1
- data/lib/spark/serializer/auto_batched.rb +7 -0
- data/lib/spark/version.rb +1 -1
- data/ruby-spark.gemspec +4 -5
- data/spec/generator.rb +1 -1
- data/spec/lib/collect_spec.rb +10 -10
- data/spec/lib/config_spec.rb +10 -10
- data/spec/lib/context_spec.rb +116 -115
- data/spec/lib/ext_spec.rb +17 -17
- data/spec/lib/external_apps_spec.rb +1 -1
- data/spec/lib/filter_spec.rb +17 -17
- data/spec/lib/flat_map_spec.rb +22 -19
- data/spec/lib/group_spec.rb +22 -19
- data/spec/lib/helper_spec.rb +60 -12
- data/spec/lib/key_spec.rb +9 -8
- data/spec/lib/manipulation_spec.rb +15 -15
- data/spec/lib/map_partitions_spec.rb +6 -4
- data/spec/lib/map_spec.rb +22 -19
- data/spec/lib/reduce_by_key_spec.rb +19 -19
- data/spec/lib/reduce_spec.rb +22 -20
- data/spec/lib/sample_spec.rb +13 -12
- data/spec/lib/serializer_spec.rb +27 -0
- data/spec/lib/sort_spec.rb +16 -14
- data/spec/lib/statistic_spec.rb +4 -2
- data/spec/lib/whole_text_files_spec.rb +9 -8
- data/spec/spec_helper.rb +3 -3
- metadata +19 -18
data/lib/spark/context.rb
CHANGED
@@ -38,6 +38,12 @@ module Spark
|
|
38
38
|
set_call_site('Ruby') # description of stage
|
39
39
|
end
|
40
40
|
|
41
|
+
def inspect
|
42
|
+
result = %{#<#{self.class.name}:0x#{object_id}\n}
|
43
|
+
result << %{Tempdir: "#{temp_dir}">}
|
44
|
+
result
|
45
|
+
end
|
46
|
+
|
41
47
|
def stop
|
42
48
|
Spark::Accumulator::Server.stop
|
43
49
|
log_info('Ruby accumulator server was stopped')
|
@@ -108,14 +114,11 @@ module Spark
|
|
108
114
|
# Support function for API backtraces.
|
109
115
|
#
|
110
116
|
def set_call_site(site)
|
111
|
-
|
117
|
+
jcontext.setCallSite(site)
|
112
118
|
end
|
113
119
|
|
114
|
-
|
115
|
-
|
116
|
-
#
|
117
|
-
def get_call_site
|
118
|
-
jcontext.getCallSite
|
120
|
+
def clear_call_site
|
121
|
+
jcontext.clearCallSite
|
119
122
|
end
|
120
123
|
|
121
124
|
# Return a copy of this SparkContext's configuration. The configuration *cannot*
|
@@ -313,7 +316,7 @@ module Spark
|
|
313
316
|
alias_method :setLocalProperty, :set_local_property
|
314
317
|
alias_method :getLocalProperty, :get_local_property
|
315
318
|
alias_method :setCallSite, :set_call_site
|
316
|
-
alias_method :getCallSite, :get_call_site
|
319
|
+
alias_method :clearCallSite, :clear_call_site
|
317
320
|
alias_method :runJob, :run_job
|
318
321
|
alias_method :runJobWithCommand, :run_job_with_command
|
319
322
|
alias_method :addFile, :add_file
|
data/lib/spark/error.rb
CHANGED
@@ -77,9 +77,13 @@ module Spark
|
|
77
77
|
# == Example:
|
78
78
|
# data = [0,1,2,3,4,5,6,7,8,9,10]
|
79
79
|
# determine_bounds(data, 3)
|
80
|
-
# # => [
|
80
|
+
# # => [3, 7]
|
81
81
|
#
|
82
82
|
def determine_bounds(data, num_partitions)
|
83
|
+
if num_partitions > data.size
|
84
|
+
return data
|
85
|
+
end
|
86
|
+
|
83
87
|
bounds = []
|
84
88
|
count = data.size
|
85
89
|
(0...(num_partitions-1)).each do |index|
|
data/lib/spark/java_bridge.rb
CHANGED
@@ -41,31 +41,31 @@ module Spark
|
|
41
41
|
|
42
42
|
RUBY_TO_JAVA_SKIP = [Fixnum, Integer]
|
43
43
|
|
44
|
-
def initialize(spark_home)
|
45
|
-
@spark_home = spark_home
|
44
|
+
def initialize(target)
|
45
|
+
@target = target
|
46
46
|
end
|
47
47
|
|
48
48
|
# Import all important classes into Objects
|
49
|
-
def
|
50
|
-
return if @
|
49
|
+
def import_all
|
50
|
+
return if @imported
|
51
51
|
|
52
52
|
java_objects.each do |name, klass|
|
53
53
|
import(name, klass)
|
54
54
|
end
|
55
55
|
|
56
|
-
@
|
56
|
+
@imported = true
|
57
57
|
nil
|
58
58
|
end
|
59
59
|
|
60
60
|
# Import classes for testing
|
61
|
-
def
|
62
|
-
return if @
|
61
|
+
def import_all_test
|
62
|
+
return if @imported_test
|
63
63
|
|
64
64
|
java_test_objects.each do |name, klass|
|
65
65
|
import(name, klass)
|
66
66
|
end
|
67
67
|
|
68
|
-
@
|
68
|
+
@imported_test = true
|
69
69
|
nil
|
70
70
|
end
|
71
71
|
|
@@ -168,13 +168,9 @@ module Spark
|
|
168
168
|
private
|
169
169
|
|
170
170
|
def jars
|
171
|
-
result =
|
172
|
-
|
173
|
-
|
174
|
-
else
|
175
|
-
result << Dir.glob(File.join(@spark_home, '*.jar'))
|
176
|
-
end
|
177
|
-
result.flatten
|
171
|
+
result = Dir.glob(File.join(@target, '*.jar'))
|
172
|
+
result.flatten!
|
173
|
+
result
|
178
174
|
end
|
179
175
|
|
180
176
|
def objects_with_names(objects)
|
@@ -198,6 +194,10 @@ module Spark
|
|
198
194
|
objects_with_names(JAVA_TEST_OBJECTS)
|
199
195
|
end
|
200
196
|
|
197
|
+
def raise_missing_class(klass)
|
198
|
+
raise Spark::JavaBridgeError, "Class #{klass} is missing. Make sure that Spark and RubySpark is assembled."
|
199
|
+
end
|
200
|
+
|
201
201
|
end
|
202
202
|
end
|
203
203
|
end
|
@@ -97,7 +97,8 @@ module Spark
|
|
97
97
|
initial_weights: nil,
|
98
98
|
reg_param: 0.01,
|
99
99
|
reg_type: 'l2',
|
100
|
-
intercept: false
|
100
|
+
intercept: false,
|
101
|
+
validate: true
|
101
102
|
}
|
102
103
|
|
103
104
|
# Train a logistic regression model on the given data.
|
@@ -134,6 +135,12 @@ module Spark
|
|
134
135
|
# or not of the augmented representation for
|
135
136
|
# training data (i.e. whether bias features
|
136
137
|
# are activated or not).
|
138
|
+
# (default: false)
|
139
|
+
#
|
140
|
+
# validate::
|
141
|
+
# Boolean parameter which indicates if the
|
142
|
+
# algorithm should validate data before training.
|
143
|
+
# (default: true)
|
137
144
|
#
|
138
145
|
def self.train(rdd, options={})
|
139
146
|
super
|
@@ -145,7 +152,8 @@ module Spark
|
|
145
152
|
options[:initial_weights],
|
146
153
|
options[:reg_param].to_f,
|
147
154
|
options[:reg_type],
|
148
|
-
options[:intercept]
|
155
|
+
options[:intercept],
|
156
|
+
options[:validate])
|
149
157
|
|
150
158
|
LogisticRegressionModel.new(weights, intercept)
|
151
159
|
end
|
@@ -78,7 +78,8 @@ module Spark
|
|
78
78
|
mini_batch_fraction: 1.0,
|
79
79
|
initial_weights: nil,
|
80
80
|
reg_type: 'l2',
|
81
|
-
intercept: false
|
81
|
+
intercept: false,
|
82
|
+
validate: true
|
82
83
|
}
|
83
84
|
|
84
85
|
# Train a support vector machine on the given data.
|
@@ -114,6 +115,12 @@ module Spark
|
|
114
115
|
# or not of the augmented representation for
|
115
116
|
# training data (i.e. whether bias features
|
116
117
|
# are activated or not).
|
118
|
+
# (default: false)
|
119
|
+
#
|
120
|
+
# validateData::
|
121
|
+
# Boolean parameter which indicates if the
|
122
|
+
# algorithm should validate data before training.
|
123
|
+
# (default: true)
|
117
124
|
#
|
118
125
|
def self.train(rdd, options={})
|
119
126
|
super
|
@@ -125,7 +132,8 @@ module Spark
|
|
125
132
|
options[:mini_batch_fraction].to_f,
|
126
133
|
options[:initial_weights],
|
127
134
|
options[:reg_type],
|
128
|
-
options[:intercept]
|
135
|
+
options[:intercept],
|
136
|
+
options[:validate])
|
129
137
|
|
130
138
|
SVMModel.new(weights, intercept)
|
131
139
|
end
|
@@ -107,10 +107,14 @@ module Spark
|
|
107
107
|
# seed::
|
108
108
|
# Random seed value for cluster initialization.
|
109
109
|
#
|
110
|
-
|
110
|
+
# epsilon::
|
111
|
+
# The distance threshold within which we've consider centers to have converged.
|
112
|
+
#
|
113
|
+
def self.train(rdd, k, max_iterations: 100, runs: 1, initialization_mode: 'k-means||', seed: nil,
|
114
|
+
initialization_steps: 5, epsilon: 0.0001)
|
111
115
|
# Call returns KMeansModel
|
112
116
|
Spark.jb.call(RubyMLLibAPI.new, 'trainKMeansModel', rdd,
|
113
|
-
k, max_iterations, runs, initialization_mode, Spark.jb.to_long(seed))
|
117
|
+
k, max_iterations, runs, initialization_mode, Spark.jb.to_long(seed), initialization_steps, epsilon)
|
114
118
|
end
|
115
119
|
|
116
120
|
end
|
@@ -58,7 +58,9 @@ module Spark
|
|
58
58
|
step: 1.0,
|
59
59
|
reg_param: 0.01,
|
60
60
|
mini_batch_fraction: 1.0,
|
61
|
-
initial_weights: nil
|
61
|
+
initial_weights: nil,
|
62
|
+
intercept: false,
|
63
|
+
validate: true
|
62
64
|
}
|
63
65
|
|
64
66
|
# Train a Lasso regression model on the given data.
|
@@ -82,6 +84,18 @@ module Spark
|
|
82
84
|
# initial_weights::
|
83
85
|
# The initial weights (default: nil).
|
84
86
|
#
|
87
|
+
# intercept::
|
88
|
+
# Boolean parameter which indicates the use
|
89
|
+
# or not of the augmented representation for
|
90
|
+
# training data (i.e. whether bias features
|
91
|
+
# are activated or not).
|
92
|
+
# (default: false)
|
93
|
+
#
|
94
|
+
# validate::
|
95
|
+
# Boolean parameter which indicates if the
|
96
|
+
# algorithm should validate data before training.
|
97
|
+
# (default: true)
|
98
|
+
#
|
85
99
|
def self.train(rdd, options={})
|
86
100
|
super
|
87
101
|
|
@@ -90,7 +104,9 @@ module Spark
|
|
90
104
|
options[:step].to_f,
|
91
105
|
options[:reg_param].to_f,
|
92
106
|
options[:mini_batch_fraction].to_f,
|
93
|
-
options[:initial_weights]
|
107
|
+
options[:initial_weights],
|
108
|
+
options[:intercept],
|
109
|
+
options[:validate])
|
94
110
|
|
95
111
|
LassoModel.new(weights, intercept)
|
96
112
|
end
|
@@ -66,7 +66,8 @@ module Spark
|
|
66
66
|
initial_weights: nil,
|
67
67
|
reg_param: 0.0,
|
68
68
|
reg_type: nil,
|
69
|
-
intercept: false
|
69
|
+
intercept: false,
|
70
|
+
validate: true
|
70
71
|
}
|
71
72
|
|
72
73
|
# Train a linear regression model on the given data.
|
@@ -102,7 +103,13 @@ module Spark
|
|
102
103
|
# Boolean parameter which indicates the use
|
103
104
|
# or not of the augmented representation for
|
104
105
|
# training data (i.e. whether bias features
|
105
|
-
# are activated or not).
|
106
|
+
# are activated or not).
|
107
|
+
# (default: false)
|
108
|
+
#
|
109
|
+
# validate::
|
110
|
+
# Boolean parameter which indicates if the
|
111
|
+
# algorithm should validate data before training.
|
112
|
+
# (default: true)
|
106
113
|
#
|
107
114
|
def self.train(rdd, options={})
|
108
115
|
super
|
@@ -114,7 +121,8 @@ module Spark
|
|
114
121
|
options[:initial_weights],
|
115
122
|
options[:reg_param].to_f,
|
116
123
|
options[:reg_type],
|
117
|
-
options[:intercept]
|
124
|
+
options[:intercept],
|
125
|
+
options[:validate])
|
118
126
|
|
119
127
|
LinearRegressionModel.new(weights, intercept)
|
120
128
|
end
|
@@ -55,7 +55,9 @@ module Spark
|
|
55
55
|
step: 1.0,
|
56
56
|
reg_param: 0.01,
|
57
57
|
mini_batch_fraction: 1.0,
|
58
|
-
initial_weights: nil
|
58
|
+
initial_weights: nil,
|
59
|
+
intercept: false,
|
60
|
+
validate: true
|
59
61
|
}
|
60
62
|
|
61
63
|
# Train a ridge regression model on the given data.
|
@@ -79,6 +81,18 @@ module Spark
|
|
79
81
|
# initial_weights::
|
80
82
|
# The initial weights (default: nil).
|
81
83
|
#
|
84
|
+
# intercept::
|
85
|
+
# Boolean parameter which indicates the use
|
86
|
+
# or not of the augmented representation for
|
87
|
+
# training data (i.e. whether bias features
|
88
|
+
# are activated or not).
|
89
|
+
# (default: false)
|
90
|
+
#
|
91
|
+
# validate::
|
92
|
+
# Boolean parameter which indicates if the
|
93
|
+
# algorithm should validate data before training.
|
94
|
+
# (default: true)
|
95
|
+
#
|
82
96
|
def self.train(rdd, options={})
|
83
97
|
super
|
84
98
|
|
@@ -87,7 +101,9 @@ module Spark
|
|
87
101
|
options[:step].to_f,
|
88
102
|
options[:reg_param].to_f,
|
89
103
|
options[:mini_batch_fraction].to_f,
|
90
|
-
options[:initial_weights]
|
104
|
+
options[:initial_weights],
|
105
|
+
options[:intercept],
|
106
|
+
options[:validate])
|
91
107
|
|
92
108
|
RidgeRegressionModel.new(weights, intercept)
|
93
109
|
end
|
data/lib/spark/rdd.rb
CHANGED
@@ -39,6 +39,7 @@ module Spark
|
|
39
39
|
|
40
40
|
result = %{#<#{self.class.name}:0x#{object_id}}
|
41
41
|
result << %{ (#{comms})} unless comms.empty?
|
42
|
+
result << %{ (cached)} if cached?
|
42
43
|
result << %{\n}
|
43
44
|
result << %{ Serializer: "#{serializer}"\n}
|
44
45
|
result << %{Deserializer: "#{deserializer}"}
|
@@ -166,8 +167,13 @@ module Spark
|
|
166
167
|
|
167
168
|
# Assign a name to this RDD.
|
168
169
|
#
|
169
|
-
def set_name(
|
170
|
-
jrdd.setName(
|
170
|
+
def set_name(value)
|
171
|
+
jrdd.setName(value)
|
172
|
+
value
|
173
|
+
end
|
174
|
+
|
175
|
+
def name=(value)
|
176
|
+
set_name(value)
|
171
177
|
end
|
172
178
|
|
173
179
|
def to_java
|
@@ -193,11 +199,14 @@ module Spark
|
|
193
199
|
def collect(as_enum=false)
|
194
200
|
file = Tempfile.new('collect', context.temp_dir)
|
195
201
|
|
202
|
+
context.set_call_site(caller.first)
|
196
203
|
RubyRDD.writeRDDToFile(jrdd.rdd, file.path)
|
197
204
|
|
198
205
|
collect_from_file(file, as_enum)
|
199
206
|
rescue => e
|
200
207
|
raise Spark::RDDError, e.message
|
208
|
+
ensure
|
209
|
+
context.clear_call_site
|
201
210
|
end
|
202
211
|
|
203
212
|
def collect_from_file(file, as_enum=false)
|
data/lib/spark/serializer.rb
CHANGED
data/lib/spark/version.rb
CHANGED
data/ruby-spark.gemspec
CHANGED
@@ -21,9 +21,13 @@ Gem::Specification.new do |spec|
|
|
21
21
|
spec.require_paths = ['lib']
|
22
22
|
|
23
23
|
if RUBY_PLATFORM =~ /java/
|
24
|
+
spec.platform = 'java'
|
25
|
+
|
24
26
|
extensions = ['ext/ruby_java/extconf.rb']
|
25
27
|
else
|
26
28
|
extensions = ['ext/ruby_c/extconf.rb']
|
29
|
+
|
30
|
+
spec.add_dependency 'rjb'
|
27
31
|
end
|
28
32
|
|
29
33
|
spec.extensions = extensions
|
@@ -38,11 +42,6 @@ Gem::Specification.new do |spec|
|
|
38
42
|
spec.add_dependency 'nio4r'
|
39
43
|
spec.add_dependency 'distribution'
|
40
44
|
|
41
|
-
if RUBY_PLATFORM =~ /java/
|
42
|
-
else
|
43
|
-
spec.add_dependency 'rjb'
|
44
|
-
end
|
45
|
-
|
46
45
|
spec.add_development_dependency 'bundler', '~> 1.6'
|
47
46
|
spec.add_development_dependency 'rake'
|
48
47
|
end
|