ruby-spark 1.1.0.1-java
- checksums.yaml +7 -0
- data/.gitignore +37 -0
- data/Gemfile +47 -0
- data/Guardfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +252 -0
- data/Rakefile +35 -0
- data/TODO.md +6 -0
- data/benchmark/aggregate.rb +33 -0
- data/benchmark/bisect.rb +88 -0
- data/benchmark/comparison/prepare.sh +18 -0
- data/benchmark/comparison/python.py +156 -0
- data/benchmark/comparison/r.r +69 -0
- data/benchmark/comparison/ruby.rb +167 -0
- data/benchmark/comparison/run-all.sh +160 -0
- data/benchmark/comparison/scala.scala +181 -0
- data/benchmark/custom_marshal.rb +94 -0
- data/benchmark/digest.rb +150 -0
- data/benchmark/enumerator.rb +88 -0
- data/benchmark/serializer.rb +82 -0
- data/benchmark/sort.rb +43 -0
- data/benchmark/sort2.rb +164 -0
- data/benchmark/take.rb +28 -0
- data/bin/ruby-spark +8 -0
- data/example/pi.rb +28 -0
- data/example/website_search.rb +83 -0
- data/ext/ruby_c/extconf.rb +3 -0
- data/ext/ruby_c/murmur.c +158 -0
- data/ext/ruby_c/murmur.h +9 -0
- data/ext/ruby_c/ruby-spark.c +18 -0
- data/ext/ruby_java/Digest.java +36 -0
- data/ext/ruby_java/Murmur2.java +98 -0
- data/ext/ruby_java/RubySparkExtService.java +28 -0
- data/ext/ruby_java/extconf.rb +3 -0
- data/ext/spark/build.sbt +73 -0
- data/ext/spark/project/plugins.sbt +9 -0
- data/ext/spark/sbt/sbt +34 -0
- data/ext/spark/src/main/scala/Exec.scala +91 -0
- data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
- data/ext/spark/src/main/scala/Marshal.scala +52 -0
- data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
- data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
- data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
- data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
- data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
- data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
- data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
- data/ext/spark/src/main/scala/RubyPage.scala +34 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +392 -0
- data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
- data/ext/spark/src/main/scala/RubyTab.scala +11 -0
- data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
- data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
- data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
- data/lib/ruby-spark.rb +1 -0
- data/lib/spark.rb +198 -0
- data/lib/spark/accumulator.rb +260 -0
- data/lib/spark/broadcast.rb +98 -0
- data/lib/spark/build.rb +43 -0
- data/lib/spark/cli.rb +169 -0
- data/lib/spark/command.rb +86 -0
- data/lib/spark/command/base.rb +158 -0
- data/lib/spark/command/basic.rb +345 -0
- data/lib/spark/command/pair.rb +124 -0
- data/lib/spark/command/sort.rb +51 -0
- data/lib/spark/command/statistic.rb +144 -0
- data/lib/spark/command_builder.rb +141 -0
- data/lib/spark/command_validator.rb +34 -0
- data/lib/spark/config.rb +238 -0
- data/lib/spark/constant.rb +14 -0
- data/lib/spark/context.rb +322 -0
- data/lib/spark/error.rb +50 -0
- data/lib/spark/ext/hash.rb +41 -0
- data/lib/spark/ext/integer.rb +25 -0
- data/lib/spark/ext/io.rb +67 -0
- data/lib/spark/ext/ip_socket.rb +29 -0
- data/lib/spark/ext/module.rb +58 -0
- data/lib/spark/ext/object.rb +24 -0
- data/lib/spark/ext/string.rb +24 -0
- data/lib/spark/helper.rb +10 -0
- data/lib/spark/helper/logger.rb +40 -0
- data/lib/spark/helper/parser.rb +85 -0
- data/lib/spark/helper/serialize.rb +71 -0
- data/lib/spark/helper/statistic.rb +93 -0
- data/lib/spark/helper/system.rb +42 -0
- data/lib/spark/java_bridge.rb +19 -0
- data/lib/spark/java_bridge/base.rb +203 -0
- data/lib/spark/java_bridge/jruby.rb +23 -0
- data/lib/spark/java_bridge/rjb.rb +41 -0
- data/lib/spark/logger.rb +76 -0
- data/lib/spark/mllib.rb +100 -0
- data/lib/spark/mllib/classification/common.rb +31 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
- data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
- data/lib/spark/mllib/classification/svm.rb +135 -0
- data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
- data/lib/spark/mllib/clustering/kmeans.rb +118 -0
- data/lib/spark/mllib/matrix.rb +120 -0
- data/lib/spark/mllib/regression/common.rb +73 -0
- data/lib/spark/mllib/regression/labeled_point.rb +41 -0
- data/lib/spark/mllib/regression/lasso.rb +100 -0
- data/lib/spark/mllib/regression/linear.rb +124 -0
- data/lib/spark/mllib/regression/ridge.rb +97 -0
- data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
- data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
- data/lib/spark/mllib/stat/distribution.rb +12 -0
- data/lib/spark/mllib/vector.rb +185 -0
- data/lib/spark/rdd.rb +1377 -0
- data/lib/spark/sampler.rb +92 -0
- data/lib/spark/serializer.rb +79 -0
- data/lib/spark/serializer/auto_batched.rb +59 -0
- data/lib/spark/serializer/base.rb +63 -0
- data/lib/spark/serializer/batched.rb +84 -0
- data/lib/spark/serializer/cartesian.rb +13 -0
- data/lib/spark/serializer/compressed.rb +27 -0
- data/lib/spark/serializer/marshal.rb +17 -0
- data/lib/spark/serializer/message_pack.rb +23 -0
- data/lib/spark/serializer/oj.rb +23 -0
- data/lib/spark/serializer/pair.rb +41 -0
- data/lib/spark/serializer/text.rb +25 -0
- data/lib/spark/sort.rb +189 -0
- data/lib/spark/stat_counter.rb +125 -0
- data/lib/spark/storage_level.rb +39 -0
- data/lib/spark/version.rb +3 -0
- data/lib/spark/worker/master.rb +144 -0
- data/lib/spark/worker/spark_files.rb +15 -0
- data/lib/spark/worker/worker.rb +200 -0
- data/ruby-spark.gemspec +47 -0
- data/spec/generator.rb +37 -0
- data/spec/inputs/lorem_300.txt +316 -0
- data/spec/inputs/numbers/1.txt +50 -0
- data/spec/inputs/numbers/10.txt +50 -0
- data/spec/inputs/numbers/11.txt +50 -0
- data/spec/inputs/numbers/12.txt +50 -0
- data/spec/inputs/numbers/13.txt +50 -0
- data/spec/inputs/numbers/14.txt +50 -0
- data/spec/inputs/numbers/15.txt +50 -0
- data/spec/inputs/numbers/16.txt +50 -0
- data/spec/inputs/numbers/17.txt +50 -0
- data/spec/inputs/numbers/18.txt +50 -0
- data/spec/inputs/numbers/19.txt +50 -0
- data/spec/inputs/numbers/2.txt +50 -0
- data/spec/inputs/numbers/20.txt +50 -0
- data/spec/inputs/numbers/3.txt +50 -0
- data/spec/inputs/numbers/4.txt +50 -0
- data/spec/inputs/numbers/5.txt +50 -0
- data/spec/inputs/numbers/6.txt +50 -0
- data/spec/inputs/numbers/7.txt +50 -0
- data/spec/inputs/numbers/8.txt +50 -0
- data/spec/inputs/numbers/9.txt +50 -0
- data/spec/inputs/numbers_0_100.txt +101 -0
- data/spec/inputs/numbers_1_100.txt +100 -0
- data/spec/lib/collect_spec.rb +42 -0
- data/spec/lib/command_spec.rb +68 -0
- data/spec/lib/config_spec.rb +64 -0
- data/spec/lib/context_spec.rb +165 -0
- data/spec/lib/ext_spec.rb +72 -0
- data/spec/lib/external_apps_spec.rb +45 -0
- data/spec/lib/filter_spec.rb +80 -0
- data/spec/lib/flat_map_spec.rb +100 -0
- data/spec/lib/group_spec.rb +109 -0
- data/spec/lib/helper_spec.rb +19 -0
- data/spec/lib/key_spec.rb +41 -0
- data/spec/lib/manipulation_spec.rb +122 -0
- data/spec/lib/map_partitions_spec.rb +87 -0
- data/spec/lib/map_spec.rb +91 -0
- data/spec/lib/mllib/classification_spec.rb +54 -0
- data/spec/lib/mllib/clustering_spec.rb +35 -0
- data/spec/lib/mllib/matrix_spec.rb +32 -0
- data/spec/lib/mllib/regression_spec.rb +116 -0
- data/spec/lib/mllib/vector_spec.rb +77 -0
- data/spec/lib/reduce_by_key_spec.rb +118 -0
- data/spec/lib/reduce_spec.rb +131 -0
- data/spec/lib/sample_spec.rb +46 -0
- data/spec/lib/serializer_spec.rb +88 -0
- data/spec/lib/sort_spec.rb +58 -0
- data/spec/lib/statistic_spec.rb +170 -0
- data/spec/lib/whole_text_files_spec.rb +33 -0
- data/spec/spec_helper.rb +38 -0
- metadata +389 -0

data/lib/spark/helper/serialize.rb
@@ -0,0 +1,71 @@
+module Spark
+  module Helper
+    module Serialize
+
+      DIRECTIVE_INTEGER_BIG_ENDIAN = 'l>'
+      DIRECTIVE_INTEGERS_BIG_ENDIAN = 'l>*'
+      DIRECTIVE_LONG_BIG_ENDIAN = 'q>'
+      DIRECTIVE_LONGS_BIG_ENDIAN = 'q>*'
+      DIRECTIVE_DOUBLE_BIG_ENDIAN = 'G'
+      DIRECTIVE_DOUBLES_BIG_ENDIAN = 'G*'
+      DIRECTIVE_UNSIGNED_CHARS = 'C*'
+      DIRECTIVE_CHARS = 'c*'
+
+      # Packing
+
+      def pack_int(data)
+        [data].pack(DIRECTIVE_INTEGER_BIG_ENDIAN)
+      end
+
+      def pack_long(data)
+        [data].pack(DIRECTIVE_LONG_BIG_ENDIAN)
+      end
+
+      def pack_double(data)
+        [data].pack(DIRECTIVE_DOUBLE_BIG_ENDIAN)
+      end
+
+      def pack_unsigned_chars(data)
+        data.pack(DIRECTIVE_UNSIGNED_CHARS)
+      end
+
+      def pack_ints(data)
+        __check_array(data)
+        data.pack(DIRECTIVE_INTEGERS_BIG_ENDIAN)
+      end
+
+      def pack_longs(data)
+        __check_array(data)
+        data.pack(DIRECTIVE_LONGS_BIG_ENDIAN)
+      end
+
+      def pack_doubles(data)
+        __check_array(data)
+        data.pack(DIRECTIVE_DOUBLES_BIG_ENDIAN)
+      end
+
+      # Unpacking
+
+      def unpack_int(data)
+        data.unpack(DIRECTIVE_INTEGER_BIG_ENDIAN)[0]
+      end
+
+      def unpack_long(data)
+        data.unpack(DIRECTIVE_LONG_BIG_ENDIAN)[0]
+      end
+
+      def unpack_chars(data)
+        data.unpack(DIRECTIVE_CHARS)
+      end
+
+      private
+
+      def __check_array(data)
+        unless data.is_a?(Array)
+          raise ArgumentError, 'Data must be an Array.'
+        end
+      end
+
+    end
+  end
+end
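
The directives above are Ruby pack/unpack format strings: 'l>', 'q>' and 'G' are big-endian (network-order) 32-bit, 64-bit and double encodings, presumably so the JVM side can read the stream with its default big-endian framing. A minimal round-trip sketch, assuming the gem is on the load path (the Demo class is illustrative):

  require 'spark/helper/serialize'

  class Demo
    include Spark::Helper::Serialize
  end

  demo = Demo.new
  bytes = demo.pack_int(42)        # => "\x00\x00\x00*" (4 bytes, big-endian)
  demo.unpack_int(bytes)           # => 42
  demo.pack_doubles([1.0, 2.5])    # => 16 bytes ('G*')
  demo.pack_doubles(2.5) rescue $! # => ArgumentError: Data must be an Array.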

data/lib/spark/helper/statistic.rb
@@ -0,0 +1,93 @@
+module Spark
+  module Helper
+    module Statistic
+
+      # Returns a sampling rate that guarantees a sample of size >= lower_bound 99.99% of the time.
+      #
+      # == How the sampling rate is determined:
+      # Let p = num / total, where num is the sample size and total is the total number of
+      # datapoints in the RDD. We're trying to compute q > p such that
+      #   * when sampling with replacement, we're drawing each datapoint with prob_i ~ Pois(q),
+      #     where we want to guarantee Pr[s < num] < 0.0001 for s = sum(prob_i for i from 0 to total),
+      #     i.e. the failure rate of not having a sufficiently large sample is < 0.0001.
+      #     Setting q = p + 5 * sqrt(p/total) is sufficient to guarantee a 0.9999 success rate for
+      #     num > 12, but we need a slightly larger q (9 empirically determined).
+      #   * when sampling without replacement, we're drawing each datapoint with prob_i
+      #     ~ Binomial(total, fraction) and our choice of q guarantees 1-delta, or 0.9999 success
+      #     rate, where success rate is defined the same as in sampling with replacement.
+      #
+      def compute_fraction(lower_bound, total, with_replacement)
+        lower_bound = lower_bound.to_f
+
+        if with_replacement
+          upper_poisson_bound(lower_bound) / total
+        else
+          fraction = lower_bound / total
+          upper_binomial_bound(0.00001, total, fraction)
+        end
+      end
+
+      def upper_poisson_bound(bound)
+        num_std = if bound < 6
+                    12
+                  elsif bound < 16
+                    9
+                  else
+                    6
+                  end.to_f
+
+        [bound + num_std * Math.sqrt(bound), 1e-10].max
+      end
+
+      def upper_binomial_bound(delta, total, fraction)
+        gamma = -Math.log(delta) / total
+        [1, fraction + gamma + Math.sqrt(gamma*gamma + 2*gamma*fraction)].min
+      end
+
+      # Bisect right
+      #
+      # == Examples:
+      #   data = [1,5,6,8,96,120,133]
+      #
+      #   bisect_right(data, 0)   # => 0
+      #   bisect_right(data, 1)   # => 1
+      #   bisect_right(data, 5)   # => 2
+      #   bisect_right(data, 9)   # => 4
+      #   bisect_right(data, 150) # => 7
+      #
+      def bisect_right(data, value, low=0, high=data.size)
+        if low < 0
+          raise ArgumentError, 'Low must be >= 0.'
+        end
+
+        while low < high
+          mid = (low + high) / 2
+          if value < data[mid]
+            high = mid
+          else
+            low = mid + 1
+          end
+        end
+
+        low
+      end
+
+      # Determine bounds of partitioning
+      #
+      # == Example:
+      #   data = [0,1,2,3,4,5,6,7,8,9,10]
+      #   determine_bounds(data, 3)
+      #   # => [3, 7]
+      #
+      def determine_bounds(data, num_partitions)
+        bounds = []
+        count = data.size
+        (0...(num_partitions-1)).each do |index|
+          bounds << data[count * (index+1) / num_partitions]
+        end
+        bounds
+      end
+
+    end
+  end
+end
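
To make the bounds concrete, a short sketch with the helper mixed in (the Stats class is illustrative, outputs rounded):

  class Stats
    include Spark::Helper::Statistic
  end

  s = Stats.new

  # Rate needed to sample >= 100 of 1000 items with ~0.9999 success:
  s.compute_fraction(100, 1000, true)   # => 0.16  ((100 + 6 * sqrt(100)) / 1000)
  s.compute_fraction(100, 1000, false)  # => ~0.161 (binomial bound, delta = 1e-5)

  # bisect_right: index of the first element greater than the value
  s.bisect_right([1, 5, 6, 8, 96, 120, 133], 9)   # => 4

  # determine_bounds: num_partitions - 1 split keys taken from sorted data
  s.determine_bounds((0..10).to_a, 3)             # => [3, 7]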

data/lib/spark/helper/system.rb
@@ -0,0 +1,42 @@
+module Spark
+  module Helper
+    module System
+
+      def self.included(base)
+        base.send :extend, Methods
+        base.send :include, Methods
+      end
+
+      module Methods
+        def windows?
+          RbConfig::CONFIG['host_os'] =~ /mswin|mingw/
+        end
+
+        def mri?
+          RbConfig::CONFIG['ruby_install_name'] == 'ruby'
+        end
+
+        def jruby?
+          RbConfig::CONFIG['ruby_install_name'] == 'jruby'
+        end
+
+        def pry?
+          !!Thread.current[:__pry__]
+        end
+
+        # Memory usage in kb
+        def memory_usage
+          if jruby?
+            runtime = java.lang.Runtime.getRuntime
+            (runtime.totalMemory - runtime.freeMemory) >> 10
+          elsif windows?
+            # not yet
+          else
+            `ps -o rss= -p #{Process.pid}`.to_i
+          end
+        end
+      end # Methods
+
+    end # System
+  end # Helper
+end # Spark
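
Because the included hook both extends and includes Methods, every predicate is available on the including class and on its instances alike. A small sketch (the Env class is illustrative):

  class Env
    include Spark::Helper::System
  end

  Env.jruby?            # class-level (via base.send :extend, Methods)
  Env.new.jruby?        # instance-level (via base.send :include, Methods)
  Env.new.memory_usage  # => resident memory in kB (JVM heap usage under JRuby)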

data/lib/spark/java_bridge.rb
@@ -0,0 +1,19 @@
+module Spark
+  module JavaBridge
+
+    autoload :Base,  'spark/java_bridge/base'
+    autoload :JRuby, 'spark/java_bridge/jruby'
+    autoload :RJB,   'spark/java_bridge/rjb'
+
+    include Spark::Helper::System
+
+    def self.get
+      if jruby?
+        JRuby
+      else
+        RJB
+      end
+    end
+
+  end
+end
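
Spark::JavaBridge.get only selects the adapter class for the current runtime; the caller still instantiates and loads it. A sketch, assuming the argument points at the assembled jar or a directory of jars (the path is illustrative):

  bridge = Spark::JavaBridge.get.new('/path/to/spark/jars')  # JRuby adapter under JRuby, RJB on MRI
  bridge.load  # imports JAVA_OBJECTS (defined in base.rb below) as top-level constants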

data/lib/spark/java_bridge/base.rb
@@ -0,0 +1,203 @@
+##
+# Spark::JavaBridge::Base
+#
+# Parent for all adapters (Ruby <-> Java)
+#
+module Spark
+  module JavaBridge
+    class Base
+
+      include Spark::Helper::System
+
+      JAVA_OBJECTS = [
+        'java.util.ArrayList',
+        'org.apache.spark.SparkConf',
+        'org.apache.spark.api.java.JavaSparkContext',
+        'org.apache.spark.api.ruby.RubyRDD',
+        'org.apache.spark.api.ruby.RubyUtils',
+        'org.apache.spark.api.ruby.RubyWorker',
+        'org.apache.spark.api.ruby.PairwiseRDD',
+        'org.apache.spark.api.ruby.RubyAccumulatorParam',
+        'org.apache.spark.api.ruby.RubySerializer',
+        'org.apache.spark.api.python.PythonRDD',
+        'org.apache.spark.api.python.PythonPartitioner',
+        'org.apache.spark.ui.ruby.RubyTab',
+        'org.apache.spark.mllib.api.ruby.RubyMLLibAPI',
+        'scala.collection.mutable.HashMap',
+        :JInteger      => 'java.lang.Integer',
+        :JLong         => 'java.lang.Long',
+        :JLogger       => 'org.apache.log4j.Logger',
+        :JLevel        => 'org.apache.log4j.Level',
+        :JPriority     => 'org.apache.log4j.Priority',
+        :JUtils        => 'org.apache.spark.util.Utils',
+        :JStorageLevel => 'org.apache.spark.storage.StorageLevel',
+        :JDenseVector  => 'org.apache.spark.mllib.linalg.DenseVector',
+        :JDenseMatrix  => 'org.apache.spark.mllib.linalg.DenseMatrix'
+      ]
+
+      JAVA_TEST_OBJECTS = [
+        'org.apache.spark.mllib.api.ruby.RubyMLLibUtilAPI'
+      ]
+
+      RUBY_TO_JAVA_SKIP = [Fixnum, Integer]
+
+      def initialize(spark_home)
+        @spark_home = spark_home
+      end
+
+      # Import all important classes into the Object namespace
+      def load
+        return if @loaded
+
+        java_objects.each do |name, klass|
+          import(name, klass)
+        end
+
+        @loaded = true
+        nil
+      end
+
+      # Import classes for testing
+      def load_test
+        return if @loaded_test
+
+        java_test_objects.each do |name, klass|
+          import(name, klass)
+        end
+
+        @loaded_test = true
+        nil
+      end
+
+      # Call a method on a Java object
+      def call(klass, method, *args)
+        # To Java
+        args.map!{|item| to_java(item)}
+
+        # Call Java
+        result = klass.__send__(method, *args)
+
+        # To Ruby
+        to_ruby(result)
+      end
+
+      def to_java_array_list(array)
+        array_list = ArrayList.new
+        array.each do |item|
+          array_list.add(to_java(item))
+        end
+        array_list
+      end
+
+      def to_long(number)
+        return nil if number.nil?
+        JLong.new(number)
+      end
+
+      def to_java(object)
+        if RUBY_TO_JAVA_SKIP.include?(object.class)
+          # Some objects are converted automatically; skipping them prevents errors.
+          # For example: JRuby stores integers as longs, so 1.to_java is already a Long.
+          object
+        elsif object.respond_to?(:to_java)
+          object.to_java
+        elsif object.is_a?(Array)
+          to_java_array_list(object)
+        else
+          object
+        end
+      end
+
+      # Array problem:
+      #   Rjb:   object.toArray -> Array
+      #   JRuby: object.toArray -> java.lang.Object
+      #
+      def to_ruby(object)
+        if java_object?(object)
+          class_name = object.getClass.getSimpleName
+          case class_name
+          when 'ArraySeq'
+            result = []
+            iterator = object.iterator
+            while iterator.hasNext
+              result << to_ruby(iterator.next)
+            end
+            result
+          when 'Map2', 'Map3', 'Map4', 'HashTrieMap'
+            Hash[
+              object.toSeq.array.to_a.map!{|item| [item._1, item._2]}
+            ]
+          when 'SeqWrapper';   object.toArray.to_a.map!{|item| to_ruby(item)}
+          when 'ofRef';        object.array.to_a.map!{|item| to_ruby(item)} # WrappedArray$ofRef
+          when 'LabeledPoint'; Spark::Mllib::LabeledPoint.from_java(object)
+          when 'DenseVector';  Spark::Mllib::DenseVector.from_java(object)
+          when 'KMeansModel';  Spark::Mllib::KMeansModel.from_java(object)
+          when 'DenseMatrix';  Spark::Mllib::DenseMatrix.from_java(object)
+          else
+            # Some RDD
+            if class_name != 'JavaRDD' && class_name.end_with?('RDD')
+              object = object.toJavaRDD
+              class_name = 'JavaRDD'
+            end
+
+            # JavaRDD
+            if class_name == 'JavaRDD'
+              jrdd = RubyRDD.toRuby(object)
+
+              serializer   = Spark::Serializer.build { __batched__(__marshal__) }
+              deserializer = Spark::Serializer.build { __batched__(__marshal__, 2) }
+
+              return Spark::RDD.new(jrdd, Spark.sc, serializer, deserializer)
+            end
+
+            # Unknown
+            Spark.logger.warn("Java object '#{object.getClass.name}' was not converted.")
+            object
+          end
+
+        else
+          # Already a Ruby object
+          object
+        end
+      end
+
+      alias_method :java_to_ruby, :to_ruby
+      alias_method :ruby_to_java, :to_java
+
+      private
+
+      def jars
+        result = []
+        if File.file?(@spark_home)
+          result << @spark_home
+        else
+          result << Dir.glob(File.join(@spark_home, '*.jar'))
+        end
+        result.flatten
+      end
+
+      def objects_with_names(objects)
+        hash = {}
+        objects.each do |object|
+          if object.is_a?(Hash)
+            hash.merge!(object)
+          else
+            key = object.split('.').last.to_sym
+            hash[key] = object
+          end
+        end
+        hash
+      end
+
+      def java_objects
+        objects_with_names(JAVA_OBJECTS)
+      end
+
+      def java_test_objects
+        objects_with_names(JAVA_TEST_OBJECTS)
+      end
+
+    end
+  end
+end
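
Once a bridge is loaded, the conversion helpers above can be exercised directly. A hedged sketch (JRuby and the jar path are assumptions):

  bridge = Spark::JavaBridge::JRuby.new('/path/to/spark/jars')
  bridge.load

  bridge.to_java(1)          # => 1 (integers are skipped; JRuby already boxes them as java.lang.Long)
  bridge.to_java([1, 2, 3])  # => java.util.ArrayList, built via to_java_array_list
  bridge.to_long(nil)        # => nil (nil-safe java.lang.Long wrapper)

call wraps this pair for method dispatch: arguments go out through to_java, the result comes back through to_ruby.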

data/lib/spark/java_bridge/jruby.rb
@@ -0,0 +1,23 @@
+require 'java'
+
+module Spark
+  module JavaBridge
+    class JRuby < Base
+
+      def initialize(*args)
+        super
+        jars.each {|jar| require jar}
+      end
+
+      def import(name, klass)
+        klass = "Java::#{klass}"
+        Object.const_set(name, eval(klass)) rescue nil
+      end
+
+      def java_object?(object)
+        object.is_a?(JavaProxy)
+      end
+
+    end
+  end
+end
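
Under JRuby, eval("Java::java.lang.Long") resolves the package path through JRuby's Java integration, and the rescue nil means a class missing from the loaded jars simply skips its constant rather than aborting the load. An effect sketch (the path is illustrative):

  bridge = Spark::JavaBridge::JRuby.new('/path/to/spark/jars')
  bridge.import(:JLong, 'java.lang.Long')
  JLong.new(42)                          # => a java.lang.Long, as used by Base#to_long
  bridge.import(:Nope, 'no.such.Klass')  # NameError swallowed by rescue nil; Nope stays undefined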