ruby-spark 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +185 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +7 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/custom_marshal.rb +94 -0
  12. data/benchmark/digest.rb +150 -0
  13. data/benchmark/enumerator.rb +88 -0
  14. data/benchmark/performance/prepare.sh +18 -0
  15. data/benchmark/performance/python.py +156 -0
  16. data/benchmark/performance/r.r +69 -0
  17. data/benchmark/performance/ruby.rb +167 -0
  18. data/benchmark/performance/run-all.sh +160 -0
  19. data/benchmark/performance/scala.scala +181 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/ext/ruby_c/extconf.rb +3 -0
  27. data/ext/ruby_c/murmur.c +158 -0
  28. data/ext/ruby_c/murmur.h +9 -0
  29. data/ext/ruby_c/ruby-spark.c +18 -0
  30. data/ext/ruby_java/Digest.java +36 -0
  31. data/ext/ruby_java/Murmur2.java +98 -0
  32. data/ext/ruby_java/RubySparkExtService.java +28 -0
  33. data/ext/ruby_java/extconf.rb +3 -0
  34. data/ext/spark/build.sbt +73 -0
  35. data/ext/spark/project/plugins.sbt +9 -0
  36. data/ext/spark/sbt/sbt +34 -0
  37. data/ext/spark/src/main/scala/Exec.scala +91 -0
  38. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  39. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  40. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  41. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  42. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  43. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  44. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  46. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  47. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  48. data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
  49. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  50. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  51. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  52. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  53. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  54. data/lib/ruby-spark.rb +1 -0
  55. data/lib/spark.rb +198 -0
  56. data/lib/spark/accumulator.rb +260 -0
  57. data/lib/spark/broadcast.rb +98 -0
  58. data/lib/spark/build.rb +43 -0
  59. data/lib/spark/cli.rb +169 -0
  60. data/lib/spark/command.rb +86 -0
  61. data/lib/spark/command/base.rb +154 -0
  62. data/lib/spark/command/basic.rb +345 -0
  63. data/lib/spark/command/pair.rb +124 -0
  64. data/lib/spark/command/sort.rb +51 -0
  65. data/lib/spark/command/statistic.rb +144 -0
  66. data/lib/spark/command_builder.rb +141 -0
  67. data/lib/spark/command_validator.rb +34 -0
  68. data/lib/spark/config.rb +244 -0
  69. data/lib/spark/constant.rb +14 -0
  70. data/lib/spark/context.rb +304 -0
  71. data/lib/spark/error.rb +50 -0
  72. data/lib/spark/ext/hash.rb +41 -0
  73. data/lib/spark/ext/integer.rb +25 -0
  74. data/lib/spark/ext/io.rb +57 -0
  75. data/lib/spark/ext/ip_socket.rb +29 -0
  76. data/lib/spark/ext/module.rb +58 -0
  77. data/lib/spark/ext/object.rb +24 -0
  78. data/lib/spark/ext/string.rb +24 -0
  79. data/lib/spark/helper.rb +10 -0
  80. data/lib/spark/helper/logger.rb +40 -0
  81. data/lib/spark/helper/parser.rb +85 -0
  82. data/lib/spark/helper/serialize.rb +71 -0
  83. data/lib/spark/helper/statistic.rb +93 -0
  84. data/lib/spark/helper/system.rb +42 -0
  85. data/lib/spark/java_bridge.rb +19 -0
  86. data/lib/spark/java_bridge/base.rb +203 -0
  87. data/lib/spark/java_bridge/jruby.rb +23 -0
  88. data/lib/spark/java_bridge/rjb.rb +41 -0
  89. data/lib/spark/logger.rb +76 -0
  90. data/lib/spark/mllib.rb +100 -0
  91. data/lib/spark/mllib/classification/common.rb +31 -0
  92. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  93. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  94. data/lib/spark/mllib/classification/svm.rb +135 -0
  95. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  96. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  97. data/lib/spark/mllib/matrix.rb +120 -0
  98. data/lib/spark/mllib/regression/common.rb +73 -0
  99. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  100. data/lib/spark/mllib/regression/lasso.rb +100 -0
  101. data/lib/spark/mllib/regression/linear.rb +124 -0
  102. data/lib/spark/mllib/regression/ridge.rb +97 -0
  103. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  104. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  105. data/lib/spark/mllib/stat/distribution.rb +12 -0
  106. data/lib/spark/mllib/vector.rb +185 -0
  107. data/lib/spark/rdd.rb +1328 -0
  108. data/lib/spark/sampler.rb +92 -0
  109. data/lib/spark/serializer.rb +24 -0
  110. data/lib/spark/serializer/base.rb +170 -0
  111. data/lib/spark/serializer/cartesian.rb +37 -0
  112. data/lib/spark/serializer/marshal.rb +19 -0
  113. data/lib/spark/serializer/message_pack.rb +25 -0
  114. data/lib/spark/serializer/oj.rb +25 -0
  115. data/lib/spark/serializer/pair.rb +27 -0
  116. data/lib/spark/serializer/utf8.rb +25 -0
  117. data/lib/spark/sort.rb +189 -0
  118. data/lib/spark/stat_counter.rb +125 -0
  119. data/lib/spark/storage_level.rb +39 -0
  120. data/lib/spark/version.rb +3 -0
  121. data/lib/spark/worker/master.rb +144 -0
  122. data/lib/spark/worker/spark_files.rb +15 -0
  123. data/lib/spark/worker/worker.rb +197 -0
  124. data/ruby-spark.gemspec +36 -0
  125. data/spec/generator.rb +37 -0
  126. data/spec/inputs/lorem_300.txt +316 -0
  127. data/spec/inputs/numbers/1.txt +50 -0
  128. data/spec/inputs/numbers/10.txt +50 -0
  129. data/spec/inputs/numbers/11.txt +50 -0
  130. data/spec/inputs/numbers/12.txt +50 -0
  131. data/spec/inputs/numbers/13.txt +50 -0
  132. data/spec/inputs/numbers/14.txt +50 -0
  133. data/spec/inputs/numbers/15.txt +50 -0
  134. data/spec/inputs/numbers/16.txt +50 -0
  135. data/spec/inputs/numbers/17.txt +50 -0
  136. data/spec/inputs/numbers/18.txt +50 -0
  137. data/spec/inputs/numbers/19.txt +50 -0
  138. data/spec/inputs/numbers/2.txt +50 -0
  139. data/spec/inputs/numbers/20.txt +50 -0
  140. data/spec/inputs/numbers/3.txt +50 -0
  141. data/spec/inputs/numbers/4.txt +50 -0
  142. data/spec/inputs/numbers/5.txt +50 -0
  143. data/spec/inputs/numbers/6.txt +50 -0
  144. data/spec/inputs/numbers/7.txt +50 -0
  145. data/spec/inputs/numbers/8.txt +50 -0
  146. data/spec/inputs/numbers/9.txt +50 -0
  147. data/spec/inputs/numbers_0_100.txt +101 -0
  148. data/spec/inputs/numbers_1_100.txt +100 -0
  149. data/spec/lib/collect_spec.rb +42 -0
  150. data/spec/lib/command_spec.rb +68 -0
  151. data/spec/lib/config_spec.rb +64 -0
  152. data/spec/lib/context_spec.rb +163 -0
  153. data/spec/lib/ext_spec.rb +72 -0
  154. data/spec/lib/external_apps_spec.rb +45 -0
  155. data/spec/lib/filter_spec.rb +80 -0
  156. data/spec/lib/flat_map_spec.rb +100 -0
  157. data/spec/lib/group_spec.rb +109 -0
  158. data/spec/lib/helper_spec.rb +19 -0
  159. data/spec/lib/key_spec.rb +41 -0
  160. data/spec/lib/manipulation_spec.rb +114 -0
  161. data/spec/lib/map_partitions_spec.rb +87 -0
  162. data/spec/lib/map_spec.rb +91 -0
  163. data/spec/lib/mllib/classification_spec.rb +54 -0
  164. data/spec/lib/mllib/clustering_spec.rb +35 -0
  165. data/spec/lib/mllib/matrix_spec.rb +32 -0
  166. data/spec/lib/mllib/regression_spec.rb +116 -0
  167. data/spec/lib/mllib/vector_spec.rb +77 -0
  168. data/spec/lib/reduce_by_key_spec.rb +118 -0
  169. data/spec/lib/reduce_spec.rb +131 -0
  170. data/spec/lib/sample_spec.rb +46 -0
  171. data/spec/lib/serializer_spec.rb +13 -0
  172. data/spec/lib/sort_spec.rb +58 -0
  173. data/spec/lib/statistic_spec.rb +168 -0
  174. data/spec/lib/whole_text_files_spec.rb +33 -0
  175. data/spec/spec_helper.rb +39 -0
  176. metadata +301 -0
@@ -0,0 +1,71 @@
1
module Spark
  module Helper
    # Conversion helpers between Ruby values and the big-endian binary
    # formats used on the wire towards the JVM side.
    # Thin wrappers around Array#pack / String#unpack.
    module Serialize

      DIRECTIVE_INTEGER_BIG_ENDIAN = 'l>'
      DIRECTIVE_INTEGERS_BIG_ENDIAN = 'l>*'
      DIRECTIVE_LONG_BIG_ENDIAN = 'q>'
      DIRECTIVE_LONGS_BIG_ENDIAN = 'q>*'
      DIRECTIVE_DOUBLE_BIG_ENDIAN = 'G'
      DIRECTIVE_DOUBLES_BIG_ENDIAN = 'G*'
      DIRECTIVE_UNSIGNED_CHARS = 'C*'
      DIRECTIVE_CHARS = 'c*'

      # Packing

      # Pack a single Integer as a 4-byte big-endian signed int.
      def pack_int(data)
        [data].pack(DIRECTIVE_INTEGER_BIG_ENDIAN)
      end

      # Pack a single Integer as an 8-byte big-endian signed long.
      def pack_long(data)
        [data].pack(DIRECTIVE_LONG_BIG_ENDIAN)
      end

      # Pack a single Float as an 8-byte big-endian IEEE-754 double.
      def pack_double(data)
        [data].pack(DIRECTIVE_DOUBLE_BIG_ENDIAN)
      end

      # Pack an Array of Integers (0..255) as raw unsigned bytes.
      # Raises ArgumentError when +data+ is not an Array — consistent with
      # the other multi-value packers below (previously a non-Array fell
      # through to a less-clear NoMethodError).
      def pack_unsigned_chars(data)
        __check_array(data)
        data.pack(DIRECTIVE_UNSIGNED_CHARS)
      end

      # Pack an Array of Integers as consecutive 4-byte big-endian ints.
      # Raises ArgumentError when +data+ is not an Array.
      def pack_ints(data)
        __check_array(data)
        data.pack(DIRECTIVE_INTEGERS_BIG_ENDIAN)
      end

      # Pack an Array of Integers as consecutive 8-byte big-endian longs.
      # Raises ArgumentError when +data+ is not an Array.
      def pack_longs(data)
        __check_array(data)
        data.pack(DIRECTIVE_LONGS_BIG_ENDIAN)
      end

      # Pack an Array of Floats as consecutive big-endian doubles.
      # Raises ArgumentError when +data+ is not an Array.
      def pack_doubles(data)
        __check_array(data)
        data.pack(DIRECTIVE_DOUBLES_BIG_ENDIAN)
      end

      # Unpacking

      # Read a single 4-byte big-endian signed int from +data+.
      def unpack_int(data)
        data.unpack(DIRECTIVE_INTEGER_BIG_ENDIAN)[0]
      end

      # Read a single 8-byte big-endian signed long from +data+.
      def unpack_long(data)
        data.unpack(DIRECTIVE_LONG_BIG_ENDIAN)[0]
      end

      # Unpack every byte of +data+ as a signed char (Array of Integers).
      def unpack_chars(data)
        data.unpack(DIRECTIVE_CHARS)
      end

      private

      # Guard shared by the multi-value packers above.
      def __check_array(data)
        unless data.is_a?(Array)
          raise ArgumentError, 'Data must be an Array.'
        end
      end

    end
  end
end
@@ -0,0 +1,93 @@
1
+ module Spark
2
+ module Helper
3
+ module Statistic
4
+
5
+ # Returns a sampling rate that guarantees a sample of size >= sampleSizeLowerBound 99.99% of the time.
6
+ #
7
+ # == How the sampling rate is determined:
8
+ # Let p = num / total, where num is the sample size and total is the total number of
9
+ # datapoints in the RDD. We're trying to compute q > p such that
10
+ # * when sampling with replacement, we're drawing each datapoint with prob_i ~ Pois(q),
11
+ # where we want to guarantee Pr[s < num] < 0.0001 for s = sum(prob_i for i from 0 to total),
12
+ # i.e. the failure rate of not having a sufficiently large sample < 0.0001.
13
+ # Setting q = p + 5 * sqrt(p/total) is sufficient to guarantee 0.9999 success rate for
14
+ # num > 12, but we need a slightly larger q (9 empirically determined).
15
+ # * when sampling without replacement, we're drawing each datapoint with prob_i
16
+ # ~ Binomial(total, fraction) and our choice of q guarantees 1-delta, or 0.9999 success
17
+ # rate, where success rate is defined the same as in sampling with replacement.
18
+ #
19
+ def compute_fraction(lower_bound, total, with_replacement)
20
+ lower_bound = lower_bound.to_f
21
+
22
+ if with_replacement
23
+ upper_poisson_bound(lower_bound) / total
24
+ else
25
+ fraction = lower_bound / total
26
+ upper_binomial_bound(0.00001, total, fraction)
27
+ end
28
+ end
29
+
30
+ def upper_poisson_bound(bound)
31
+ num_std = if bound < 6
32
+ 12
33
+ elsif bound < 16
34
+ 9
35
+ else
36
+ 6
37
+ end.to_f
38
+
39
+ [bound + num_std * Math.sqrt(bound), 1e-10].max
40
+ end
41
+
42
+ def upper_binomial_bound(delta, total, fraction)
43
+ gamma = -Math.log(delta) / total
44
+ [1, fraction + gamma + Math.sqrt(gamma*gamma + 2*gamma*fraction)].min
45
+ end
46
+
47
+ # Bisect right
48
+ #
49
+ # == Examples:
50
+ # data = [1,5,6,8,96,120,133]
51
+ #
52
+ # bisect_right(data, 0) # => 0
53
+ # bisect_right(data, 1) # => 1
54
+ # bisect_right(data, 5) # => 2
55
+ # bisect_right(data, 9) # => 4
56
+ # bisect_right(data, 150) # => 7
57
+ #
58
+ def bisect_right(data, value, low=0, high=data.size)
59
+ if low < 0
60
+ raise ArgumentError, 'Low must be >= 0.'
61
+ end
62
+
63
+ while low < high
64
+ mid = (low + high) / 2
65
+ if value < data[mid]
66
+ high = mid
67
+ else
68
+ low = mid + 1
69
+ end
70
+ end
71
+
72
+ low
73
+ end
74
+
75
+ # Determine bound of partitioning
76
+ #
77
+ # == Example:
78
+ # data = [0,1,2,3,4,5,6,7,8,9,10]
79
+ # determine_bounds(data, 3)
80
+ # # => [2, 5, 8]
81
+ #
82
+ def determine_bounds(data, num_partitions)
83
+ bounds = []
84
+ count = data.size
85
+ (0...(num_partitions-1)).each do |index|
86
+ bounds << data[count * (index+1) / num_partitions]
87
+ end
88
+ bounds
89
+ end
90
+
91
+ end
92
+ end
93
+ end
@@ -0,0 +1,42 @@
1
module Spark
  module Helper
    # Runtime/platform introspection helpers. Including this module exposes
    # the helpers both as instance methods and as class methods of the host.
    module System

      def self.included(base)
        base.send :extend, Methods
        base.send :include, Methods
      end

      module Methods
        # Match position (Integer) on Windows hosts, nil elsewhere.
        def windows?
          RbConfig::CONFIG['host_os'] =~ /mswin|mingw/
        end

        # True when running on stock MRI ("ruby").
        def mri?
          'ruby' == RbConfig::CONFIG['ruby_install_name']
        end

        # True when running on JRuby.
        def jruby?
          'jruby' == RbConfig::CONFIG['ruby_install_name']
        end

        # True when the current thread is inside a Pry session.
        def pry?
          !!Thread.current[:__pry__]
        end

        # Memory usage in kb
        def memory_usage
          if jruby?
            runtime = java.lang.Runtime.getRuntime
            (runtime.totalMemory - runtime.freeMemory) >> 10
          elsif windows?
            # not yet supported on Windows (returns nil)
          else
            `ps -o rss= -p #{Process.pid}`.to_i
          end
        end
      end # Methods

    end # System
  end # Helper
end # Spark
@@ -0,0 +1,19 @@
1
module Spark
  # Picks the Ruby<->Java adapter matching the current Ruby runtime.
  module JavaBridge

    autoload :Base, 'spark/java_bridge/base'
    autoload :JRuby, 'spark/java_bridge/jruby'
    autoload :RJB, 'spark/java_bridge/rjb'

    include Spark::Helper::System

    # JRuby gets the native-interop bridge; every other runtime falls back
    # to the Rjb-based bridge.
    def self.get
      jruby? ? JRuby : RJB
    end

  end
end
@@ -0,0 +1,203 @@
1
##
# Spark::JavaBridge::Base
#
# Parent for all adapters (Ruby - Java)
#
module Spark
  module JavaBridge
    class Base

      include Spark::Helper::System

      # Classes imported into the top-level namespace on #load. Plain strings
      # are bound under their simple class name; the trailing Hash entries
      # map an explicit Ruby constant name to a Java class.
      JAVA_OBJECTS = [
        'java.util.ArrayList',
        'org.apache.spark.SparkConf',
        'org.apache.spark.api.java.JavaSparkContext',
        'org.apache.spark.api.ruby.RubyRDD',
        'org.apache.spark.api.ruby.RubyUtils',
        'org.apache.spark.api.ruby.RubyWorker',
        'org.apache.spark.api.ruby.PairwiseRDD',
        'org.apache.spark.api.ruby.RubyAccumulatorParam',
        'org.apache.spark.api.ruby.RubySerializer',
        'org.apache.spark.api.python.PythonRDD',
        'org.apache.spark.api.python.PythonPartitioner',
        'org.apache.spark.ui.ruby.RubyTab',
        'org.apache.spark.mllib.api.ruby.RubyMLLibAPI',
        'scala.collection.mutable.HashMap',
        :JInteger => 'java.lang.Integer',
        :JLong => 'java.lang.Long',
        :JLogger => 'org.apache.log4j.Logger',
        :JLevel => 'org.apache.log4j.Level',
        :JPriority => 'org.apache.log4j.Priority',
        :JUtils => 'org.apache.spark.util.Utils',
        :JStorageLevel => 'org.apache.spark.storage.StorageLevel',
        :JDenseVector => 'org.apache.spark.mllib.linalg.DenseVector',
        :JDenseMatrix => 'org.apache.spark.mllib.linalg.DenseMatrix'
      ]

      # Extra classes imported only for the test suite (see #load_test).
      JAVA_TEST_OBJECTS = [
        'org.apache.spark.mllib.api.ruby.RubyMLLibUtilAPI'
      ]

      # Ruby classes #to_java leaves untouched because the bridge converts
      # them automatically.
      # NOTE(review): Fixnum was removed in Ruby 3.2+, where this line would
      # raise NameError at load time — confirm the supported Ruby range.
      RUBY_TO_JAVA_SKIP = [Fixnum, Integer]

      # spark_home:: path to a single Spark jar file or to a directory
      #              containing the Spark jars (see #jars).
      def initialize(spark_home)
        @spark_home = spark_home
      end

      # Import all important classes into Objects
      # Idempotent: subsequent calls are no-ops. Always returns nil.
      def load
        return if @loaded

        java_objects.each do |name, klass|
          import(name, klass)
        end

        @loaded = true
        nil
      end

      # Import classes for testing
      # Idempotent: subsequent calls are no-ops. Always returns nil.
      def load_test
        return if @loaded_test

        java_test_objects.each do |name, klass|
          import(name, klass)
        end

        @loaded_test = true
        nil
      end

      # Call java object
      # Converts each argument with #to_java, invokes +method+ on +klass+,
      # and converts the result back with #to_ruby.
      def call(klass, method, *args)
        # To java
        args.map!{|item| to_java(item)}

        # Call java
        result = klass.__send__(method, *args)

        # To ruby
        to_ruby(result)
      end

      # Convert a Ruby Array to a java.util.ArrayList, converting each
      # element with #to_java.
      def to_java_array_list(array)
        array_list = ArrayList.new
        array.each do |item|
          array_list.add(to_java(item))
        end
        array_list
      end

      # Wrap a Ruby number in java.lang.Long; nil passes through as nil.
      def to_long(number)
        return nil if number.nil?
        JLong.new(number)
      end

      # Convert a Ruby object into its Java representation.
      def to_java(object)
        if RUBY_TO_JAVA_SKIP.include?(object.class)
          # Some objects are converted automatically;
          # skipping them here prevents double-conversion errors.
          # For example: JRuby stores an Integer as a long, so 1.to_java is a Long.
          object
        elsif object.respond_to?(:to_java)
          object.to_java
        elsif object.is_a?(Array)
          to_java_array_list(object)
        else
          object
        end
      end

      # Array problem:
      #   Rjb:   object.toArray -> Array
      #   JRuby: object.toArray -> java.lang.Object
      #
      # Convert a Java/Scala object back into a Ruby value, dispatching on
      # the object's simple class name. Non-Java objects pass through as-is.
      def to_ruby(object)
        if java_object?(object)
          class_name = object.getClass.getSimpleName
          case class_name
          when 'ArraySeq'
            result = []
            iterator = object.iterator
            while iterator.hasNext
              result << to_ruby(iterator.next)
            end
            result
          when 'Map2', 'Map3', 'Map4', 'HashTrieMap'
            Hash[
              object.toSeq.array.to_a.map!{|item| [item._1, item._2]}
            ]
          when 'SeqWrapper'; object.toArray.to_a.map!{|item| to_ruby(item)}
          when 'ofRef'; object.array.to_a.map!{|item| to_ruby(item)} # WrappedArray$ofRef
          when 'LabeledPoint'; Spark::Mllib::LabeledPoint.from_java(object)
          when 'DenseVector'; Spark::Mllib::DenseVector.from_java(object)
          when 'KMeansModel'; Spark::Mllib::KMeansModel.from_java(object)
          when 'DenseMatrix'; Spark::Mllib::DenseMatrix.from_java(object)
          else
            # Some RDD: normalize any non-Java RDD flavour to JavaRDD first
            if class_name != 'JavaRDD' && class_name.end_with?('RDD')
              object = object.toJavaRDD
              class_name = 'JavaRDD'
            end

            # JavaRDD: wrap into a Spark::RDD with marshal serializers
            if class_name == 'JavaRDD'
              jrdd = RubyRDD.toRuby(object)

              serializer = Spark.sc.get_serializer('marshal', nil)
              deserializer = Spark.sc.get_serializer('marshal', 2) # is fully batched

              return Spark::RDD.new(jrdd, Spark.sc, serializer, deserializer)
            end

            # Unknown: log a warning and return the raw Java object
            Spark.logger.warn("Java object '#{object.getClass.name}' was not converted.")
            object
          end

        else
          # Already transferred
          object
        end
      end

      alias_method :java_to_ruby, :to_ruby
      alias_method :ruby_to_java, :to_java

      private

      # Jar files to load: @spark_home itself when it is a file, otherwise
      # every *.jar directly inside the @spark_home directory.
      def jars
        result = []
        if File.file?(@spark_home)
          result << @spark_home
        else
          result << Dir.glob(File.join(@spark_home, '*.jar'))
        end
        result.flatten
      end

      # Normalize a JAVA_OBJECTS-style list into a Hash of
      # { ruby_constant_name => java_class_string }.
      def objects_with_names(objects)
        hash = {}
        objects.each do |object|
          if object.is_a?(Hash)
            hash.merge!(object)
          else
            key = object.split('.').last.to_sym
            hash[key] = object
          end
        end
        hash
      end

      def java_objects
        objects_with_names(JAVA_OBJECTS)
      end

      def java_test_objects
        objects_with_names(JAVA_TEST_OBJECTS)
      end

    end
  end
end
@@ -0,0 +1,23 @@
1
require 'java'

module Spark
  module JavaBridge
    # Bridge implementation for the JRuby runtime: loads the Spark jars via
    # +require+ and exposes Java classes through JRuby's native interop.
    class JRuby < Base

      def initialize(*args)
        super
        jars.each do |jar|
          require jar
        end
      end

      # Bind the Java class +klass+ (dotted name) to the top-level Ruby
      # constant +name+. Failures are swallowed so optional classes do not
      # abort loading; returns nil in that case.
      def import(name, klass)
        begin
          Object.const_set(name, eval("Java::#{klass}"))
        rescue StandardError
          nil
        end
      end

      # True when +object+ is a JRuby proxy around a Java object.
      def java_object?(object)
        object.kind_of?(JavaProxy)
      end

    end
  end
end