ruby-spark 1.1.0.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +252 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +6 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/comparison/prepare.sh +18 -0
  12. data/benchmark/comparison/python.py +156 -0
  13. data/benchmark/comparison/r.r +69 -0
  14. data/benchmark/comparison/ruby.rb +167 -0
  15. data/benchmark/comparison/run-all.sh +160 -0
  16. data/benchmark/comparison/scala.scala +181 -0
  17. data/benchmark/custom_marshal.rb +94 -0
  18. data/benchmark/digest.rb +150 -0
  19. data/benchmark/enumerator.rb +88 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/example/website_search.rb +83 -0
  27. data/ext/ruby_c/extconf.rb +3 -0
  28. data/ext/ruby_c/murmur.c +158 -0
  29. data/ext/ruby_c/murmur.h +9 -0
  30. data/ext/ruby_c/ruby-spark.c +18 -0
  31. data/ext/ruby_java/Digest.java +36 -0
  32. data/ext/ruby_java/Murmur2.java +98 -0
  33. data/ext/ruby_java/RubySparkExtService.java +28 -0
  34. data/ext/ruby_java/extconf.rb +3 -0
  35. data/ext/spark/build.sbt +73 -0
  36. data/ext/spark/project/plugins.sbt +9 -0
  37. data/ext/spark/sbt/sbt +34 -0
  38. data/ext/spark/src/main/scala/Exec.scala +91 -0
  39. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  40. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  41. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  42. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  43. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  44. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  46. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  47. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  48. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  49. data/ext/spark/src/main/scala/RubyRDD.scala +392 -0
  50. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  51. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  52. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  53. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  54. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  55. data/lib/ruby-spark.rb +1 -0
  56. data/lib/spark.rb +198 -0
  57. data/lib/spark/accumulator.rb +260 -0
  58. data/lib/spark/broadcast.rb +98 -0
  59. data/lib/spark/build.rb +43 -0
  60. data/lib/spark/cli.rb +169 -0
  61. data/lib/spark/command.rb +86 -0
  62. data/lib/spark/command/base.rb +158 -0
  63. data/lib/spark/command/basic.rb +345 -0
  64. data/lib/spark/command/pair.rb +124 -0
  65. data/lib/spark/command/sort.rb +51 -0
  66. data/lib/spark/command/statistic.rb +144 -0
  67. data/lib/spark/command_builder.rb +141 -0
  68. data/lib/spark/command_validator.rb +34 -0
  69. data/lib/spark/config.rb +238 -0
  70. data/lib/spark/constant.rb +14 -0
  71. data/lib/spark/context.rb +322 -0
  72. data/lib/spark/error.rb +50 -0
  73. data/lib/spark/ext/hash.rb +41 -0
  74. data/lib/spark/ext/integer.rb +25 -0
  75. data/lib/spark/ext/io.rb +67 -0
  76. data/lib/spark/ext/ip_socket.rb +29 -0
  77. data/lib/spark/ext/module.rb +58 -0
  78. data/lib/spark/ext/object.rb +24 -0
  79. data/lib/spark/ext/string.rb +24 -0
  80. data/lib/spark/helper.rb +10 -0
  81. data/lib/spark/helper/logger.rb +40 -0
  82. data/lib/spark/helper/parser.rb +85 -0
  83. data/lib/spark/helper/serialize.rb +71 -0
  84. data/lib/spark/helper/statistic.rb +93 -0
  85. data/lib/spark/helper/system.rb +42 -0
  86. data/lib/spark/java_bridge.rb +19 -0
  87. data/lib/spark/java_bridge/base.rb +203 -0
  88. data/lib/spark/java_bridge/jruby.rb +23 -0
  89. data/lib/spark/java_bridge/rjb.rb +41 -0
  90. data/lib/spark/logger.rb +76 -0
  91. data/lib/spark/mllib.rb +100 -0
  92. data/lib/spark/mllib/classification/common.rb +31 -0
  93. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  94. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  95. data/lib/spark/mllib/classification/svm.rb +135 -0
  96. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  97. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  98. data/lib/spark/mllib/matrix.rb +120 -0
  99. data/lib/spark/mllib/regression/common.rb +73 -0
  100. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  101. data/lib/spark/mllib/regression/lasso.rb +100 -0
  102. data/lib/spark/mllib/regression/linear.rb +124 -0
  103. data/lib/spark/mllib/regression/ridge.rb +97 -0
  104. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  105. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  106. data/lib/spark/mllib/stat/distribution.rb +12 -0
  107. data/lib/spark/mllib/vector.rb +185 -0
  108. data/lib/spark/rdd.rb +1377 -0
  109. data/lib/spark/sampler.rb +92 -0
  110. data/lib/spark/serializer.rb +79 -0
  111. data/lib/spark/serializer/auto_batched.rb +59 -0
  112. data/lib/spark/serializer/base.rb +63 -0
  113. data/lib/spark/serializer/batched.rb +84 -0
  114. data/lib/spark/serializer/cartesian.rb +13 -0
  115. data/lib/spark/serializer/compressed.rb +27 -0
  116. data/lib/spark/serializer/marshal.rb +17 -0
  117. data/lib/spark/serializer/message_pack.rb +23 -0
  118. data/lib/spark/serializer/oj.rb +23 -0
  119. data/lib/spark/serializer/pair.rb +41 -0
  120. data/lib/spark/serializer/text.rb +25 -0
  121. data/lib/spark/sort.rb +189 -0
  122. data/lib/spark/stat_counter.rb +125 -0
  123. data/lib/spark/storage_level.rb +39 -0
  124. data/lib/spark/version.rb +3 -0
  125. data/lib/spark/worker/master.rb +144 -0
  126. data/lib/spark/worker/spark_files.rb +15 -0
  127. data/lib/spark/worker/worker.rb +200 -0
  128. data/ruby-spark.gemspec +47 -0
  129. data/spec/generator.rb +37 -0
  130. data/spec/inputs/lorem_300.txt +316 -0
  131. data/spec/inputs/numbers/1.txt +50 -0
  132. data/spec/inputs/numbers/10.txt +50 -0
  133. data/spec/inputs/numbers/11.txt +50 -0
  134. data/spec/inputs/numbers/12.txt +50 -0
  135. data/spec/inputs/numbers/13.txt +50 -0
  136. data/spec/inputs/numbers/14.txt +50 -0
  137. data/spec/inputs/numbers/15.txt +50 -0
  138. data/spec/inputs/numbers/16.txt +50 -0
  139. data/spec/inputs/numbers/17.txt +50 -0
  140. data/spec/inputs/numbers/18.txt +50 -0
  141. data/spec/inputs/numbers/19.txt +50 -0
  142. data/spec/inputs/numbers/2.txt +50 -0
  143. data/spec/inputs/numbers/20.txt +50 -0
  144. data/spec/inputs/numbers/3.txt +50 -0
  145. data/spec/inputs/numbers/4.txt +50 -0
  146. data/spec/inputs/numbers/5.txt +50 -0
  147. data/spec/inputs/numbers/6.txt +50 -0
  148. data/spec/inputs/numbers/7.txt +50 -0
  149. data/spec/inputs/numbers/8.txt +50 -0
  150. data/spec/inputs/numbers/9.txt +50 -0
  151. data/spec/inputs/numbers_0_100.txt +101 -0
  152. data/spec/inputs/numbers_1_100.txt +100 -0
  153. data/spec/lib/collect_spec.rb +42 -0
  154. data/spec/lib/command_spec.rb +68 -0
  155. data/spec/lib/config_spec.rb +64 -0
  156. data/spec/lib/context_spec.rb +165 -0
  157. data/spec/lib/ext_spec.rb +72 -0
  158. data/spec/lib/external_apps_spec.rb +45 -0
  159. data/spec/lib/filter_spec.rb +80 -0
  160. data/spec/lib/flat_map_spec.rb +100 -0
  161. data/spec/lib/group_spec.rb +109 -0
  162. data/spec/lib/helper_spec.rb +19 -0
  163. data/spec/lib/key_spec.rb +41 -0
  164. data/spec/lib/manipulation_spec.rb +122 -0
  165. data/spec/lib/map_partitions_spec.rb +87 -0
  166. data/spec/lib/map_spec.rb +91 -0
  167. data/spec/lib/mllib/classification_spec.rb +54 -0
  168. data/spec/lib/mllib/clustering_spec.rb +35 -0
  169. data/spec/lib/mllib/matrix_spec.rb +32 -0
  170. data/spec/lib/mllib/regression_spec.rb +116 -0
  171. data/spec/lib/mllib/vector_spec.rb +77 -0
  172. data/spec/lib/reduce_by_key_spec.rb +118 -0
  173. data/spec/lib/reduce_spec.rb +131 -0
  174. data/spec/lib/sample_spec.rb +46 -0
  175. data/spec/lib/serializer_spec.rb +88 -0
  176. data/spec/lib/sort_spec.rb +58 -0
  177. data/spec/lib/statistic_spec.rb +170 -0
  178. data/spec/lib/whole_text_files_spec.rb +33 -0
  179. data/spec/spec_helper.rb +38 -0
  180. metadata +389 -0
data/lib/spark/sampler.rb
@@ -0,0 +1,92 @@
+ require 'distribution'
+
+ # Random Generators
+ module Spark
+   module RandomGenerator
+     class Poisson
+
+       def initialize(mean, seed)
+         generator = Random.new(seed)
+         @exp_rng = Distribution::Exponential.rng(1.0/mean, random: generator)
+       end
+
+       def rand
+         t = 0.0
+         number = 0
+
+         loop{
+           t += @exp_rng.call
+           if t > 1
+             return number
+           end
+           number += 1
+         }
+       end
+
+     end
+   end
+ end
+
+ # Samplers
+ module Spark
+   module Sampler
+
+     class Base
+       attr_reader :fraction, :seed
+
+       def initialize(fraction, seed=nil)
+         @fraction = fraction
+         @seed = seed || Random.new_seed
+       end
+     end
+
+     # Poisson Sampler
+     # -------------------------------------------------------------------------
+     class Poisson < Base
+
+       def sample(iterator)
+         iterator.map! do |item|
+           count = rng.rand
+           Array.new(count) { item }
+         end
+         iterator.flatten!
+         iterator.compact!
+         iterator
+       end
+
+       def lazy_sample(iterator)
+         Enumerator::Lazy.new(iterator) do |yielder, value|
+           count = rng.rand
+           count.times { yielder << value }
+         end
+       end
+
+       def rng
+         @rng ||= Spark::RandomGenerator::Poisson.new(fraction, seed)
+       end
+
+     end
+
+     # Uniform Sampler
+     # -------------------------------------------------------------------------
+     class Uniform < Base
+
+       def sample(iterator)
+         iterator.select!{|item| rng.rand <= fraction}
+         iterator
+       end
+
+       def lazy_sample(iterator)
+         iterator.select do |item|
+           rng.rand <= fraction
+         end
+       end
+
+       def rng
+         @rng ||= Random.new(seed)
+       end
+
+     end
+
+   end
+ end
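The Poisson generator above produces a Poisson-distributed count by summing exponential inter-arrival times until they exceed one unit of time; the number of arrivals that fit is Poisson-distributed with the requested mean. A self-contained sketch of the same idea, assuming nothing from ruby-spark and using inverse-transform sampling in place of the distribution gem (PoissonCount is a hypothetical name):

# Poisson counts from exponential inter-arrival times, stdlib only.
class PoissonCount
  def initialize(mean, seed)
    @mean = mean
    @random = Random.new(seed)
  end

  def rand
    t = 0.0
    number = 0
    loop do
      # Exp(rate = mean) sample; 1.0 - rand keeps the log argument in (0, 1].
      t += -Math.log(1.0 - @random.rand) / @mean
      return number if t > 1
      number += 1
    end
  end
end

rng = PoissonCount.new(2.0, 42)
samples = Array.new(10_000) { rng.rand }
puts samples.sum / 10_000.0 # ~2.0, the requested mean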
data/lib/spark/serializer.rb
@@ -0,0 +1,79 @@
+ module Spark
+   ##
+   # Serializer
+   #
+   module Serializer
+
+     DEFAULT_COMPRESS = false
+     DEFAULT_BATCH_SIZE = 1024
+     DEFAULT_SERIALIZER_NAME = 'marshal'
+
+     @@registered = {}
+
+     # Register a class and create methods for quick access.
+     # The class is also available as __name__ for use in the
+     # build method (to avoid the Proc binding problem).
+     #
+     # == Examples:
+     #   register('test1', 'test2', Class)
+     #
+     #   Spark::Serializer.test1
+     #   Spark::Serializer.test2
+     #
+     #   # Proc binding problem
+     #   build { marshal } # => Spark::Serializer::Marshal
+     #
+     #   marshal = 1
+     #   build { marshal } # => 1
+     #
+     #   build { __marshal__ } # => Spark::Serializer::Marshal
+     #
+     def self.register(*args)
+       klass = args.pop
+       args.each do |arg|
+         @@registered[arg] = klass
+         define_singleton_method(arg.to_sym){|*args| klass.new(*args) }
+         define_singleton_method("__#{arg}__".to_sym){|*args| klass.new(*args) }
+       end
+     end
+
+     def self.find(name)
+       @@registered[name.to_s.downcase]
+     end
+
+     def self.find!(name)
+       klass = find(name)
+
+       if klass.nil?
+         raise Spark::SerializeError, "Unknown serializer #{name}."
+       end
+
+       klass
+     end
+
+     def self.build(text=nil, &block)
+       if block_given?
+         class_eval(&block)
+       else
+         class_eval(text.to_s)
+       end
+     end
+
+   end
+ end
+
+ # Parent
+ require 'spark/serializer/base'
+
+ # Basic
+ require 'spark/serializer/oj'
+ require 'spark/serializer/marshal'
+ require 'spark/serializer/message_pack'
+ require 'spark/serializer/text'
+
+ # Others
+ require 'spark/serializer/batched'
+ require 'spark/serializer/auto_batched'
+ require 'spark/serializer/compressed'
+ require 'spark/serializer/pair'
+ require 'spark/serializer/cartesian'
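register and build above form a small registry DSL: register defines a singleton method per alias, and build class_evals its block against the module, so a bare word like marshal resolves to the registered method. Because Ruby blocks capture local variables lexically, a local named marshal would shadow that method inside the block; the double-underscore aliases exist so callers can sidestep the shadowing. A standalone sketch of the pattern, with hypothetical Registry and JsonSerializer names:

module Registry
  @registered = {}

  def self.register(*aliases, klass)
    aliases.each do |name|
      @registered[name] = klass
      # Bare-word access plus an alias no local variable is likely to shadow.
      define_singleton_method(name) { |*args| klass.new(*args) }
      define_singleton_method("__#{name}__") { |*args| klass.new(*args) }
    end
  end

  def self.build(&block)
    instance_eval(&block)
  end
end

class JsonSerializer; end
Registry.register('json', JsonSerializer)

p Registry.build { json }     # => #<JsonSerializer ...>
json = 1
p Registry.build { __json__ } # the local `json` now shadows the method; __json__ still works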
data/lib/spark/serializer/auto_batched.rb
@@ -0,0 +1,59 @@
+ module Spark
+   module Serializer
+     ##
+     # AutoBatched serializer
+     #
+     # Batch size is computed automatically. Similar to Python's AutoBatchedSerializer.
+     #
+     class AutoBatched < Batched
+
+       MAX_RATIO = 10
+
+       def initialize(serializer, best_size=65536)
+         @serializer = serializer
+         @best_size = best_size.to_i
+
+         error('Batch size must be greater than 1') if @best_size < 2
+       end
+
+       def name
+         "AutoBatched(#{@best_size})"
+       end
+
+       def dump_to_io(data, io)
+         check_each(data)
+
+         # Only Array has .slice
+         data = data.to_a
+
+         index = 0
+         batch = 2
+         max = @best_size * MAX_RATIO
+
+         loop do
+           chunk = data.slice(index, batch)
+           if chunk.nil? || chunk.empty?
+             break
+           end
+
+           serialized = @serializer.dump(chunk)
+           io.write_string(serialized)
+
+           index += batch
+
+           size = serialized.bytesize
+           if size < @best_size
+             batch *= 2
+           elsif size > max && batch > 1
+             batch /= 2
+           end
+         end
+
+         io.flush
+       end
+
+     end
+   end
+ end
+
+ Spark::Serializer.register('auto_batched', 'autobatched', Spark::Serializer::AutoBatched)
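dump_to_io above is a feedback loop: it starts with a batch of 2, doubles the batch while serialized chunks come in under best_size, and halves it once a chunk exceeds best_size * MAX_RATIO, so chunk sizes converge on the target without inspecting items up front. A runnable sketch of the loop, assuming plain Marshal and a 4-byte length prefix instead of the gem's write_string (adaptive_dump is a hypothetical name):

require 'stringio'

def adaptive_dump(items, io, target_bytes: 65_536, max_ratio: 10)
  data = items.to_a
  index = 0
  batch = 2

  while (chunk = data.slice(index, batch)) && !chunk.empty?
    serialized = Marshal.dump(chunk)
    io.write([serialized.bytesize].pack('N')) # 4-byte length prefix
    io.write(serialized)
    index += batch

    if serialized.bytesize < target_bytes
      batch *= 2 # chunks too small: grow geometrically
    elsif serialized.bytesize > target_bytes * max_ratio && batch > 1
      batch /= 2 # chunks far too large: back off
    end
  end
end

io = StringIO.new
adaptive_dump((1..100_000).map { |i| "row-#{i}" }, io)
puts io.size # total bytes written across all frames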
data/lib/spark/serializer/base.rb
@@ -0,0 +1,63 @@
+ module Spark
+   module Serializer
+     # @abstract Parent for all serializers
+     class Base
+
+       def load_from_io(io)
+         return to_enum(__callee__, io) unless block_given?
+
+         loop do
+           size = io.read_int_or_eof
+           break if size == Spark::Constant::DATA_EOF
+
+           yield load(io.read(size))
+         end
+       end
+
+       def load_from_file(file, *args)
+         return to_enum(__callee__, file, *args) unless block_given?
+
+         load_from_io(file, *args).each do |item|
+           yield item
+         end
+
+         file.close
+         file.unlink
+       end
+
+       def ==(other)
+         self.to_s == other.to_s
+       end
+
+       def batched?
+         false
+       end
+
+       def unbatch!
+       end
+
+       def check_each(data)
+         unless data.respond_to?(:each)
+           error('Data must be iterable.')
+         end
+       end
+
+       def error(message)
+         raise Spark::SerializeError, message
+       end
+
+       def name
+         self.class.name.split('::').last
+       end
+
+       def to_s
+         name
+       end
+
+       def inspect
+         %{#<Spark::Serializer:0x#{object_id} "#{self}">}
+       end
+
+     end
+   end
+ end
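load_from_io and load_from_file both open with `return to_enum(__callee__, ...) unless block_given?`, a Ruby idiom that lets one method serve as both a block-taking iterator and a lazy Enumerator factory. A standalone sketch of the idiom (hypothetical FrameReader, newline framing instead of the gem's length-prefixed protocol):

require 'stringio'

class FrameReader
  # With a block: yields each frame. Without one: returns an Enumerator
  # that lazily re-invokes this same method (__callee__).
  def each_frame(io)
    return to_enum(__callee__, io) unless block_given?

    while (line = io.gets)
      yield line.chomp
    end
  end
end

enum = FrameReader.new.each_frame(StringIO.new("a\nb\nc\n"))
puts enum.next # => "a"
puts enum.next # => "b"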
data/lib/spark/serializer/batched.rb
@@ -0,0 +1,84 @@
+ module Spark
+   module Serializer
+     class Batched < Base
+
+       attr_writer :serializer
+
+       def initialize(serializer, batch_size=nil)
+         batch_size ||= Spark::Serializer::DEFAULT_BATCH_SIZE
+
+         @serializer = serializer
+         @batch_size = batch_size.to_i
+
+         error('Batch size must be greater than 0') if @batch_size < 1
+       end
+
+       # Really batched
+       def batched?
+         @batch_size > 1
+       end
+
+       def unbatch!
+         @batch_size = 1
+       end
+
+       def load(data)
+         @serializer.load(data)
+       end
+
+       def dump(data)
+         @serializer.dump(data)
+       end
+
+       def name
+         "Batched(#{@batch_size})"
+       end
+
+       def to_s
+         "#{name} -> #{@serializer}"
+       end
+
+
+       # === Dump ==============================================================
+
+       def dump_to_io(data, io)
+         check_each(data)
+
+         if batched?
+           data = data.each_slice(@batch_size)
+         end
+
+         data.each do |item|
+           serialized = dump(item)
+           io.write_string(serialized)
+         end
+
+         io.flush
+       end
+
+
+       # === Load ==============================================================
+
+       def load_from_io(io)
+         return to_enum(__callee__, io) unless block_given?
+
+         loop do
+           size = io.read_int_or_eof
+           break if size == Spark::Constant::DATA_EOF
+
+           data = io.read(size)
+           data = load(data)
+
+           if batched?
+             data.each{|item| yield item }
+           else
+             yield data
+           end
+         end
+       end
+
+     end
+   end
+ end
+
+ Spark::Serializer.register('batched', Spark::Serializer::Batched)
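Batched#dump_to_io groups items with each_slice and writes each serialized batch as a length-prefixed string; load_from_io reverses this, re-yielding individual items from each decoded batch. A self-contained round trip in the same spirit, assuming plain Marshal and a 4-byte big-endian length prefix rather than the gem's IO extensions and DATA_EOF sentinel:

require 'stringio'

def dump_batched(items, io, batch_size)
  items.each_slice(batch_size) do |batch|
    payload = Marshal.dump(batch)
    io.write([payload.bytesize].pack('N'))
    io.write(payload)
  end
end

def load_batched(io)
  return to_enum(__method__, io) unless block_given?

  while (header = io.read(4))
    size = header.unpack1('N')
    # Each frame decodes to a whole batch; unbatch by yielding items.
    Marshal.load(io.read(size)).each { |item| yield item }
  end
end

io = StringIO.new
dump_batched((1..10).to_a, io, 3) # frames of 3, 3, 3 and 1 items
io.rewind
p load_batched(io).to_a # => [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]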
data/lib/spark/serializer/cartesian.rb
@@ -0,0 +1,13 @@
+ module Spark
+   module Serializer
+     class Cartesian < Pair
+
+       def aggregate(item1, item2)
+         item1.product(item2)
+       end
+
+     end
+   end
+ end
+
+ Spark::Serializer.register('cartesian', Spark::Serializer::Cartesian)
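Cartesian overrides Pair's aggregate so two deserialized batches combine into every left-right pair; Array#product does the work:

p [1, 2].product(%w[a b])
# => [[1, "a"], [1, "b"], [2, "a"], [2, "b"]]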
data/lib/spark/serializer/compressed.rb
@@ -0,0 +1,27 @@
+ module Spark
+   module Serializer
+     class Compressed < Base
+
+       def initialize(serializer)
+         @serializer = serializer
+       end
+
+       def dump(data)
+         Zlib::Deflate.deflate(@serializer.dump(data))
+       end
+
+       def load(data)
+         @serializer.load(Zlib::Inflate.inflate(data))
+       end
+
+     end
+   end
+ end
+
+ begin
+   # TODO: require only if it is necessary
+   require 'zlib'
+
+   Spark::Serializer.register('compress', 'compressed', Spark::Serializer::Compressed)
+ rescue LoadError
+ end
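Compressed is a decorator: it deflates whatever bytes the wrapped serializer produces and inflates them again before deserialization. A stdlib-only round trip of the idea, with Marshal standing in for the wrapped serializer:

require 'zlib'

payload = Marshal.dump(Array.new(1_000) { 'repetitive data' })
deflated = Zlib::Deflate.deflate(payload)
puts "#{payload.bytesize} -> #{deflated.bytesize} bytes" # repetition compresses well

restored = Marshal.load(Zlib::Inflate.inflate(deflated))
puts restored.size # => 1000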
data/lib/spark/serializer/marshal.rb
@@ -0,0 +1,17 @@
+ module Spark
+   module Serializer
+     class Marshal < Base
+
+       def dump(data)
+         ::Marshal.dump(data)
+       end
+
+       def load(data)
+         ::Marshal.load(data)
+       end
+
+     end
+   end
+ end
+
+ Spark::Serializer.register('marshal', Spark::Serializer::Marshal)
data/lib/spark/serializer/message_pack.rb
@@ -0,0 +1,23 @@
+ module Spark
+   module Serializer
+     class MessagePack < Base
+
+       def dump(data)
+         ::MessagePack.dump(data)
+       end
+
+       def load(data)
+         ::MessagePack.load(data)
+       end
+
+     end
+   end
+ end
+
+ begin
+   # TODO: require only if it is necessary
+   require 'msgpack'
+
+   Spark::Serializer.register('messagepack', 'message_pack', 'msgpack', 'msg_pack', Spark::Serializer::MessagePack)
+ rescue LoadError
+ end
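Like Compressed above, MessagePack registers itself inside begin/require/rescue LoadError: when the msgpack gem is missing, the backend simply never appears in the registry instead of breaking the library at load time. The pattern in isolation (AVAILABLE is a hypothetical stand-in for the registry):

AVAILABLE = {}

begin
  require 'msgpack'
  AVAILABLE['msgpack'] = ->(obj) { MessagePack.pack(obj) }
rescue LoadError
  # msgpack gem not installed; the backend just never registers
end

puts AVAILABLE.keys.inspect # ["msgpack"] or [] depending on the environment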