ruby-spark 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +185 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +7 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/custom_marshal.rb +94 -0
  12. data/benchmark/digest.rb +150 -0
  13. data/benchmark/enumerator.rb +88 -0
  14. data/benchmark/performance/prepare.sh +18 -0
  15. data/benchmark/performance/python.py +156 -0
  16. data/benchmark/performance/r.r +69 -0
  17. data/benchmark/performance/ruby.rb +167 -0
  18. data/benchmark/performance/run-all.sh +160 -0
  19. data/benchmark/performance/scala.scala +181 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/ext/ruby_c/extconf.rb +3 -0
  27. data/ext/ruby_c/murmur.c +158 -0
  28. data/ext/ruby_c/murmur.h +9 -0
  29. data/ext/ruby_c/ruby-spark.c +18 -0
  30. data/ext/ruby_java/Digest.java +36 -0
  31. data/ext/ruby_java/Murmur2.java +98 -0
  32. data/ext/ruby_java/RubySparkExtService.java +28 -0
  33. data/ext/ruby_java/extconf.rb +3 -0
  34. data/ext/spark/build.sbt +73 -0
  35. data/ext/spark/project/plugins.sbt +9 -0
  36. data/ext/spark/sbt/sbt +34 -0
  37. data/ext/spark/src/main/scala/Exec.scala +91 -0
  38. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  39. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  40. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  41. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  42. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  43. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  44. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  46. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  47. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  48. data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
  49. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  50. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  51. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  52. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  53. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  54. data/lib/ruby-spark.rb +1 -0
  55. data/lib/spark.rb +198 -0
  56. data/lib/spark/accumulator.rb +260 -0
  57. data/lib/spark/broadcast.rb +98 -0
  58. data/lib/spark/build.rb +43 -0
  59. data/lib/spark/cli.rb +169 -0
  60. data/lib/spark/command.rb +86 -0
  61. data/lib/spark/command/base.rb +154 -0
  62. data/lib/spark/command/basic.rb +345 -0
  63. data/lib/spark/command/pair.rb +124 -0
  64. data/lib/spark/command/sort.rb +51 -0
  65. data/lib/spark/command/statistic.rb +144 -0
  66. data/lib/spark/command_builder.rb +141 -0
  67. data/lib/spark/command_validator.rb +34 -0
  68. data/lib/spark/config.rb +244 -0
  69. data/lib/spark/constant.rb +14 -0
  70. data/lib/spark/context.rb +304 -0
  71. data/lib/spark/error.rb +50 -0
  72. data/lib/spark/ext/hash.rb +41 -0
  73. data/lib/spark/ext/integer.rb +25 -0
  74. data/lib/spark/ext/io.rb +57 -0
  75. data/lib/spark/ext/ip_socket.rb +29 -0
  76. data/lib/spark/ext/module.rb +58 -0
  77. data/lib/spark/ext/object.rb +24 -0
  78. data/lib/spark/ext/string.rb +24 -0
  79. data/lib/spark/helper.rb +10 -0
  80. data/lib/spark/helper/logger.rb +40 -0
  81. data/lib/spark/helper/parser.rb +85 -0
  82. data/lib/spark/helper/serialize.rb +71 -0
  83. data/lib/spark/helper/statistic.rb +93 -0
  84. data/lib/spark/helper/system.rb +42 -0
  85. data/lib/spark/java_bridge.rb +19 -0
  86. data/lib/spark/java_bridge/base.rb +203 -0
  87. data/lib/spark/java_bridge/jruby.rb +23 -0
  88. data/lib/spark/java_bridge/rjb.rb +41 -0
  89. data/lib/spark/logger.rb +76 -0
  90. data/lib/spark/mllib.rb +100 -0
  91. data/lib/spark/mllib/classification/common.rb +31 -0
  92. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  93. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  94. data/lib/spark/mllib/classification/svm.rb +135 -0
  95. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  96. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  97. data/lib/spark/mllib/matrix.rb +120 -0
  98. data/lib/spark/mllib/regression/common.rb +73 -0
  99. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  100. data/lib/spark/mllib/regression/lasso.rb +100 -0
  101. data/lib/spark/mllib/regression/linear.rb +124 -0
  102. data/lib/spark/mllib/regression/ridge.rb +97 -0
  103. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  104. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  105. data/lib/spark/mllib/stat/distribution.rb +12 -0
  106. data/lib/spark/mllib/vector.rb +185 -0
  107. data/lib/spark/rdd.rb +1328 -0
  108. data/lib/spark/sampler.rb +92 -0
  109. data/lib/spark/serializer.rb +24 -0
  110. data/lib/spark/serializer/base.rb +170 -0
  111. data/lib/spark/serializer/cartesian.rb +37 -0
  112. data/lib/spark/serializer/marshal.rb +19 -0
  113. data/lib/spark/serializer/message_pack.rb +25 -0
  114. data/lib/spark/serializer/oj.rb +25 -0
  115. data/lib/spark/serializer/pair.rb +27 -0
  116. data/lib/spark/serializer/utf8.rb +25 -0
  117. data/lib/spark/sort.rb +189 -0
  118. data/lib/spark/stat_counter.rb +125 -0
  119. data/lib/spark/storage_level.rb +39 -0
  120. data/lib/spark/version.rb +3 -0
  121. data/lib/spark/worker/master.rb +144 -0
  122. data/lib/spark/worker/spark_files.rb +15 -0
  123. data/lib/spark/worker/worker.rb +197 -0
  124. data/ruby-spark.gemspec +36 -0
  125. data/spec/generator.rb +37 -0
  126. data/spec/inputs/lorem_300.txt +316 -0
  127. data/spec/inputs/numbers/1.txt +50 -0
  128. data/spec/inputs/numbers/10.txt +50 -0
  129. data/spec/inputs/numbers/11.txt +50 -0
  130. data/spec/inputs/numbers/12.txt +50 -0
  131. data/spec/inputs/numbers/13.txt +50 -0
  132. data/spec/inputs/numbers/14.txt +50 -0
  133. data/spec/inputs/numbers/15.txt +50 -0
  134. data/spec/inputs/numbers/16.txt +50 -0
  135. data/spec/inputs/numbers/17.txt +50 -0
  136. data/spec/inputs/numbers/18.txt +50 -0
  137. data/spec/inputs/numbers/19.txt +50 -0
  138. data/spec/inputs/numbers/2.txt +50 -0
  139. data/spec/inputs/numbers/20.txt +50 -0
  140. data/spec/inputs/numbers/3.txt +50 -0
  141. data/spec/inputs/numbers/4.txt +50 -0
  142. data/spec/inputs/numbers/5.txt +50 -0
  143. data/spec/inputs/numbers/6.txt +50 -0
  144. data/spec/inputs/numbers/7.txt +50 -0
  145. data/spec/inputs/numbers/8.txt +50 -0
  146. data/spec/inputs/numbers/9.txt +50 -0
  147. data/spec/inputs/numbers_0_100.txt +101 -0
  148. data/spec/inputs/numbers_1_100.txt +100 -0
  149. data/spec/lib/collect_spec.rb +42 -0
  150. data/spec/lib/command_spec.rb +68 -0
  151. data/spec/lib/config_spec.rb +64 -0
  152. data/spec/lib/context_spec.rb +163 -0
  153. data/spec/lib/ext_spec.rb +72 -0
  154. data/spec/lib/external_apps_spec.rb +45 -0
  155. data/spec/lib/filter_spec.rb +80 -0
  156. data/spec/lib/flat_map_spec.rb +100 -0
  157. data/spec/lib/group_spec.rb +109 -0
  158. data/spec/lib/helper_spec.rb +19 -0
  159. data/spec/lib/key_spec.rb +41 -0
  160. data/spec/lib/manipulation_spec.rb +114 -0
  161. data/spec/lib/map_partitions_spec.rb +87 -0
  162. data/spec/lib/map_spec.rb +91 -0
  163. data/spec/lib/mllib/classification_spec.rb +54 -0
  164. data/spec/lib/mllib/clustering_spec.rb +35 -0
  165. data/spec/lib/mllib/matrix_spec.rb +32 -0
  166. data/spec/lib/mllib/regression_spec.rb +116 -0
  167. data/spec/lib/mllib/vector_spec.rb +77 -0
  168. data/spec/lib/reduce_by_key_spec.rb +118 -0
  169. data/spec/lib/reduce_spec.rb +131 -0
  170. data/spec/lib/sample_spec.rb +46 -0
  171. data/spec/lib/serializer_spec.rb +13 -0
  172. data/spec/lib/sort_spec.rb +58 -0
  173. data/spec/lib/statistic_spec.rb +168 -0
  174. data/spec/lib/whole_text_files_spec.rb +33 -0
  175. data/spec/spec_helper.rb +39 -0
  176. metadata +301 -0
@@ -0,0 +1,92 @@
1
require 'distribution'

# Random Generators
module Spark
  module RandomGenerator
    # Poisson-distributed random integers, generated by accumulating
    # exponential inter-arrival times until their sum exceeds 1.
    class Poisson

      # mean:: expected value of the distribution (rate of arrivals)
      # seed:: seed for the underlying PRNG, so sequences are reproducible
      def initialize(mean, seed)
        generator = Random.new(seed)
        @exp_rng = Distribution::Exponential.rng(1.0/mean, random: generator)
      end

      # Draw one Poisson-distributed non-negative integer.
      def rand
        elapsed = 0.0
        count = 0

        loop do
          elapsed += @exp_rng.call
          # Number of arrivals that fit into one unit of time.
          return count if elapsed > 1
          count += 1
        end
      end

    end
  end
end
30
# Samplers
module Spark
  module Sampler

    # Common state shared by every sampler: the sampling fraction and the
    # PRNG seed (a fresh random seed is generated when none is supplied).
    class Base
      attr_reader :fraction, :seed

      def initialize(fraction, seed=nil)
        @fraction = fraction
        @seed = seed || Random.new_seed
      end
    end

    # Poisson Sampler
    # -------------------------------------------------------------------------
    # Sampling with replacement: each item is emitted a Poisson-distributed
    # number of times (possibly zero times).
    class Poisson < Base

      # Mutates +iterator+ in place and returns it.
      def sample(iterator)
        iterator.map! do |item|
          Array.new(rng.rand) { item }
        end
        iterator.flatten!
        iterator.compact!
        iterator
      end

      # Lazy variant: yields each value as many times as the generator says.
      def lazy_sample(iterator)
        Enumerator::Lazy.new(iterator) do |yielder, value|
          rng.rand.times { yielder << value }
        end
      end

      # Memoized Poisson generator seeded deterministically.
      def rng
        @rng ||= Spark::RandomGenerator::Poisson.new(fraction, seed)
      end

    end

    # Uniform Sampler
    # -------------------------------------------------------------------------
    # Sampling without replacement: each item is kept independently with
    # probability +fraction+.
    class Uniform < Base

      # Mutates +iterator+ in place and returns it.
      def sample(iterator)
        iterator.select! { |_item| rng.rand <= fraction }
        iterator
      end

      # Lazy/non-destructive variant.
      def lazy_sample(iterator)
        iterator.select { |_item| rng.rand <= fraction }
      end

      # Memoized PRNG seeded deterministically.
      def rng
        @rng ||= Random.new(seed)
      end

    end

  end
end
@@ -0,0 +1,24 @@
1
module Spark
  module Serializer
    autoload :Base,        'spark/serializer/base'
    autoload :UTF8,        'spark/serializer/utf8'
    autoload :Marshal,     'spark/serializer/marshal'
    autoload :MessagePack, 'spark/serializer/message_pack'
    autoload :Oj,          'spark/serializer/oj'
    autoload :Pair,        'spark/serializer/pair'
    autoload :Cartesian,   'spark/serializer/cartesian'

    DEFAULT_BATCH_SIZE = 1024
    DEFAULT_SERIALIZER_NAME = 'marshal'

    # Look up a serializer class by name (e.g. :marshal => Marshal).
    # Returns nil when no such serializer exists.
    #
    # Only NameError is rescued (the original bare rescue hid every
    # StandardError, masking unrelated failures).
    def self.get(suggestion)
      const_get(suggestion.to_s.camelize)
    rescue NameError
      nil
    end

    # Same as +get+ but raises Spark::NotImplemented for an unknown name.
    def self.get!(suggestion)
      const_get(suggestion.to_s.camelize)
    rescue NameError
      raise Spark::NotImplemented, "Serializer #{suggestion.to_s.camelize} does not exist."
    end
  end
end
@@ -0,0 +1,170 @@
1
module Spark
  module Serializer
    # @abstract Parent for all type of serializers
    #
    # Subclasses implement +serialize+/+deserialize+; this class handles
    # batching and the length-prefixed wire format.
    class Base

      include Spark::Helper::Serialize
      include Spark::Constant

      attr_reader :batch_size

      # Set default values
      def initialize(batch_size=nil)
        self.batch_size = batch_size
      end

      # Serializers are equal when they have the same class and batch size.
      def ==(other)
        self.class == other.class && self.batch_size == other.batch_size
      end

      # Set values given by user
      def set(batch_size)
        self.batch_size = batch_size unless batch_size.nil?
        self
      end

      def batch_size=(size)
        @batch_size = size.to_i
      end

      # Disable batching (each item is serialized on its own).
      def unbatch!
        self.batch_size = 1
      end

      # nil, 0, 1 are considered as non-batched
      def batched?
        batch_size > 1
      end

      # ===========================================================================
      # Load

      # Load and deserialize an Array from IO, Array of Java iterator
      #   mri:   respond_to?(:iterator) => false
      #   jruby: respond_to?(:iterator) => true
      def load(source)
        # Tempfile is a Delegator for File so it is not an IO
        # (another way would be __getobj__.is_a?(IO))
        if source.is_a?(IO) || source.is_a?(Tempfile)
          load_from_io(source)
        elsif try(source, :iterator)
          load_from_iterator(source.iterator)
        end
      end

      # Load data from IO. Data must have a format:
      #
      #   +------------+--------+
      #   | signed int | data   |
      #   |     4B     |        |
      #   +------------+--------+
      #
      # Returns an Enumerator unless a block is given.
      def load_from_io(io)
        return to_enum(__callee__, io) unless block_given?

        loop do
          length = read_int(io)
          break if length == DATA_EOF

          result = load_next_from_io(io, length)
          if batched? && result.respond_to?(:each)
            result.each { |item| yield item }
          else
            yield result
          end
        end
      end

      # Read one record of +length+ bytes from +io+ and deserialize it.
      def load_next_from_io(io, length)
        deserialize(io.read(length))
      end

      # Load from Java iterator by calling hasNext and next
      def load_from_iterator(iterator)
        result = []
        while iterator.hasNext
          item = iterator.next

          # mri:   data are String
          # jruby: data are bytes Array
          if item.is_a?(String)
            # Serialized data
            result << deserialize(item)
          elsif try(item, :getClass)
            # Java object
            case item.getClass.name
            when '[B'
              # Array of bytes
              result << deserialize(pack_unsigned_chars(item.to_a))
            when 'scala.Tuple2'
              # Tuple2 — two-argument deserialize is provided by Cartesian
              result << deserialize(item._1, item._2)
            end
          end
        end

        result.flatten!(1) if batched?
        result
      end

      # Read a 4-byte length prefix; DATA_EOF when the stream is exhausted.
      def read_int(io)
        bytes = io.read(4)
        return DATA_EOF if bytes.nil?
        unpack_int(bytes)
      end

      # ===========================================================================
      # Dump

      # Serialize and send data into IO. Check 'load_from_io' for data format.
      def dump(data, io)
        unless data.is_a?(Array) || data.is_a?(Enumerator)
          data = [data]
        end
        data = data.each_slice(batch_size) if batched?

        data.each do |item|
          serialized = serialize(item)

          # Size and data can have different encoding
          #   Marshal: both ASCII
          #   Oj:      ASCII and UTF-8
          io.write(pack_int(serialized.bytesize))
          io.write(serialized)
        end

        io.flush
      end

      # For direct serialization (mutates +data+ in place).
      def dump_to_java(data)
        data.map! do |item|
          serialize(item).to_java_bytes
        end
      end

      # True when calling +method+ on +object+ does not raise.
      # NOTE: this *invokes* the method, it does not just probe for it.
      #
      # A specific rescue class cannot be used:
      #   mri   => RuntimeError
      #   jruby => NoMethodError
      def try(object, method)
        object.__send__(method)
        true
      rescue StandardError
        false
      end

    end
  end
end
@@ -0,0 +1,37 @@
1
module Spark
  module Serializer
    # Deserializer for the cartesian product of two RDDs. Every record
    # carries two payloads — one for each side of the product.
    class Cartesian < Base

      attr_reader :first, :second

      # first, second:: serializers of the two source RDDs
      def set(first, second)
        @first = first
        @second = second
        self
      end

      # Little hack
      # Data does not have to be batched but items are added by <<
      def batched?
        true
      end

      # Read one record: +length+ bytes for the first item, then a
      # length-prefixed string for the second.
      # NOTE(review): IO#read_string looks like a project extension of IO —
      # confirm against lib/spark/ext/io.rb.
      def load_next_from_io(io, length)
        item1 = io.read(length)
        item2 = io.read_string
        deserialize(item1, item2)
      end

      # Deserialize both sides and return their cartesian product.
      def deserialize(item1, item2)
        deserialized_item1 = @first.deserialize(item1)
        deserialized_item2 = @second.deserialize(item2)

        # Non-batched serializers yield a single item; wrap it so
        # Array#product can be applied uniformly.
        deserialized_item1 = [deserialized_item1] unless @first.batched?
        deserialized_item2 = [deserialized_item2] unless @second.batched?

        deserialized_item1.product(deserialized_item2)
      end

    end
  end
end
@@ -0,0 +1,19 @@
1
module Spark
  module Serializer
    # Serializer backed by Ruby's built-in Marshal format.
    class Marshal < Base

      # Registry name of this serializer.
      def name
        'marshal'
      end

      # Encode +data+ into a Marshal byte string.
      def serialize(data)
        ::Marshal.dump(data)
      end

      # Decode a Marshal byte string back into Ruby objects.
      def deserialize(data)
        ::Marshal.load(data)
      end

    end
  end
end
@@ -0,0 +1,25 @@
1
module Spark
  module Serializer
    # Serializer backed by the msgpack gem. Falls back to Marshal (see the
    # rescue at the bottom of the file) when msgpack is not installed.
    class MessagePack < Marshal

      # Registry name of this serializer.
      def name
        'message_pack'
      end

      # BUG FIX: these were class methods (def self.serialize), so
      # instances kept inheriting Marshal's *instance* methods and
      # MessagePack encoding was never actually used. They must be
      # instance methods to override the parent.
      def serialize(data)
        ::MessagePack.dump(data)
      end

      def deserialize(data)
        ::MessagePack.load(data)
      end

    end
  end
end

begin
  require 'msgpack'
rescue LoadError
  # msgpack gem missing — transparently degrade to Marshal.
  Spark::Serializer::MessagePack = Spark::Serializer::Marshal
end
@@ -0,0 +1,25 @@
1
module Spark
  module Serializer
    # Serializer backed by the Oj (Optimized JSON) gem. Falls back to
    # Marshal (see the rescue at the bottom of the file) when Oj is not
    # installed.
    class Oj < Marshal

      # Registry name of this serializer.
      def name
        'oj'
      end

      # Encode +data+ as JSON via Oj.
      def serialize(data)
        ::Oj.dump(data)
      end

      # Decode JSON produced by +serialize+.
      def deserialize(data)
        ::Oj.load(data)
      end

    end
  end
end

begin
  require 'oj'
rescue LoadError
  # Oj gem missing — transparently degrade to Marshal.
  Spark::Serializer::Oj = Spark::Serializer::Marshal
end
@@ -0,0 +1,27 @@
1
module Spark
  module Serializer
    # Deserializer for key-value records: each record is two consecutive
    # length-prefixed payloads, one handled by each serializer.
    class Pair < Base

      attr_reader :first, :second

      # first::  serializer for keys
      # second:: serializer for values
      def set(first, second)
        unbatch!
        @first = first
        @second = second
        self
      end

      # Pairs are always read one at a time.
      def batched?
        false
      end

      # Read one [key, value] pair. +length+ is the byte length of the key
      # payload; the value payload carries its own 4-byte length prefix.
      def load_next_from_io(io, length)
        key_value = []
        key_value << @first.load_next_from_io(io, length)
        key_value << @second.load_next_from_io(io, read_int(io))
        key_value
      end

    end
  end
end
@@ -0,0 +1,25 @@
1
module Spark
  module Serializer
    ##
    # Used for files.
    #
    # A file is sent as a plain String, but the worker still goes through
    # the serialization interface, so this serializer just reads raw bytes
    # and tags them as UTF-8.
    #
    class UTF8 < Base

      # Batch size is irrelevant for raw strings; always unbatched.
      def set(*)
        unbatch!
        self
      end

      def batched?
        false
      end

      # Read +length+ raw bytes and force UTF-8 encoding.
      def load_next_from_io(io, length)
        io.read(length).force_encoding(Encoding::UTF_8)
      end

    end
  end
end