ruby-spark 1.1.0.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +252 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +6 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/comparison/prepare.sh +18 -0
  12. data/benchmark/comparison/python.py +156 -0
  13. data/benchmark/comparison/r.r +69 -0
  14. data/benchmark/comparison/ruby.rb +167 -0
  15. data/benchmark/comparison/run-all.sh +160 -0
  16. data/benchmark/comparison/scala.scala +181 -0
  17. data/benchmark/custom_marshal.rb +94 -0
  18. data/benchmark/digest.rb +150 -0
  19. data/benchmark/enumerator.rb +88 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/example/website_search.rb +83 -0
  27. data/ext/ruby_c/extconf.rb +3 -0
  28. data/ext/ruby_c/murmur.c +158 -0
  29. data/ext/ruby_c/murmur.h +9 -0
  30. data/ext/ruby_c/ruby-spark.c +18 -0
  31. data/ext/ruby_java/Digest.java +36 -0
  32. data/ext/ruby_java/Murmur2.java +98 -0
  33. data/ext/ruby_java/RubySparkExtService.java +28 -0
  34. data/ext/ruby_java/extconf.rb +3 -0
  35. data/ext/spark/build.sbt +73 -0
  36. data/ext/spark/project/plugins.sbt +9 -0
  37. data/ext/spark/sbt/sbt +34 -0
  38. data/ext/spark/src/main/scala/Exec.scala +91 -0
  39. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  40. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  41. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  42. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  43. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  44. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  46. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  47. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  48. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  49. data/ext/spark/src/main/scala/RubyRDD.scala +392 -0
  50. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  51. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  52. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  53. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  54. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  55. data/lib/ruby-spark.rb +1 -0
  56. data/lib/spark.rb +198 -0
  57. data/lib/spark/accumulator.rb +260 -0
  58. data/lib/spark/broadcast.rb +98 -0
  59. data/lib/spark/build.rb +43 -0
  60. data/lib/spark/cli.rb +169 -0
  61. data/lib/spark/command.rb +86 -0
  62. data/lib/spark/command/base.rb +158 -0
  63. data/lib/spark/command/basic.rb +345 -0
  64. data/lib/spark/command/pair.rb +124 -0
  65. data/lib/spark/command/sort.rb +51 -0
  66. data/lib/spark/command/statistic.rb +144 -0
  67. data/lib/spark/command_builder.rb +141 -0
  68. data/lib/spark/command_validator.rb +34 -0
  69. data/lib/spark/config.rb +238 -0
  70. data/lib/spark/constant.rb +14 -0
  71. data/lib/spark/context.rb +322 -0
  72. data/lib/spark/error.rb +50 -0
  73. data/lib/spark/ext/hash.rb +41 -0
  74. data/lib/spark/ext/integer.rb +25 -0
  75. data/lib/spark/ext/io.rb +67 -0
  76. data/lib/spark/ext/ip_socket.rb +29 -0
  77. data/lib/spark/ext/module.rb +58 -0
  78. data/lib/spark/ext/object.rb +24 -0
  79. data/lib/spark/ext/string.rb +24 -0
  80. data/lib/spark/helper.rb +10 -0
  81. data/lib/spark/helper/logger.rb +40 -0
  82. data/lib/spark/helper/parser.rb +85 -0
  83. data/lib/spark/helper/serialize.rb +71 -0
  84. data/lib/spark/helper/statistic.rb +93 -0
  85. data/lib/spark/helper/system.rb +42 -0
  86. data/lib/spark/java_bridge.rb +19 -0
  87. data/lib/spark/java_bridge/base.rb +203 -0
  88. data/lib/spark/java_bridge/jruby.rb +23 -0
  89. data/lib/spark/java_bridge/rjb.rb +41 -0
  90. data/lib/spark/logger.rb +76 -0
  91. data/lib/spark/mllib.rb +100 -0
  92. data/lib/spark/mllib/classification/common.rb +31 -0
  93. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  94. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  95. data/lib/spark/mllib/classification/svm.rb +135 -0
  96. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  97. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  98. data/lib/spark/mllib/matrix.rb +120 -0
  99. data/lib/spark/mllib/regression/common.rb +73 -0
  100. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  101. data/lib/spark/mllib/regression/lasso.rb +100 -0
  102. data/lib/spark/mllib/regression/linear.rb +124 -0
  103. data/lib/spark/mllib/regression/ridge.rb +97 -0
  104. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  105. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  106. data/lib/spark/mllib/stat/distribution.rb +12 -0
  107. data/lib/spark/mllib/vector.rb +185 -0
  108. data/lib/spark/rdd.rb +1377 -0
  109. data/lib/spark/sampler.rb +92 -0
  110. data/lib/spark/serializer.rb +79 -0
  111. data/lib/spark/serializer/auto_batched.rb +59 -0
  112. data/lib/spark/serializer/base.rb +63 -0
  113. data/lib/spark/serializer/batched.rb +84 -0
  114. data/lib/spark/serializer/cartesian.rb +13 -0
  115. data/lib/spark/serializer/compressed.rb +27 -0
  116. data/lib/spark/serializer/marshal.rb +17 -0
  117. data/lib/spark/serializer/message_pack.rb +23 -0
  118. data/lib/spark/serializer/oj.rb +23 -0
  119. data/lib/spark/serializer/pair.rb +41 -0
  120. data/lib/spark/serializer/text.rb +25 -0
  121. data/lib/spark/sort.rb +189 -0
  122. data/lib/spark/stat_counter.rb +125 -0
  123. data/lib/spark/storage_level.rb +39 -0
  124. data/lib/spark/version.rb +3 -0
  125. data/lib/spark/worker/master.rb +144 -0
  126. data/lib/spark/worker/spark_files.rb +15 -0
  127. data/lib/spark/worker/worker.rb +200 -0
  128. data/ruby-spark.gemspec +47 -0
  129. data/spec/generator.rb +37 -0
  130. data/spec/inputs/lorem_300.txt +316 -0
  131. data/spec/inputs/numbers/1.txt +50 -0
  132. data/spec/inputs/numbers/10.txt +50 -0
  133. data/spec/inputs/numbers/11.txt +50 -0
  134. data/spec/inputs/numbers/12.txt +50 -0
  135. data/spec/inputs/numbers/13.txt +50 -0
  136. data/spec/inputs/numbers/14.txt +50 -0
  137. data/spec/inputs/numbers/15.txt +50 -0
  138. data/spec/inputs/numbers/16.txt +50 -0
  139. data/spec/inputs/numbers/17.txt +50 -0
  140. data/spec/inputs/numbers/18.txt +50 -0
  141. data/spec/inputs/numbers/19.txt +50 -0
  142. data/spec/inputs/numbers/2.txt +50 -0
  143. data/spec/inputs/numbers/20.txt +50 -0
  144. data/spec/inputs/numbers/3.txt +50 -0
  145. data/spec/inputs/numbers/4.txt +50 -0
  146. data/spec/inputs/numbers/5.txt +50 -0
  147. data/spec/inputs/numbers/6.txt +50 -0
  148. data/spec/inputs/numbers/7.txt +50 -0
  149. data/spec/inputs/numbers/8.txt +50 -0
  150. data/spec/inputs/numbers/9.txt +50 -0
  151. data/spec/inputs/numbers_0_100.txt +101 -0
  152. data/spec/inputs/numbers_1_100.txt +100 -0
  153. data/spec/lib/collect_spec.rb +42 -0
  154. data/spec/lib/command_spec.rb +68 -0
  155. data/spec/lib/config_spec.rb +64 -0
  156. data/spec/lib/context_spec.rb +165 -0
  157. data/spec/lib/ext_spec.rb +72 -0
  158. data/spec/lib/external_apps_spec.rb +45 -0
  159. data/spec/lib/filter_spec.rb +80 -0
  160. data/spec/lib/flat_map_spec.rb +100 -0
  161. data/spec/lib/group_spec.rb +109 -0
  162. data/spec/lib/helper_spec.rb +19 -0
  163. data/spec/lib/key_spec.rb +41 -0
  164. data/spec/lib/manipulation_spec.rb +122 -0
  165. data/spec/lib/map_partitions_spec.rb +87 -0
  166. data/spec/lib/map_spec.rb +91 -0
  167. data/spec/lib/mllib/classification_spec.rb +54 -0
  168. data/spec/lib/mllib/clustering_spec.rb +35 -0
  169. data/spec/lib/mllib/matrix_spec.rb +32 -0
  170. data/spec/lib/mllib/regression_spec.rb +116 -0
  171. data/spec/lib/mllib/vector_spec.rb +77 -0
  172. data/spec/lib/reduce_by_key_spec.rb +118 -0
  173. data/spec/lib/reduce_spec.rb +131 -0
  174. data/spec/lib/sample_spec.rb +46 -0
  175. data/spec/lib/serializer_spec.rb +88 -0
  176. data/spec/lib/sort_spec.rb +58 -0
  177. data/spec/lib/statistic_spec.rb +170 -0
  178. data/spec/lib/whole_text_files_spec.rb +33 -0
  179. data/spec/spec_helper.rb +38 -0
  180. metadata +389 -0
data/lib/spark/command/pair.rb
@@ -0,0 +1,124 @@
+_Base = Spark::Command::Base
+
+# -------------------------------------------------------------------------------------------------
+# CombineByKey
+
+class Spark::Command::CombineByKey
+
+  # ---------------
+
+  class Base < Spark::Command::Base
+    def run(iterator, *)
+      _run(iterator).to_a
+    end
+
+    def lazy_run(iterator, *)
+      _run(iterator).lazy
+    end
+  end
+
+  # ---------------
+
+  class Combine < Base
+    variable :create_combiner
+    variable :merge_value
+
+    def _run(iterator)
+      # Do not use combiners[key] ||= ...
+      # because it tests for nil, not has_key?
+      combiners = {}
+      iterator.each do |key, value|
+        if combiners.has_key?(key)
+          combiners[key] = @merge_value.call(combiners[key], value)
+        else
+          combiners[key] = @create_combiner.call(value)
+        end
+      end
+      combiners
+    end
+  end
+
+  # ---------------
+
+  class Merge < Base
+    variable :merge_combiners
+
+    def _run(iterator, *)
+      combiners = {}
+      iterator.each do |key, value|
+        if combiners.has_key?(key)
+          combiners[key] = @merge_combiners.call(combiners[key], value)
+        else
+          combiners[key] = value
+        end
+      end
+      combiners
+    end
+  end
+
+  # ---------------
+
+  class CombineWithZero < Base
+    variable :zero_value, function: false, type: Object
+    variable :merge_value
+
+    def _run(iterator)
+      # Do not use combiners[key] ||= ...
+      # because it tests for nil, not has_key?
+      combiners = {}
+      iterator.each do |key, value|
+        unless combiners.has_key?(key)
+          combiners[key] = @zero_value
+        end
+
+        combiners[key] = @merge_value.call(combiners[key], value)
+      end
+      combiners
+    end
+  end
+
+
+  # ---------------
+
+end
+
+# -------------------------------------------------------------------------------------------------
+# MapValues
+
+class Spark::Command::MapValues < _Base
+  variable :map_function
+
+  def run(iterator, *)
+    iterator.map! do |item|
+      item[1] = @map_function.call(item[1])
+      item
+    end
+    iterator
+  end
+
+  def lazy_run(iterator, *)
+    iterator.map do |item|
+      item[1] = @map_function.call(item[1])
+      item
+    end
+  end
+end
+
+# -------------------------------------------------------------------------------------------------
+# FlatMapValues
+
+class Spark::Command::FlatMapValues < _Base
+  variable :map_function
+
+  def run(iterator, *)
+    iterator.map! do |(key, values)|
+      values = @map_function.call(values)
+      values.flatten!(1)
+      values.map! do |value|
+        [key, value]
+      end
+    end
+    iterator.flatten!(1)
+    iterator
+  end
+end
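
Note: all three CombineByKey variants above reduce to the same hash-accumulation loop, with `has_key?` checked explicitly so that legitimately nil or false combiner values are not re-created. A minimal standalone sketch of that pattern in plain Ruby (no ruby-spark classes; the two lambdas are hypothetical stand-ins for the gem's serialized functions):

    # Word-count style combine-by-key over [key, value] pairs.
    pairs = [['a', 1], ['b', 1], ['a', 2], ['a', 3]]

    create_combiner = lambda { |value| value }                     # first value seen for a key
    merge_value     = lambda { |combiner, value| combiner + value }

    combiners = {}
    pairs.each do |key, value|
      if combiners.has_key?(key)
        combiners[key] = merge_value.call(combiners[key], value)
      else
        combiners[key] = create_combiner.call(value)
      end
    end

    combiners # => {"a"=>6, "b"=>1}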
data/lib/spark/command/sort.rb
@@ -0,0 +1,51 @@
+_Base = Spark::Command::Base
+
+# -------------------------------------------------------------------------------------------------
+# Sort
+
+class Spark::Command::SortByKey < _Base
+  variable :key_function
+  variable :ascending, function: false, type: [TrueClass, FalseClass]
+  variable :spilling, function: false, type: [TrueClass, FalseClass]
+  variable :memory, function: false, type: [Numeric, NilClass]
+  variable :serializer, function: false, type: Spark::Serializer::Base
+
+  # Currently disabled
+  def before_run
+    @spilling = false
+  end
+
+  def run(iterator, _)
+    if @spilling
+      iterator = run_with_spilling(iterator.each)
+    else
+      run_without_spilling(iterator)
+    end
+
+    iterator
+  end
+
+  def run_with_enum(iterator, _)
+    if @spilling
+      iterator = run_with_spilling(iterator)
+    else
+      iterator = iterator.to_a
+      run_without_spilling(iterator)
+    end
+
+    iterator
+  end
+
+  private
+
+  def run_with_spilling(iterator)
+    sorter = Spark::ExternalSorter.new(@memory, @serializer)
+    sorter.sort_by(iterator, @ascending, @key_function)
+  end
+
+  def run_without_spilling(iterator)
+    iterator.sort_by!(&@key_function)
+    iterator.reverse! unless @ascending
+  end
+
+end
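
Note: with spilling disabled in `before_run`, SortByKey always takes the in-memory path. A minimal sketch of what `run_without_spilling` does, assuming a plain array of pairs and a hypothetical key function:

    pairs = [[3, 'c'], [1, 'a'], [2, 'b']]
    key_function = lambda { |pair| pair[0] }   # hypothetical stand-in for @key_function
    ascending = false

    pairs.sort_by!(&key_function)              # ascending in-place sort
    pairs.reverse! unless ascending            # descending is just a reversal

    pairs # => [[3, "c"], [2, "b"], [1, "a"]]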
data/lib/spark/command/statistic.rb
@@ -0,0 +1,144 @@
+_Base = Spark::Command::Base
+
+# -------------------------------------------------------------------------------------------------
+# Sample
+
+class Spark::Command::Sample < _Base
+  variable :with_replacement, function: false, type: [TrueClass, FalseClass]
+  variable :fraction, function: false, type: Numeric
+  variable :seed, function: false, type: [NilClass, Numeric]
+
+  def run(iterator, _)
+    sampler.sample(iterator)
+  end
+
+  def lazy_run(iterator, _)
+    sampler.lazy_sample(iterator)
+  end
+
+  def sampler
+    @sampler ||= _sampler
+  end
+
+  def _sampler
+    if @with_replacement
+      sampler = Spark::Sampler::Poisson
+    else
+      sampler = Spark::Sampler::Uniform
+    end
+
+    sampler.new(@fraction, @seed)
+  end
+end
+
+# -------------------------------------------------------------------------------------------------
+# Stats
+
+class Spark::Command::Stats < _Base
+
+  def run(iterator, *)
+    [Spark::StatCounter.new(iterator)]
+  end
+
+  def lazy_run(iterator, *)
+    run(iterator)
+  end
+
+end
+
+# -------------------------------------------------------------------------------------------------
+# Histogram
+
+class Spark::Command::Histogram < _Base
+  include Spark::Helper::Statistic
+
+  variable :even, function: false, type: [TrueClass, FalseClass]
+  variable :buckets, function: false, type: Array
+
+  def run(iterator, *)
+    counters = Array.new(counter_size) { 0 }
+    iterator.each do |item|
+      if item.nil? || (item.is_a?(Float) && !item.finite?) || item > max || item < min
+        next
+      end
+
+      x = bucket_function.call(item)
+      if x.nil?
+        next
+      else
+        counters[x] += 1
+      end
+    end
+    [counters]
+  end
+
+  def lazy_run(iterator, *)
+    run(iterator)
+  end
+
+  private
+
+  def min
+    @buckets.first
+  end
+
+  def max
+    @buckets.last
+  end
+
+  def counter_size
+    @buckets.size - 1
+  end
+
+  def increment
+    @buckets[1] - @buckets[0]
+  end
+
+  # Decide which bucket function to pass. We decide here rather than having
+  # a general function so that the decision need only be made once.
+  def bucket_function
+    @bucket_function ||= _bucket_function
+  end
+
+  def _bucket_function
+    if @even
+      fast_bucket_function
+    else
+      basic_bucket_function
+    end
+  end
+
+  # Determine the bucket in constant time.
+  # Requires that buckets are evenly spaced.
+  def fast_bucket_function
+    Proc.new do |item|
+      if item.is_a?(Float) && item.nan?
+        nil
+      else
+        bucket_number = (item - min) / increment
+        if bucket_number > counter_size || bucket_number < 0
+          nil
+        else
+          [bucket_number.to_i, counter_size - 1].min
+        end
+      end
+    end
+  end
+
+  # Basic bucket function. Same as right bisect.
+  def basic_bucket_function
+    Proc.new do |item|
+      bucket_number = bisect_right(@buckets, item) - 1
+
+      # Counters size is @buckets.size - 1
+      # [bucket_number, counter_size - 1].min
+
+      if bucket_number > counter_size - 1
+        counter_size - 1
+      else
+        bucket_number
+      end
+    end
+  end
+
+end
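
Note: the two bucket functions above trade generality for speed: the fast path computes the bucket index arithmetically (valid only for evenly spaced buckets), while the basic path bisects the bucket boundaries. A standalone sketch of the basic path; `bisect_right` is reimplemented here for illustration (in the gem it comes from Spark::Helper::Statistic):

    # Index of the first element of a sorted array that is greater than item.
    def bisect_right(array, item)
      low, high = 0, array.size
      while low < high
        mid = (low + high) / 2
        if array[mid] > item
          high = mid
        else
          low = mid + 1
        end
      end
      low
    end

    buckets  = [0, 10, 20, 30]                 # 3 counters: [0,10), [10,20), [20,30]
    counters = Array.new(buckets.size - 1) { 0 }

    [1, 5, 12, 25, 30].each do |item|
      next if item < buckets.first || item > buckets.last
      bucket_number = bisect_right(buckets, item) - 1
      counters[[bucket_number, counters.size - 1].min] += 1
    end

    counters # => [2, 1, 2]

The clamp to `counters.size - 1` is what folds the topmost boundary value (30 here) into the last bucket, matching `basic_bucket_function` above.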
data/lib/spark/command_builder.rb
@@ -0,0 +1,141 @@
+require 'spark/command_validator'
+
+module Spark
+  ##
+  # Builder for building correct {Spark::Command}
+  #
+  class CommandBuilder
+
+    extend Forwardable
+
+    include Spark::Helper::Serialize
+    include Spark::Helper::System
+    include Spark::CommandValidator
+
+    attr_reader :command
+
+    def_delegators :@command, :serializer, :serializer=, :deserializer, :deserializer=, :commands,
+                              :commands=, :libraries, :libraries=, :bound_objects, :bound_objects=
+
+    def initialize(serializer, deserializer=nil)
+      create_command
+      self.serializer = serializer
+      self.deserializer = deserializer || serializer.dup
+    end
+
+    def create_command
+      @command = Spark::Command.new
+    end
+
+    # Do not use Marshal.dump(Marshal.load(self)) because some variables
+    # have marshal_dump prepared for the worker.
+    def deep_copy
+      copy = self.dup
+      copy.create_command
+      copy.serializer = self.serializer.deep_copy
+      copy.deserializer = self.deserializer.deep_copy
+      copy.commands = self.commands.dup
+      copy.libraries = self.libraries.dup
+      copy.bound_objects = self.bound_objects.dup
+      copy
+    end
+
+    # Serialize the Command class for the worker.
+    # Java uses signed numbers.
+    def build
+      unpack_chars(Marshal.dump(@command))
+    end
+
+    def add_command(klass, *args)
+      variables = klass.settings.variables
+      validate_size(variables, args)
+
+      built_args = []
+      variables.values.zip(args) do |var, arg|
+        if var[:function]
+          arg = serialize_function(arg)
+        end
+
+        validate(arg, var)
+        built_args << arg
+      end
+
+      comm = klass.new(*built_args)
+      @command.commands << comm
+      self
+    end
+
+    def add_library(*libraries)
+      @command.libraries += libraries
+    end
+
+    def bind(objects)
+      objects.symbolize_keys!
+      @command.bound_objects.merge!(objects)
+    end
+
+    private
+
+    # The serialized function can be a Proc or a Method.
+    #
+    # === Func
+    # * *string:* already serialized proc
+    # * *proc:* proc
+    # * *symbol:* name of method
+    # * *method:* Method class
+    #
+    def serialize_function(func)
+      case func
+      when String
+        serialize_function_from_string(func)
+      when Symbol
+        serialize_function_from_symbol(func)
+      when Proc
+        serialize_function_from_proc(func)
+      when Method
+        serialize_function_from_method(func)
+      else
+        raise Spark::CommandError, 'You must enter String, Symbol, Proc or Method.'
+      end
+    end
+
+    def serialize_function_from_string(string)
+      {type: 'proc', content: string}
+    end
+
+    def serialize_function_from_symbol(symbol)
+      {type: 'symbol', content: symbol}
+    end
+
+    # Serialize a Proc as a String.
+    #
+    #   lambda{|x| x*x}.to_source
+    #   # => "proc { |x| (x * x) }"
+    #
+    def serialize_function_from_proc(proc)
+      serialize_function_from_string(proc.to_source)
+    rescue
+      raise Spark::SerializeError, 'Proc cannot be serialized. Use String instead.'
+    end
+
+    # Serialize a Method as a String.
+    #
+    #   def test(x)
+    #     x*x
+    #   end
+    #   serialize_function_from_method(method(:test))
+    #
+    #   # => "def test(x)\n  x*x\nend\n"
+    #
+    def serialize_function_from_method(meth)
+      if pry?
+        meth = Pry::Method.new(meth)
+      end
+
+      {type: 'method', name: meth.name, content: meth.source}
+    rescue
+      raise Spark::SerializeError, 'Method cannot be serialized. Use a full path or a Proc.'
+    end
+
+  end
+end
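
Note: `serialize_function` normalizes everything the user can pass (String, Symbol, Proc, Method) into a tagged hash that the worker rebuilds on the other side. The Proc and Method branches additionally need source extraction (`Proc#to_source`, `Method#source`), presumably supplied by gems such as sourcify and method_source, which is why they are wrapped in rescue clauses. A minimal sketch of the two dependency-free branches:

    # Tagged-hash encoding for functions shipped to the worker.
    def serialize_function(func)
      case func
      when String then { type: 'proc',   content: func }  # already-serialized proc body
      when Symbol then { type: 'symbol', content: func }  # name of a method to call
      else raise ArgumentError, 'this sketch handles only String and Symbol'
      end
    end

    serialize_function('lambda { |x| x * x }')
    # => {:type=>"proc", :content=>"lambda { |x| x * x }"}
    serialize_function(:map)
    # => {:type=>"symbol", :content=>:map}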