ruby-spark 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +185 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +7 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/custom_marshal.rb +94 -0
  12. data/benchmark/digest.rb +150 -0
  13. data/benchmark/enumerator.rb +88 -0
  14. data/benchmark/performance/prepare.sh +18 -0
  15. data/benchmark/performance/python.py +156 -0
  16. data/benchmark/performance/r.r +69 -0
  17. data/benchmark/performance/ruby.rb +167 -0
  18. data/benchmark/performance/run-all.sh +160 -0
  19. data/benchmark/performance/scala.scala +181 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/ext/ruby_c/extconf.rb +3 -0
  27. data/ext/ruby_c/murmur.c +158 -0
  28. data/ext/ruby_c/murmur.h +9 -0
  29. data/ext/ruby_c/ruby-spark.c +18 -0
  30. data/ext/ruby_java/Digest.java +36 -0
  31. data/ext/ruby_java/Murmur2.java +98 -0
  32. data/ext/ruby_java/RubySparkExtService.java +28 -0
  33. data/ext/ruby_java/extconf.rb +3 -0
  34. data/ext/spark/build.sbt +73 -0
  35. data/ext/spark/project/plugins.sbt +9 -0
  36. data/ext/spark/sbt/sbt +34 -0
  37. data/ext/spark/src/main/scala/Exec.scala +91 -0
  38. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  39. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  40. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  41. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  42. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  43. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  44. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  46. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  47. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  48. data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
  49. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  50. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  51. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  52. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  53. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  54. data/lib/ruby-spark.rb +1 -0
  55. data/lib/spark.rb +198 -0
  56. data/lib/spark/accumulator.rb +260 -0
  57. data/lib/spark/broadcast.rb +98 -0
  58. data/lib/spark/build.rb +43 -0
  59. data/lib/spark/cli.rb +169 -0
  60. data/lib/spark/command.rb +86 -0
  61. data/lib/spark/command/base.rb +154 -0
  62. data/lib/spark/command/basic.rb +345 -0
  63. data/lib/spark/command/pair.rb +124 -0
  64. data/lib/spark/command/sort.rb +51 -0
  65. data/lib/spark/command/statistic.rb +144 -0
  66. data/lib/spark/command_builder.rb +141 -0
  67. data/lib/spark/command_validator.rb +34 -0
  68. data/lib/spark/config.rb +244 -0
  69. data/lib/spark/constant.rb +14 -0
  70. data/lib/spark/context.rb +304 -0
  71. data/lib/spark/error.rb +50 -0
  72. data/lib/spark/ext/hash.rb +41 -0
  73. data/lib/spark/ext/integer.rb +25 -0
  74. data/lib/spark/ext/io.rb +57 -0
  75. data/lib/spark/ext/ip_socket.rb +29 -0
  76. data/lib/spark/ext/module.rb +58 -0
  77. data/lib/spark/ext/object.rb +24 -0
  78. data/lib/spark/ext/string.rb +24 -0
  79. data/lib/spark/helper.rb +10 -0
  80. data/lib/spark/helper/logger.rb +40 -0
  81. data/lib/spark/helper/parser.rb +85 -0
  82. data/lib/spark/helper/serialize.rb +71 -0
  83. data/lib/spark/helper/statistic.rb +93 -0
  84. data/lib/spark/helper/system.rb +42 -0
  85. data/lib/spark/java_bridge.rb +19 -0
  86. data/lib/spark/java_bridge/base.rb +203 -0
  87. data/lib/spark/java_bridge/jruby.rb +23 -0
  88. data/lib/spark/java_bridge/rjb.rb +41 -0
  89. data/lib/spark/logger.rb +76 -0
  90. data/lib/spark/mllib.rb +100 -0
  91. data/lib/spark/mllib/classification/common.rb +31 -0
  92. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  93. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  94. data/lib/spark/mllib/classification/svm.rb +135 -0
  95. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  96. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  97. data/lib/spark/mllib/matrix.rb +120 -0
  98. data/lib/spark/mllib/regression/common.rb +73 -0
  99. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  100. data/lib/spark/mllib/regression/lasso.rb +100 -0
  101. data/lib/spark/mllib/regression/linear.rb +124 -0
  102. data/lib/spark/mllib/regression/ridge.rb +97 -0
  103. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  104. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  105. data/lib/spark/mllib/stat/distribution.rb +12 -0
  106. data/lib/spark/mllib/vector.rb +185 -0
  107. data/lib/spark/rdd.rb +1328 -0
  108. data/lib/spark/sampler.rb +92 -0
  109. data/lib/spark/serializer.rb +24 -0
  110. data/lib/spark/serializer/base.rb +170 -0
  111. data/lib/spark/serializer/cartesian.rb +37 -0
  112. data/lib/spark/serializer/marshal.rb +19 -0
  113. data/lib/spark/serializer/message_pack.rb +25 -0
  114. data/lib/spark/serializer/oj.rb +25 -0
  115. data/lib/spark/serializer/pair.rb +27 -0
  116. data/lib/spark/serializer/utf8.rb +25 -0
  117. data/lib/spark/sort.rb +189 -0
  118. data/lib/spark/stat_counter.rb +125 -0
  119. data/lib/spark/storage_level.rb +39 -0
  120. data/lib/spark/version.rb +3 -0
  121. data/lib/spark/worker/master.rb +144 -0
  122. data/lib/spark/worker/spark_files.rb +15 -0
  123. data/lib/spark/worker/worker.rb +197 -0
  124. data/ruby-spark.gemspec +36 -0
  125. data/spec/generator.rb +37 -0
  126. data/spec/inputs/lorem_300.txt +316 -0
  127. data/spec/inputs/numbers/1.txt +50 -0
  128. data/spec/inputs/numbers/10.txt +50 -0
  129. data/spec/inputs/numbers/11.txt +50 -0
  130. data/spec/inputs/numbers/12.txt +50 -0
  131. data/spec/inputs/numbers/13.txt +50 -0
  132. data/spec/inputs/numbers/14.txt +50 -0
  133. data/spec/inputs/numbers/15.txt +50 -0
  134. data/spec/inputs/numbers/16.txt +50 -0
  135. data/spec/inputs/numbers/17.txt +50 -0
  136. data/spec/inputs/numbers/18.txt +50 -0
  137. data/spec/inputs/numbers/19.txt +50 -0
  138. data/spec/inputs/numbers/2.txt +50 -0
  139. data/spec/inputs/numbers/20.txt +50 -0
  140. data/spec/inputs/numbers/3.txt +50 -0
  141. data/spec/inputs/numbers/4.txt +50 -0
  142. data/spec/inputs/numbers/5.txt +50 -0
  143. data/spec/inputs/numbers/6.txt +50 -0
  144. data/spec/inputs/numbers/7.txt +50 -0
  145. data/spec/inputs/numbers/8.txt +50 -0
  146. data/spec/inputs/numbers/9.txt +50 -0
  147. data/spec/inputs/numbers_0_100.txt +101 -0
  148. data/spec/inputs/numbers_1_100.txt +100 -0
  149. data/spec/lib/collect_spec.rb +42 -0
  150. data/spec/lib/command_spec.rb +68 -0
  151. data/spec/lib/config_spec.rb +64 -0
  152. data/spec/lib/context_spec.rb +163 -0
  153. data/spec/lib/ext_spec.rb +72 -0
  154. data/spec/lib/external_apps_spec.rb +45 -0
  155. data/spec/lib/filter_spec.rb +80 -0
  156. data/spec/lib/flat_map_spec.rb +100 -0
  157. data/spec/lib/group_spec.rb +109 -0
  158. data/spec/lib/helper_spec.rb +19 -0
  159. data/spec/lib/key_spec.rb +41 -0
  160. data/spec/lib/manipulation_spec.rb +114 -0
  161. data/spec/lib/map_partitions_spec.rb +87 -0
  162. data/spec/lib/map_spec.rb +91 -0
  163. data/spec/lib/mllib/classification_spec.rb +54 -0
  164. data/spec/lib/mllib/clustering_spec.rb +35 -0
  165. data/spec/lib/mllib/matrix_spec.rb +32 -0
  166. data/spec/lib/mllib/regression_spec.rb +116 -0
  167. data/spec/lib/mllib/vector_spec.rb +77 -0
  168. data/spec/lib/reduce_by_key_spec.rb +118 -0
  169. data/spec/lib/reduce_spec.rb +131 -0
  170. data/spec/lib/sample_spec.rb +46 -0
  171. data/spec/lib/serializer_spec.rb +13 -0
  172. data/spec/lib/sort_spec.rb +58 -0
  173. data/spec/lib/statistic_spec.rb +168 -0
  174. data/spec/lib/whole_text_files_spec.rb +33 -0
  175. data/spec/spec_helper.rb +39 -0
  176. metadata +301 -0

data/lib/spark/error.rb
@@ -0,0 +1,50 @@
+ module Spark
+   # Extension cannot be built
+   class BuildError < StandardError
+   end
+
+   # Proc.to_source
+   # Java object cannot be converted
+   class SerializeError < StandardError
+   end
+
+   # Serializer method
+   # Non-existing serializer
+   class NotImplemented < StandardError
+   end
+
+   # Missing app_name or master
+   class ConfigurationError < StandardError
+   end
+
+   # Wrong parameters
+   class RDDError < StandardError
+   end
+
+   # Validations
+   class CommandError < StandardError
+   end
+
+   # Parser helper
+   class ParseError < StandardError
+   end
+
+   # Validation in context
+   class ContextError < StandardError
+   end
+
+   # Broadcasts
+   # Missing path
+   class BroadcastError < StandardError
+   end
+
+   # Accumulators
+   # Existing keys
+   # Wrong ID
+   class AccumulatorError < StandardError
+   end
+
+   # Wrong instances
+   class MllibError < StandardError
+   end
+ end
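
All of the classes above are plain StandardError subclasses, so callers can rescue them individually or as a group. A minimal usage sketch (not part of the gem; it only assumes `require 'ruby-spark'` loads these definitions):

    require 'ruby-spark'

    begin
      # Spark::ParseError is what the parsing helpers raise (see spark/helper/parser.rb later in this diff)
      raise Spark::ParseError, "Memory has wrong format. Use: 'SIZE UNIT'"
    rescue Spark::ParseError => e
      warn "could not parse memory option: #{e.message}"
    rescue StandardError => e
      # any other Spark error (ConfigurationError, RDDError, ...) also lands here
      raise
    end
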

data/lib/spark/ext/hash.rb
@@ -0,0 +1,41 @@
+ module Spark
+   module CoreExtension
+     module Hash
+       module ClassMethods
+       end
+
+       module InstanceMethods
+         # Destructively convert all keys to strings.
+         def stringify_keys_with_spark!
+           transform_keys!{ |key| key.to_s }
+         end
+
+         # Destructively convert all keys to symbols, as long as they respond to +to_sym+.
+         def symbolize_keys_with_spark!
+           transform_keys!{ |key| key.to_sym rescue key }
+         end
+
+         # Destructively convert all keys using the block operations.
+         # Same as transform_keys but modifies +self+.
+         def transform_keys_with_spark!
+           keys.each do |key|
+             self[yield(key)] = delete(key)
+           end
+           self
+         end
+       end
+
+       def self.included(base)
+         base.extend(ClassMethods)
+         base.send(:include, InstanceMethods)
+         base.class_eval do
+           patch_unless_exist :stringify_keys!, :spark
+           patch_unless_exist :symbolize_keys!, :spark
+           patch_unless_exist :transform_keys!, :spark
+         end
+       end
+     end
+   end
+ end
+
+ Hash.__send__(:include, Spark::CoreExtension::Hash)
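
Once the gem is loaded, the three bang methods behave like their ActiveSupport counterparts, and the `_with_spark` aliases are only installed when `Hash` does not already define the target method. A short usage sketch (assumes `require 'ruby-spark'`):

    require 'ruby-spark'

    h = { 'alpha' => 1, 'beta' => 2 }
    h.symbolize_keys!                  # => { alpha: 1, beta: 2 }
    h.stringify_keys!                  # => { 'alpha' => 1, 'beta' => 2 }
    h.transform_keys! { |k| k.upcase } # => { 'ALPHA' => 1, 'BETA' => 2 }
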

data/lib/spark/ext/integer.rb
@@ -0,0 +1,25 @@
+ module Spark
+   module CoreExtension
+     module Integer
+       module ClassMethods
+       end
+
+       module InstanceMethods
+       end
+
+       def self.included(base)
+         base.extend(ClassMethods)
+         base.send(:include, InstanceMethods)
+         base.class_eval do
+           const_set :MAX_WITH_SPARK, 1 << (1.size * 8 - 2) - 1
+           const_set :MIN_WITH_SPARK, -const_get(:MAX_WITH_SPARK) - 1
+
+           path_const_unless_exist :MAX, :SPARK
+           path_const_unless_exist :MIN, :SPARK
+         end
+       end
+     end
+   end
+ end
+
+ Integer.__send__(:include, Spark::CoreExtension::Integer)
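
One thing worth noting about the constant above: in Ruby, binary `-` binds more tightly than `<<`, so the expression is evaluated as `1 << ((1.size * 8 - 2) - 1)`. On a 64-bit build (`1.size == 8`) that yields `2**61` rather than the conventional Fixnum maximum `2**62 - 1`. A quick comparison, shown only as an observation:

    1 << (1.size * 8 - 2) - 1     # => 2305843009213693952 (2**61) -- what MAX_WITH_SPARK actually holds
    (1 << (1.size * 8 - 2)) - 1   # => 4611686018427387903 (2**62 - 1), the conventional Fixnum maximum
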

data/lib/spark/ext/io.rb
@@ -0,0 +1,57 @@
+ module Spark
+   module CoreExtension
+     module IO
+       module ClassMethods
+       end
+
+       module InstanceMethods
+
+         # Reading
+
+         def read_int
+           unpack_int(read(4))
+         end
+
+         def read_long
+           unpack_long(read(8))
+         end
+
+         def read_string
+           read(read_int)
+         end
+
+         def read_data
+           Marshal.load(read_string)
+         end
+
+
+         # Writing
+
+         def write_int(data)
+           write(pack_int(data))
+         end
+
+         def write_long(data)
+           write(pack_long(data))
+         end
+
+         def write_string(data)
+           write_int(data.size)
+           write(data)
+         end
+
+         def write_data(data)
+           write_string(Marshal.dump(data))
+         end
+       end
+
+       def self.included(base)
+         base.extend(ClassMethods)
+         base.send(:include, Spark::Helper::Serialize)
+         base.send(:include, InstanceMethods)
+       end
+     end
+   end
+ end
+
+ IO.__send__(:include, Spark::CoreExtension::IO)
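
These methods give every `IO` a small length-prefixed framing protocol: a 4-byte integer length followed by the payload, with `read_data`/`write_data` adding a Marshal round-trip on top. The `pack_int`/`unpack_int` primitives come from `Spark::Helper::Serialize` (not shown in this excerpt), so the sketch below reimplements the framing with plain `pack`/`unpack` and assumes a big-endian 32-bit length purely for illustration:

    rd, wr = IO.pipe

    payload = Marshal.dump({ task: 1 })
    wr.write([payload.bytesize].pack('l>'))   # ~ write_int(payload.size)
    wr.write(payload)                         # ~ write(data), i.e. write_string/write_data
    wr.close

    length = rd.read(4).unpack1('l>')         # ~ read_int
    data   = Marshal.load(rd.read(length))    # ~ read_string + Marshal.load, i.e. read_data
    data                                      # => { task: 1 }
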

data/lib/spark/ext/ip_socket.rb
@@ -0,0 +1,29 @@
+ module Spark
+   module CoreExtension
+     module IPSocket
+       module ClassMethods
+       end
+
+       module InstanceMethods
+         def port
+           addr[1]
+         end
+
+         def hostname
+           addr(true)[2]
+         end
+
+         def numeric_address
+           addr[3]
+         end
+       end
+
+       def self.included(base)
+         base.extend(ClassMethods)
+         base.send(:include, InstanceMethods)
+       end
+     end
+   end
+ end
+
+ IPSocket.__send__(:include, Spark::CoreExtension::IPSocket)
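
The three helpers are thin wrappers around `IPSocket#addr`: index 1 is the port, index 3 the numeric address, and passing `true` requests a reverse lookup for the hostname. A usage sketch with a loopback socket (assumes the gem's core extensions are loaded):

    require 'socket'
    require 'ruby-spark'

    server = TCPServer.new('127.0.0.1', 0)              # port 0 lets the OS pick a free port
    socket = TCPSocket.new('127.0.0.1', server.addr[1])

    socket.port              # e.g. 51234               (addr[1])
    socket.hostname          # e.g. "localhost"         (addr(true)[2], reverse lookup)
    socket.numeric_address   # => "127.0.0.1"           (addr[3])

    socket.close
    server.close
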

data/lib/spark/ext/module.rb
@@ -0,0 +1,58 @@
+ module Spark
+   module CoreExtension
+     module Module
+
+       # Patch a method into a class unless it already exists
+       #
+       # == Example:
+       #
+       #   class Hash
+       #     def a
+       #       1
+       #     end
+       #   end
+       #
+       #   module HashExtension
+       #     module InstanceMethods
+       #       def a_with_spark
+       #         2
+       #       end
+       #
+       #       def b_with_spark
+       #         1
+       #       end
+       #     end
+       #
+       #     def self.included(base)
+       #       base.send(:include, InstanceMethods)
+       #       base.class_eval do
+       #         patch_unless_exist :a, :spark
+       #         patch_unless_exist :b, :spark
+       #       end
+       #     end
+       #   end
+       #
+       #   Hash.include(HashExtension)
+       #
+       #   Hash.new.a # => 1
+       #   Hash.new.b # => 1
+       #
+       def patch_unless_exist(target, suffix)
+         unless method_defined?(target)
+           aliased_target, punctuation = target.to_s.sub(/([?!=])$/, ''), $1
+
+           alias_method target, "#{aliased_target}_with_#{suffix}#{punctuation}"
+         end
+       end
+
+       def path_const_unless_exist(target, suffix)
+         unless const_defined?(target)
+           const_set(target, const_get("#{target}_WITH_#{suffix}"))
+         end
+       end
+
+     end
+   end
+ end
+
+ Module.__send__(:include, Spark::CoreExtension::Module)
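
`patch_unless_exist` is already illustrated in the comment above; `path_const_unless_exist` follows the same convention for constants, copying `<NAME>_WITH_<SUFFIX>` to `<NAME>` only when the constant is not already defined. A small illustration with a hypothetical class (not from the gem):

    require 'ruby-spark'

    class Demo
      VERSION_WITH_SPARK = '0.0.1'

      # copies VERSION_WITH_SPARK to VERSION because Demo defines no VERSION of its own
      path_const_unless_exist :VERSION, :SPARK
    end

    Demo::VERSION   # => "0.0.1"
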

data/lib/spark/ext/object.rb
@@ -0,0 +1,24 @@
+ module Spark
+   module CoreExtension
+     module Object
+       module ClassMethods
+       end
+
+       module InstanceMethods
+         def deep_copy_with_spark
+           Marshal.load(Marshal.dump(self))
+         end
+       end
+
+       def self.included(base)
+         base.extend(ClassMethods)
+         base.send(:include, InstanceMethods)
+         base.class_eval do
+           patch_unless_exist :deep_copy, :spark
+         end
+       end
+     end
+   end
+ end
+
+ Object.__send__(:include, Spark::CoreExtension::Object)
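
`deep_copy` is a Marshal round-trip, so nested structures are copied rather than shared; it only works for marshalable objects (no `IO`, `Proc`, and so on). A short sketch (assumes `require 'ruby-spark'`):

    require 'ruby-spark'

    a = { list: [1, 2, 3] }
    b = a.deep_copy     # Marshal.load(Marshal.dump(a))

    b[:list] << 4
    a[:list]            # => [1, 2, 3]  -- unchanged, unlike a shallow #dup
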

data/lib/spark/ext/string.rb
@@ -0,0 +1,24 @@
+ module Spark
+   module CoreExtension
+     module String
+       module ClassMethods
+       end
+
+       module InstanceMethods
+         def camelize_with_spark
+           self.gsub(/\/(.?)/) { "::#{$1.upcase}" }.gsub(/(?:^|_)(.)/) { $1.upcase }
+         end
+       end
+
+       def self.included(base)
+         base.extend(ClassMethods)
+         base.send(:include, InstanceMethods)
+         base.class_eval do
+           patch_unless_exist :camelize, :spark
+         end
+       end
+     end
+   end
+ end
+
+ String.__send__(:include, Spark::CoreExtension::String)
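
`camelize` turns path-style and snake_case names into constant names, which is handy for resolving classes from strings. For example (assumes the gem is loaded):

    require 'ruby-spark'

    'spark/serializer/message_pack'.camelize   # => "Spark::Serializer::MessagePack"
    'logistic_regression'.camelize             # => "LogisticRegression"
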

data/lib/spark/helper.rb
@@ -0,0 +1,10 @@
+ module Spark
+   module Helper
+     autoload :System,    "spark/helper/system"
+     autoload :Logger,    "spark/helper/logger"
+     autoload :Statistic, "spark/helper/statistic"
+     autoload :Serialize, "spark/helper/serialize"
+     autoload :Partition, "spark/helper/partition"
+     autoload :Parser,    "spark/helper/parser"
+   end
+ end

data/lib/spark/helper/logger.rb
@@ -0,0 +1,40 @@
+ module Spark
+   module Helper
+     module Logger
+
+       def self.included(base)
+         base.send :extend, Methods
+         base.send :include, Methods
+       end
+
+       module Methods
+         def log_info(message)
+           Spark.logger.info(message)
+         end
+
+         def log_debug(message)
+           Spark.logger.debug(message)
+         end
+
+         def log_trace(message)
+           Spark.logger.trace(message)
+         end
+
+         def log_warning(message)
+           Spark.logger.warning(message)
+         end
+
+         def log_error(message)
+           Spark.logger.error(message)
+         end
+
+         alias_method :logInfo, :log_info
+         alias_method :logDebug, :log_debug
+         alias_method :logTrace, :log_trace
+         alias_method :logWarning, :log_warning
+         alias_method :logError, :log_error
+
+       end # Methods
+     end # Logger
+   end # Helper
+ end # Spark
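
Including the helper adds the logging shortcuts both at the class level (`extend`) and the instance level (`include`), and the camelCase aliases mirror Spark's own `logInfo`/`logWarning` naming. A small sketch, assuming `Spark.logger` is configured by the gem:

    require 'ruby-spark'

    class Worker
      include Spark::Helper::Logger
    end

    Worker.log_info('starting up')          # class-level, via `extend Methods`
    Worker.new.logWarning('slow partition') # instance-level camelCase alias
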

data/lib/spark/helper/parser.rb
@@ -0,0 +1,85 @@
+ module Spark
+   module Helper
+     module Parser
+
+       def self.included(base)
+         base.send :extend, Methods
+         base.send :include, Methods
+       end
+
+       module Methods
+         def to_java_hash(hash)
+           hash_map = HashMap.new
+           hash.each_pair do |key, value|
+             begin
+               # RJB raises "Object is NULL" (but the new record is put correctly)
+               hash_map.put(key, value)
+             rescue RuntimeError
+             end
+           end
+           hash_map
+         end
+
+         def convert_to_java_int(data)
+           if data.is_a?(Array)
+             data.map{|x| JInteger.new(x)}
+           else
+             JInteger.new(data)
+           end
+         end
+
+         def to_java_array_list(array)
+           array_list = ArrayList.new
+           array.each do |item|
+             array_list.add(item)
+           end
+           array_list
+         end
+
+         # Parse and convert a memory size. Bit shifting would be better, but Float does not support it.
+         #
+         # == Examples:
+         #   to_memory_size("512mb")
+         #   # => 524288
+         #
+         #   to_memory_size("512 MB")
+         #   # => 524288
+         #
+         #   to_memory_size("512mb", "GB")
+         #   # => 0.5
+         #
+         def to_memory_size(memory, result_unit="KB")
+           match = memory.match(/([\d]+)[\s]*([\w]*)/)
+           if match.nil?
+             raise Spark::ParseError, "Memory has wrong format. Use: 'SIZE UNIT'"
+           end
+
+           size = match[1].to_f
+           unit = match[2]
+
+           size *= memory_multiplier_based_kb(unit)
+           size /= memory_multiplier_based_kb(result_unit)
+           size.round(2)
+         end
+
+         # Multiplier relative to KB
+         def memory_multiplier_based_kb(type)
+           case type.to_s.upcase
+           when "G", "GB"
+             1048576
+           when "M", "MB"
+             1024
+           when "K", "KB"
+             1
+           else
+             raise Spark::ParseError, "Unsupported type #{type}"
+           end
+         end
+
+       end # Methods
+
+     end # Parser
+   end # Helper
+ end # Spark
+
+
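
`to_memory_size` normalizes human-readable sizes to a chosen unit (KB by default) and returns a Float rounded to two decimals; input without a leading number raises `Spark::ParseError`. A usage sketch, mixing the module in directly for illustration:

    require 'ruby-spark'
    include Spark::Helper::Parser

    to_memory_size('512mb')          # => 524288.0
    to_memory_size('512 MB', 'GB')   # => 0.5
    to_memory_size('2g', 'MB')       # => 2048.0
    to_memory_size('lots')           # raises Spark::ParseError ("Memory has wrong format. Use: 'SIZE UNIT'")
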