ruby-spark 1.1.0.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +252 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +6 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/comparison/prepare.sh +18 -0
  12. data/benchmark/comparison/python.py +156 -0
  13. data/benchmark/comparison/r.r +69 -0
  14. data/benchmark/comparison/ruby.rb +167 -0
  15. data/benchmark/comparison/run-all.sh +160 -0
  16. data/benchmark/comparison/scala.scala +181 -0
  17. data/benchmark/custom_marshal.rb +94 -0
  18. data/benchmark/digest.rb +150 -0
  19. data/benchmark/enumerator.rb +88 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/example/website_search.rb +83 -0
  27. data/ext/ruby_c/extconf.rb +3 -0
  28. data/ext/ruby_c/murmur.c +158 -0
  29. data/ext/ruby_c/murmur.h +9 -0
  30. data/ext/ruby_c/ruby-spark.c +18 -0
  31. data/ext/ruby_java/Digest.java +36 -0
  32. data/ext/ruby_java/Murmur2.java +98 -0
  33. data/ext/ruby_java/RubySparkExtService.java +28 -0
  34. data/ext/ruby_java/extconf.rb +3 -0
  35. data/ext/spark/build.sbt +73 -0
  36. data/ext/spark/project/plugins.sbt +9 -0
  37. data/ext/spark/sbt/sbt +34 -0
  38. data/ext/spark/src/main/scala/Exec.scala +91 -0
  39. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  40. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  41. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  42. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  43. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  44. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  46. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  47. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  48. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  49. data/ext/spark/src/main/scala/RubyRDD.scala +392 -0
  50. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  51. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  52. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  53. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  54. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  55. data/lib/ruby-spark.rb +1 -0
  56. data/lib/spark.rb +198 -0
  57. data/lib/spark/accumulator.rb +260 -0
  58. data/lib/spark/broadcast.rb +98 -0
  59. data/lib/spark/build.rb +43 -0
  60. data/lib/spark/cli.rb +169 -0
  61. data/lib/spark/command.rb +86 -0
  62. data/lib/spark/command/base.rb +158 -0
  63. data/lib/spark/command/basic.rb +345 -0
  64. data/lib/spark/command/pair.rb +124 -0
  65. data/lib/spark/command/sort.rb +51 -0
  66. data/lib/spark/command/statistic.rb +144 -0
  67. data/lib/spark/command_builder.rb +141 -0
  68. data/lib/spark/command_validator.rb +34 -0
  69. data/lib/spark/config.rb +238 -0
  70. data/lib/spark/constant.rb +14 -0
  71. data/lib/spark/context.rb +322 -0
  72. data/lib/spark/error.rb +50 -0
  73. data/lib/spark/ext/hash.rb +41 -0
  74. data/lib/spark/ext/integer.rb +25 -0
  75. data/lib/spark/ext/io.rb +67 -0
  76. data/lib/spark/ext/ip_socket.rb +29 -0
  77. data/lib/spark/ext/module.rb +58 -0
  78. data/lib/spark/ext/object.rb +24 -0
  79. data/lib/spark/ext/string.rb +24 -0
  80. data/lib/spark/helper.rb +10 -0
  81. data/lib/spark/helper/logger.rb +40 -0
  82. data/lib/spark/helper/parser.rb +85 -0
  83. data/lib/spark/helper/serialize.rb +71 -0
  84. data/lib/spark/helper/statistic.rb +93 -0
  85. data/lib/spark/helper/system.rb +42 -0
  86. data/lib/spark/java_bridge.rb +19 -0
  87. data/lib/spark/java_bridge/base.rb +203 -0
  88. data/lib/spark/java_bridge/jruby.rb +23 -0
  89. data/lib/spark/java_bridge/rjb.rb +41 -0
  90. data/lib/spark/logger.rb +76 -0
  91. data/lib/spark/mllib.rb +100 -0
  92. data/lib/spark/mllib/classification/common.rb +31 -0
  93. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  94. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  95. data/lib/spark/mllib/classification/svm.rb +135 -0
  96. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  97. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  98. data/lib/spark/mllib/matrix.rb +120 -0
  99. data/lib/spark/mllib/regression/common.rb +73 -0
  100. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  101. data/lib/spark/mllib/regression/lasso.rb +100 -0
  102. data/lib/spark/mllib/regression/linear.rb +124 -0
  103. data/lib/spark/mllib/regression/ridge.rb +97 -0
  104. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  105. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  106. data/lib/spark/mllib/stat/distribution.rb +12 -0
  107. data/lib/spark/mllib/vector.rb +185 -0
  108. data/lib/spark/rdd.rb +1377 -0
  109. data/lib/spark/sampler.rb +92 -0
  110. data/lib/spark/serializer.rb +79 -0
  111. data/lib/spark/serializer/auto_batched.rb +59 -0
  112. data/lib/spark/serializer/base.rb +63 -0
  113. data/lib/spark/serializer/batched.rb +84 -0
  114. data/lib/spark/serializer/cartesian.rb +13 -0
  115. data/lib/spark/serializer/compressed.rb +27 -0
  116. data/lib/spark/serializer/marshal.rb +17 -0
  117. data/lib/spark/serializer/message_pack.rb +23 -0
  118. data/lib/spark/serializer/oj.rb +23 -0
  119. data/lib/spark/serializer/pair.rb +41 -0
  120. data/lib/spark/serializer/text.rb +25 -0
  121. data/lib/spark/sort.rb +189 -0
  122. data/lib/spark/stat_counter.rb +125 -0
  123. data/lib/spark/storage_level.rb +39 -0
  124. data/lib/spark/version.rb +3 -0
  125. data/lib/spark/worker/master.rb +144 -0
  126. data/lib/spark/worker/spark_files.rb +15 -0
  127. data/lib/spark/worker/worker.rb +200 -0
  128. data/ruby-spark.gemspec +47 -0
  129. data/spec/generator.rb +37 -0
  130. data/spec/inputs/lorem_300.txt +316 -0
  131. data/spec/inputs/numbers/1.txt +50 -0
  132. data/spec/inputs/numbers/10.txt +50 -0
  133. data/spec/inputs/numbers/11.txt +50 -0
  134. data/spec/inputs/numbers/12.txt +50 -0
  135. data/spec/inputs/numbers/13.txt +50 -0
  136. data/spec/inputs/numbers/14.txt +50 -0
  137. data/spec/inputs/numbers/15.txt +50 -0
  138. data/spec/inputs/numbers/16.txt +50 -0
  139. data/spec/inputs/numbers/17.txt +50 -0
  140. data/spec/inputs/numbers/18.txt +50 -0
  141. data/spec/inputs/numbers/19.txt +50 -0
  142. data/spec/inputs/numbers/2.txt +50 -0
  143. data/spec/inputs/numbers/20.txt +50 -0
  144. data/spec/inputs/numbers/3.txt +50 -0
  145. data/spec/inputs/numbers/4.txt +50 -0
  146. data/spec/inputs/numbers/5.txt +50 -0
  147. data/spec/inputs/numbers/6.txt +50 -0
  148. data/spec/inputs/numbers/7.txt +50 -0
  149. data/spec/inputs/numbers/8.txt +50 -0
  150. data/spec/inputs/numbers/9.txt +50 -0
  151. data/spec/inputs/numbers_0_100.txt +101 -0
  152. data/spec/inputs/numbers_1_100.txt +100 -0
  153. data/spec/lib/collect_spec.rb +42 -0
  154. data/spec/lib/command_spec.rb +68 -0
  155. data/spec/lib/config_spec.rb +64 -0
  156. data/spec/lib/context_spec.rb +165 -0
  157. data/spec/lib/ext_spec.rb +72 -0
  158. data/spec/lib/external_apps_spec.rb +45 -0
  159. data/spec/lib/filter_spec.rb +80 -0
  160. data/spec/lib/flat_map_spec.rb +100 -0
  161. data/spec/lib/group_spec.rb +109 -0
  162. data/spec/lib/helper_spec.rb +19 -0
  163. data/spec/lib/key_spec.rb +41 -0
  164. data/spec/lib/manipulation_spec.rb +122 -0
  165. data/spec/lib/map_partitions_spec.rb +87 -0
  166. data/spec/lib/map_spec.rb +91 -0
  167. data/spec/lib/mllib/classification_spec.rb +54 -0
  168. data/spec/lib/mllib/clustering_spec.rb +35 -0
  169. data/spec/lib/mllib/matrix_spec.rb +32 -0
  170. data/spec/lib/mllib/regression_spec.rb +116 -0
  171. data/spec/lib/mllib/vector_spec.rb +77 -0
  172. data/spec/lib/reduce_by_key_spec.rb +118 -0
  173. data/spec/lib/reduce_spec.rb +131 -0
  174. data/spec/lib/sample_spec.rb +46 -0
  175. data/spec/lib/serializer_spec.rb +88 -0
  176. data/spec/lib/sort_spec.rb +58 -0
  177. data/spec/lib/statistic_spec.rb +170 -0
  178. data/spec/lib/whole_text_files_spec.rb +33 -0
  179. data/spec/spec_helper.rb +38 -0
  180. metadata +389 -0
data/lib/spark/version.rb
@@ -0,0 +1,3 @@
+ module Spark
+   VERSION = '1.1.0.1'
+ end
data/lib/spark/worker/master.rb
@@ -0,0 +1,144 @@
+ #!/usr/bin/env ruby
+
+ $PROGRAM_NAME = 'RubySparkMaster'
+
+ require 'socket'
+ require 'io/wait'
+ require 'nio'
+
+ require_relative 'worker'
+
+ # New process group
+ # Otherwise the master can be killed from the pry console
+ Process.setsid
+
+ # =================================================================================================
+ # Master
+ #
+ module Master
+
+   def self.create
+     case ARGV[0].to_s.strip
+     when 'thread'
+       Master::Thread.new
+     else
+       Master::Process.new
+     end
+   end
+
+   class Base
+     include Spark::Constant
+
+     def initialize
+       @port = ARGV[1].to_s.strip.to_i
+       @socket = TCPSocket.open('localhost', @port)
+       @worker_arguments = @socket.read_string
+     end
+
+     def run
+       selector = NIO::Selector.new
+       monitor = selector.register(@socket, :r)
+       monitor.value = Proc.new { receive_message }
+       loop {
+         selector.select { |monitor| monitor.value.call }
+       }
+     end
+
+     def receive_message
+       command = @socket.read_int
+
+       case command
+       when CREATE_WORKER
+         create_worker
+       when KILL_WORKER
+         kill_worker
+       when KILL_WORKER_AND_WAIT
+         kill_worker_and_wait
+       end
+     end
+
+     def kill_worker_and_wait
+       if kill_worker
+         @socket.write_int(SUCCESSFULLY_KILLED)
+       else
+         @socket.write_int(UNSUCCESSFUL_KILLING)
+       end
+     end
+   end
+
+   # ===============================================================================================
+   # Master::Process
+   #
+   class Process < Base
+
+     def create_worker
+       if fork?
+         pid = ::Process.fork do
+           Worker::Process.new(@port).run
+         end
+       else
+         pid = ::Process.spawn("ruby #{@worker_arguments} worker.rb #{@port}")
+       end
+
+       # Detach the child from the master to avoid a zombie process
+       ::Process.detach(pid)
+     end
+
+     def kill_worker
+       worker_id = @socket.read_long
+       ::Process.kill('TERM', worker_id)
+     rescue
+       nil
+     end
+
+     def fork?
+       @can_fork ||= _fork?
+     end
+
+     def _fork?
+       return false if !::Process.respond_to?(:fork)
+
+       pid = ::Process.fork
+       exit unless pid # exit the child immediately
+       true
+     rescue NotImplementedError
+       false
+     end
+
+   end
+
+   # ===============================================================================================
+   # Master::Thread
+   #
+   class Thread < Base
+
+     def initialize
+       ::Thread.abort_on_exception = true
+
+       # For synchronous access to socket IO
+       $mutex_for_command = Mutex.new
+       $mutex_for_iterator = Mutex.new
+
+       super
+     end
+
+     def create_worker
+       ::Thread.new do
+         Worker::Thread.new(@port).run
+       end
+     end
+
+     def kill_worker
+       worker_id = @socket.read_long
+
+       thread = ObjectSpace._id2ref(worker_id)
+       thread.kill
+     rescue
+       nil
+     end
+
+   end
+ end
+
+ # Create the proper master for the given worker type
+ Master.create.run
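
The master and Spark talk over a plain TCP socket using length-prefixed frames; `read_int`, `read_long`, `read_string`, and `write_int` are added to the socket by the gem's core extensions (`data/lib/spark/ext/io.rb` and `data/lib/spark/ext/ip_socket.rb` in the listing above). A minimal sketch of what such helpers could look like, assuming 4-byte big-endian framing; the actual wire format is not shown in this diff:

    require 'socket'

    # Hypothetical framed-IO helpers, for illustration only.
    module FramedIO
      def write_int(value)
        write([value].pack('l>')) # 4-byte big-endian signed int
      end

      def read_int
        read(4).unpack1('l>')
      end

      def read_long
        read(8).unpack1('q>')   # 8-byte big-endian signed int
      end

      def write_string(str)
        write_int(str.bytesize) # length prefix, then raw bytes
        write(str)
      end

      def read_string
        read(read_int)
      end
    end

    TCPSocket.include(FramedIO)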
data/lib/spark/worker/spark_files.rb
@@ -0,0 +1,15 @@
+ class SparkFiles
+
+   class << self
+     attr_accessor :root_directory
+   end
+
+   def self.get(file_name)
+     File.join(root_directory, file_name)
+   end
+
+   def self.get_content(file_name)
+     File.read(get(file_name))
+   end
+
+ end
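
`SparkFiles` is a small path resolver: the worker (below) assigns `root_directory` from the socket, and job code resolves distributed file names against it. A trivial usage sketch with a hypothetical path:

    SparkFiles.root_directory = '/tmp/spark-files' # normally set by the worker

    SparkFiles.get('users.csv')         # => "/tmp/spark-files/users.csv"
    SparkFiles.get_content('users.csv') # => contents of that file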
data/lib/spark/worker/worker.rb
@@ -0,0 +1,200 @@
+ #!/usr/bin/env ruby
+
+ # Load the root of the gem
+ lib = File.expand_path(File.join('..', '..'), File.dirname(__FILE__))
+ $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
+
+ require 'ruby-spark.rb'
+ require 'socket'
+
+ require_relative 'spark_files'
+
+
+ # =================================================================================================
+ # Worker
+ #
+ # Iterator is LAZY !!!
+ #
+ module Worker
+   class Base
+
+     include Spark::Helper::Serialize
+     include Spark::Helper::System
+     include Spark::Constant
+
+     attr_accessor :socket
+
+     def initialize(port)
+       # Open a socket to Spark
+       @socket = TCPSocket.open('localhost', port)
+
+       # Send back the worker ID
+       socket.write_long(id)
+     end
+
+     def run
+       begin
+         compute
+       rescue => e
+         send_error(e)
+       else
+         successful_finish
+       end
+     end
+
+     private
+
+     def before_start
+       # Should be implemented in sub-classes
+     end
+
+     def before_end
+       # Should be implemented in sub-classes
+     end
+
+     # These steps must live in a single method because the iterator is lazy,
+     # which means an exception can be raised at `serializer` or `compute`
+     def compute
+       before_start
+
+       # Load the split index
+       @split_index = socket.read_int
+
+       # Load files
+       SparkFiles.root_directory = socket.read_string
+
+       # Load broadcasts
+       count = socket.read_int
+       count.times do
+         Spark::Broadcast.register(socket.read_long, socket.read_string)
+       end
+
+       # Load the command
+       @command = socket.read_data
+
+       # Load the iterator
+       @iterator = @command.deserializer.load_from_io(socket).lazy
+
+       # Compute
+       @iterator = @command.execute(@iterator, @split_index)
+
+       # The result may not be iterable
+       @iterator = [@iterator] unless @iterator.respond_to?(:each)
+
+       # Send the result
+       @command.serializer.dump_to_io(@iterator, socket)
+     end
+
+     def send_error(e)
+       # Flag
+       socket.write_int(WORKER_ERROR)
+
+       # Message
+       socket.write_string(e.message)
+
+       # Backtrace
+       socket.write_int(e.backtrace.size)
+       e.backtrace.each do |item|
+         socket.write_string(item)
+       end
+
+       socket.flush
+
+       # Wait for Spark
+       # The socket is closed before the exception is thrown,
+       # signaling that the Ruby exception was fully received
+       until socket.closed?
+         sleep(0.1)
+       end
+
+       # Depends on the type of worker
+       kill_worker
+     end
+
+     def successful_finish
+       # Finish
+       socket.write_int(WORKER_DONE)
+
+       # Send changed accumulators
+       changed = Spark::Accumulator.changed
+       socket.write_int(changed.size)
+       changed.each do |accumulator|
+         socket.write_data([accumulator.id, accumulator.value])
+       end
+
+       # Send it
+       socket.flush
+
+       before_end
+     end
+
+     def log(message=nil)
+       return if !$DEBUG
+
+       $stdout.puts %{==> #{Time.now.strftime('%H:%M:%S')} [#{id}] #{message}}
+       $stdout.flush
+     end
+
+   end
+
+   # ===============================================================================================
+   # Worker::Process
+   #
+   class Process < Base
+
+     def id
+       ::Process.pid
+     end
+
+     private
+
+     def before_start
+       $PROGRAM_NAME = 'RubySparkWorker'
+     end
+
+     def kill_worker
+       ::Process.exit(false)
+     end
+
+   end
+
+   # ===============================================================================================
+   # Worker::Thread
+   #
+   class Thread < Base
+
+     def id
+       ::Thread.current.object_id
+     end
+
+     private
+
+     def load_command
+       $mutex_for_command.synchronize { super }
+     end
+
+     # Switching threads while reading is very slow;
+     # it is faster to read one worker at a time
+     def load_iterator
+       # Wait for incoming data to prevent a deadlock
+       if jruby?
+         socket.io_wait
+       else
+         socket.wait_readable
+       end
+
+       $mutex_for_iterator.synchronize { super }
+     end
+
+     def kill_worker
+       ::Thread.current.kill
+     end
+
+   end
+ end
+
+ # The worker was loaded as a standalone script
+ if $PROGRAM_NAME == __FILE__
+   worker = Worker::Process.new(ARGV[0])
+   worker.run
+ end
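
The "Iterator is LAZY" warning matters because `load_from_io(...).lazy` and `@command.execute` return lazy enumerators: nothing is deserialized or computed until `dump_to_io` starts consuming the chain, so an exception from user code surfaces during the final send rather than at the step that was wired up earlier. A standalone illustration of that Ruby behavior:

    # Lazy enumerators defer work, and therefore exceptions, until consumption.
    iterator = [1, 2, 0].lazy.map { |x| 10 / x }

    # Nothing has run yet, so no error has been raised at this point.
    begin
      iterator.to_a # the ZeroDivisionError surfaces only while iterating
    rescue ZeroDivisionError => e
      puts "raised during consumption: #{e.message}"
    end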
data/ruby-spark.gemspec
@@ -0,0 +1,47 @@
+ # coding: utf-8
+
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+
+ require 'spark/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = 'ruby-spark'
+   spec.version       = Spark::VERSION
+   spec.authors       = ['Ondřej Moravčík']
+   spec.email         = ['moravcik.ondrej@gmail.com']
+   spec.summary       = %q{Ruby wrapper for Apache Spark}
+   spec.description   = %q{}
+   spec.homepage      = ''
+   spec.license       = 'MIT'
+
+   spec.files         = `git ls-files -z`.split("\x0")
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ['lib']
+
+   if RUBY_PLATFORM =~ /java/
+     spec.platform = 'java'
+
+     extensions = ['ext/ruby_java/extconf.rb']
+   else
+     extensions = ['ext/ruby_c/extconf.rb']
+
+     spec.add_dependency 'rjb'
+   end
+
+   spec.extensions = extensions
+   spec.required_ruby_version = '>= 2.0'
+
+   spec.requirements << 'java, scala'
+
+   spec.add_dependency 'sourcify', '0.6.0.rc4'
+   spec.add_dependency 'method_source'
+   spec.add_dependency 'commander'
+   spec.add_dependency 'pry'
+   spec.add_dependency 'nio4r'
+   spec.add_dependency 'distribution'
+
+   spec.add_development_dependency 'bundler', '~> 1.6'
+   spec.add_development_dependency 'rake'
+ end
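
The platform switch above means a JRuby install builds the Java extension, while MRI builds the C extension and pulls in `rjb` to bridge into the JVM. For context, a quick-start sketch; the `Spark.start` / `Spark.sc` calls follow the gem's README and should be treated as assumptions here:

    require 'ruby-spark'

    # Boot the JVM bridge and obtain a context (API names per the gem's README).
    Spark.start
    sc = Spark.sc

    # ruby-spark serializes lambdas (via sourcify/method_source) to ship them
    # to workers, so transformations are passed as lambda objects.
    rdd = sc.parallelize(0..10)
    puts rdd.map(lambda { |x| x * 2 }).collect.inspect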
data/spec/generator.rb
@@ -0,0 +1,37 @@
+ class Generator
+   def self.numbers(size=1000)
+     Array.new(size) { rand(1..1000) }
+   end
+
+   def self.numbers_with_zero(size=1000)
+     Array.new(size) { rand(0..1000) }
+   end
+
+   def self.words(size=1000)
+     Array.new(size) { word }
+   end
+
+   def self.word(size=10)
+     Array.new(rand(1..size)) { (97 + rand(26)).chr }.join
+   end
+
+   def self.lines(size=1000, letters=3)
+     Array.new(size) do
+       Array.new(rand(50..100)) {
+         (97 + rand(letters)).chr + (" " * (rand(10) == 0 ? 1 : 0))
+       }.join
+     end
+   end
+
+   def self.hash(size=1000)
+     Array.new(size) do
+       [word(2), rand(1..10)]
+     end
+   end
+
+   def self.hash_with_values(size=1000, values_count=10)
+     Array.new(size) do
+       [word(2), Array.new(values_count) { rand(1..10) }]
+     end
+   end
+ end
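
These helpers produce random fixtures for the specs under `data/spec/lib/`. A hypothetical usage sketch, with outputs shown only as examples:

    numbers = Generator.numbers(100)        # 100 integers in 1..1000
    line    = Generator.lines(1).first      # one random "sentence" of a-c letters
    pairs   = Generator.hash_with_values(3) # e.g. [["ab", [4, 9, 1, ...]], ...]

    puts numbers.take(5).inspect
    puts line
    puts pairs.first.inspect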