ruby-spark 1.1.0.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +37 -0
- data/Gemfile +47 -0
- data/Guardfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +252 -0
- data/Rakefile +35 -0
- data/TODO.md +6 -0
- data/benchmark/aggregate.rb +33 -0
- data/benchmark/bisect.rb +88 -0
- data/benchmark/comparison/prepare.sh +18 -0
- data/benchmark/comparison/python.py +156 -0
- data/benchmark/comparison/r.r +69 -0
- data/benchmark/comparison/ruby.rb +167 -0
- data/benchmark/comparison/run-all.sh +160 -0
- data/benchmark/comparison/scala.scala +181 -0
- data/benchmark/custom_marshal.rb +94 -0
- data/benchmark/digest.rb +150 -0
- data/benchmark/enumerator.rb +88 -0
- data/benchmark/serializer.rb +82 -0
- data/benchmark/sort.rb +43 -0
- data/benchmark/sort2.rb +164 -0
- data/benchmark/take.rb +28 -0
- data/bin/ruby-spark +8 -0
- data/example/pi.rb +28 -0
- data/example/website_search.rb +83 -0
- data/ext/ruby_c/extconf.rb +3 -0
- data/ext/ruby_c/murmur.c +158 -0
- data/ext/ruby_c/murmur.h +9 -0
- data/ext/ruby_c/ruby-spark.c +18 -0
- data/ext/ruby_java/Digest.java +36 -0
- data/ext/ruby_java/Murmur2.java +98 -0
- data/ext/ruby_java/RubySparkExtService.java +28 -0
- data/ext/ruby_java/extconf.rb +3 -0
- data/ext/spark/build.sbt +73 -0
- data/ext/spark/project/plugins.sbt +9 -0
- data/ext/spark/sbt/sbt +34 -0
- data/ext/spark/src/main/scala/Exec.scala +91 -0
- data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
- data/ext/spark/src/main/scala/Marshal.scala +52 -0
- data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
- data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
- data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
- data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
- data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
- data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
- data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
- data/ext/spark/src/main/scala/RubyPage.scala +34 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +392 -0
- data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
- data/ext/spark/src/main/scala/RubyTab.scala +11 -0
- data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
- data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
- data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
- data/lib/ruby-spark.rb +1 -0
- data/lib/spark.rb +198 -0
- data/lib/spark/accumulator.rb +260 -0
- data/lib/spark/broadcast.rb +98 -0
- data/lib/spark/build.rb +43 -0
- data/lib/spark/cli.rb +169 -0
- data/lib/spark/command.rb +86 -0
- data/lib/spark/command/base.rb +158 -0
- data/lib/spark/command/basic.rb +345 -0
- data/lib/spark/command/pair.rb +124 -0
- data/lib/spark/command/sort.rb +51 -0
- data/lib/spark/command/statistic.rb +144 -0
- data/lib/spark/command_builder.rb +141 -0
- data/lib/spark/command_validator.rb +34 -0
- data/lib/spark/config.rb +238 -0
- data/lib/spark/constant.rb +14 -0
- data/lib/spark/context.rb +322 -0
- data/lib/spark/error.rb +50 -0
- data/lib/spark/ext/hash.rb +41 -0
- data/lib/spark/ext/integer.rb +25 -0
- data/lib/spark/ext/io.rb +67 -0
- data/lib/spark/ext/ip_socket.rb +29 -0
- data/lib/spark/ext/module.rb +58 -0
- data/lib/spark/ext/object.rb +24 -0
- data/lib/spark/ext/string.rb +24 -0
- data/lib/spark/helper.rb +10 -0
- data/lib/spark/helper/logger.rb +40 -0
- data/lib/spark/helper/parser.rb +85 -0
- data/lib/spark/helper/serialize.rb +71 -0
- data/lib/spark/helper/statistic.rb +93 -0
- data/lib/spark/helper/system.rb +42 -0
- data/lib/spark/java_bridge.rb +19 -0
- data/lib/spark/java_bridge/base.rb +203 -0
- data/lib/spark/java_bridge/jruby.rb +23 -0
- data/lib/spark/java_bridge/rjb.rb +41 -0
- data/lib/spark/logger.rb +76 -0
- data/lib/spark/mllib.rb +100 -0
- data/lib/spark/mllib/classification/common.rb +31 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
- data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
- data/lib/spark/mllib/classification/svm.rb +135 -0
- data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
- data/lib/spark/mllib/clustering/kmeans.rb +118 -0
- data/lib/spark/mllib/matrix.rb +120 -0
- data/lib/spark/mllib/regression/common.rb +73 -0
- data/lib/spark/mllib/regression/labeled_point.rb +41 -0
- data/lib/spark/mllib/regression/lasso.rb +100 -0
- data/lib/spark/mllib/regression/linear.rb +124 -0
- data/lib/spark/mllib/regression/ridge.rb +97 -0
- data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
- data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
- data/lib/spark/mllib/stat/distribution.rb +12 -0
- data/lib/spark/mllib/vector.rb +185 -0
- data/lib/spark/rdd.rb +1377 -0
- data/lib/spark/sampler.rb +92 -0
- data/lib/spark/serializer.rb +79 -0
- data/lib/spark/serializer/auto_batched.rb +59 -0
- data/lib/spark/serializer/base.rb +63 -0
- data/lib/spark/serializer/batched.rb +84 -0
- data/lib/spark/serializer/cartesian.rb +13 -0
- data/lib/spark/serializer/compressed.rb +27 -0
- data/lib/spark/serializer/marshal.rb +17 -0
- data/lib/spark/serializer/message_pack.rb +23 -0
- data/lib/spark/serializer/oj.rb +23 -0
- data/lib/spark/serializer/pair.rb +41 -0
- data/lib/spark/serializer/text.rb +25 -0
- data/lib/spark/sort.rb +189 -0
- data/lib/spark/stat_counter.rb +125 -0
- data/lib/spark/storage_level.rb +39 -0
- data/lib/spark/version.rb +3 -0
- data/lib/spark/worker/master.rb +144 -0
- data/lib/spark/worker/spark_files.rb +15 -0
- data/lib/spark/worker/worker.rb +200 -0
- data/ruby-spark.gemspec +47 -0
- data/spec/generator.rb +37 -0
- data/spec/inputs/lorem_300.txt +316 -0
- data/spec/inputs/numbers/1.txt +50 -0
- data/spec/inputs/numbers/10.txt +50 -0
- data/spec/inputs/numbers/11.txt +50 -0
- data/spec/inputs/numbers/12.txt +50 -0
- data/spec/inputs/numbers/13.txt +50 -0
- data/spec/inputs/numbers/14.txt +50 -0
- data/spec/inputs/numbers/15.txt +50 -0
- data/spec/inputs/numbers/16.txt +50 -0
- data/spec/inputs/numbers/17.txt +50 -0
- data/spec/inputs/numbers/18.txt +50 -0
- data/spec/inputs/numbers/19.txt +50 -0
- data/spec/inputs/numbers/2.txt +50 -0
- data/spec/inputs/numbers/20.txt +50 -0
- data/spec/inputs/numbers/3.txt +50 -0
- data/spec/inputs/numbers/4.txt +50 -0
- data/spec/inputs/numbers/5.txt +50 -0
- data/spec/inputs/numbers/6.txt +50 -0
- data/spec/inputs/numbers/7.txt +50 -0
- data/spec/inputs/numbers/8.txt +50 -0
- data/spec/inputs/numbers/9.txt +50 -0
- data/spec/inputs/numbers_0_100.txt +101 -0
- data/spec/inputs/numbers_1_100.txt +100 -0
- data/spec/lib/collect_spec.rb +42 -0
- data/spec/lib/command_spec.rb +68 -0
- data/spec/lib/config_spec.rb +64 -0
- data/spec/lib/context_spec.rb +165 -0
- data/spec/lib/ext_spec.rb +72 -0
- data/spec/lib/external_apps_spec.rb +45 -0
- data/spec/lib/filter_spec.rb +80 -0
- data/spec/lib/flat_map_spec.rb +100 -0
- data/spec/lib/group_spec.rb +109 -0
- data/spec/lib/helper_spec.rb +19 -0
- data/spec/lib/key_spec.rb +41 -0
- data/spec/lib/manipulation_spec.rb +122 -0
- data/spec/lib/map_partitions_spec.rb +87 -0
- data/spec/lib/map_spec.rb +91 -0
- data/spec/lib/mllib/classification_spec.rb +54 -0
- data/spec/lib/mllib/clustering_spec.rb +35 -0
- data/spec/lib/mllib/matrix_spec.rb +32 -0
- data/spec/lib/mllib/regression_spec.rb +116 -0
- data/spec/lib/mllib/vector_spec.rb +77 -0
- data/spec/lib/reduce_by_key_spec.rb +118 -0
- data/spec/lib/reduce_spec.rb +131 -0
- data/spec/lib/sample_spec.rb +46 -0
- data/spec/lib/serializer_spec.rb +88 -0
- data/spec/lib/sort_spec.rb +58 -0
- data/spec/lib/statistic_spec.rb +170 -0
- data/spec/lib/whole_text_files_spec.rb +33 -0
- data/spec/spec_helper.rb +38 -0
- metadata +389 -0
data/lib/spark/worker/master.rb
ADDED
@@ -0,0 +1,144 @@
+#!/usr/bin/env ruby
+
+$PROGRAM_NAME = 'RubySparkMaster'
+
+require 'socket'
+require 'io/wait'
+require 'nio'
+
+require_relative 'worker'
+
+# New process group
+# Otherwise the master can be killed from the pry console
+Process.setsid
+
+# =================================================================================================
+# Master
+#
+module Master
+
+  def self.create
+    case ARGV[0].to_s.strip
+    when 'thread'
+      Master::Thread.new
+    else
+      Master::Process.new
+    end
+  end
+
+  class Base
+    include Spark::Constant
+
+    def initialize
+      @port = ARGV[1].to_s.strip.to_i
+      @socket = TCPSocket.open('localhost', @port)
+      @worker_arguments = @socket.read_string
+    end
+
+    def run
+      selector = NIO::Selector.new
+      monitor = selector.register(@socket, :r)
+      monitor.value = Proc.new { receive_message }
+      loop {
+        selector.select {|monitor| monitor.value.call}
+      }
+    end
+
+    def receive_message
+      command = @socket.read_int
+
+      case command
+      when CREATE_WORKER
+        create_worker
+      when KILL_WORKER
+        kill_worker
+      when KILL_WORKER_AND_WAIT
+        kill_worker_and_wait
+      end
+    end
+
+    def kill_worker_and_wait
+      if kill_worker
+        @socket.write_int(SUCCESSFULLY_KILLED)
+      else
+        @socket.write_int(UNSUCCESSFUL_KILLING)
+      end
+    end
+  end
+
+  # ===============================================================================================
+  # Master::Process
+  #
+  class Process < Base
+
+    def create_worker
+      if fork?
+        pid = ::Process.fork do
+          Worker::Process.new(@port).run
+        end
+      else
+        pid = ::Process.spawn("ruby #{@worker_arguments} worker.rb #{@port}")
+      end
+
+      # Detach the child from the master to avoid a zombie process
+      ::Process.detach(pid)
+    end
+
+    def kill_worker
+      worker_id = @socket.read_long
+      ::Process.kill('TERM', worker_id)
+    rescue
+      nil
+    end
+
+    def fork?
+      @can_fork ||= _fork?
+    end
+
+    def _fork?
+      return false if !::Process.respond_to?(:fork)
+
+      pid = ::Process.fork
+      exit unless pid # exit the child immediately
+      true
+    rescue NotImplementedError
+      false
+    end
+
+  end
+
+  # ===============================================================================================
+  # Master::Thread
+  #
+  class Thread < Base
+
+    def initialize
+      ::Thread.abort_on_exception = true
+
+      # For synchronous access to socket IO
+      $mutex_for_command = Mutex.new
+      $mutex_for_iterator = Mutex.new
+
+      super
+    end
+
+    def create_worker
+      ::Thread.new do
+        Worker::Thread.new(@port).run
+      end
+    end
+
+    def kill_worker
+      worker_id = @socket.read_long
+
+      thread = ObjectSpace._id2ref(worker_id)
+      thread.kill
+    rescue
+      nil
+    end
+
+  end
+end
+
+# Create the proper master according to the worker type
+Master.create.run
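The master above speaks a small binary protocol over its TCP socket: `read_int`, `read_string`, and `write_int` come from the IO extensions in data/lib/spark/ext/io.rb, and the command codes (CREATE_WORKER, KILL_WORKER, KILL_WORKER_AND_WAIT) from data/lib/spark/constant.rb, neither of which is shown in this diff. A minimal sketch of such length-prefixed framing, assuming 4-byte big-endian integers and length-prefixed strings (the gem's actual wire format may differ):

require 'socket'

# Hypothetical framing helpers; the gem's real ones live in
# data/lib/spark/ext/io.rb and may use a different byte order.
module Framing
  def write_int(value)  write([value].pack('l>'))        end
  def read_int          read(4).unpack1('l>')            end
  def write_string(s)   write_int(s.bytesize); write(s)  end
  def read_string       read(read_int)                   end
end

server = TCPServer.new('localhost', 0)
client = TCPSocket.open('localhost', server.addr[1]).extend(Framing)
peer   = server.accept.extend(Framing)

client.write_string('--memory 512')  # e.g. worker arguments, as read in Base#initialize
puts peer.read_string                # => --memory 512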
data/lib/spark/worker/worker.rb
ADDED
@@ -0,0 +1,200 @@
+#!/usr/bin/env ruby
+
+# Load the root of the gem
+lib = File.expand_path(File.join('..', '..'), File.dirname(__FILE__))
+$LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
+
+require 'ruby-spark.rb'
+require 'socket'
+
+require_relative 'spark_files'
+
+
+# =================================================================================================
+# Worker
+#
+# Iterator is LAZY !!!
+#
+module Worker
+  class Base
+
+    include Spark::Helper::Serialize
+    include Spark::Helper::System
+    include Spark::Constant
+
+    attr_accessor :socket
+
+    def initialize(port)
+      # Open a socket to Spark
+      @socket = TCPSocket.open('localhost', port)
+
+      # Send back the worker ID
+      socket.write_long(id)
+    end
+
+    def run
+      begin
+        compute
+      rescue => e
+        send_error(e)
+      else
+        successful_finish
+      end
+    end
+
+    private
+
+    def before_start
+      # Should be implemented in sub-classes
+    end
+
+    def before_end
+      # Should be implemented in sub-classes
+    end
+
+    # These steps must live in one method because the iterator is lazy,
+    # which means an exception can be raised at `serializer` or `compute`
+    def compute
+      before_start
+
+      # Load the split index
+      @split_index = socket.read_int
+
+      # Load files
+      SparkFiles.root_directory = socket.read_string
+
+      # Load broadcasts
+      count = socket.read_int
+      count.times do
+        Spark::Broadcast.register(socket.read_long, socket.read_string)
+      end
+
+      # Load the command
+      @command = socket.read_data
+
+      # Load the iterator
+      @iterator = @command.deserializer.load_from_io(socket).lazy
+
+      # Compute
+      @iterator = @command.execute(@iterator, @split_index)
+
+      # Wrap the result if it is not iterable
+      @iterator = [@iterator] unless @iterator.respond_to?(:each)
+
+      # Send the result
+      @command.serializer.dump_to_io(@iterator, socket)
+    end
+
+    def send_error(e)
+      # Flag
+      socket.write_int(WORKER_ERROR)
+
+      # Message
+      socket.write_string(e.message)
+
+      # Backtrace
+      socket.write_int(e.backtrace.size)
+      e.backtrace.each do |item|
+        socket.write_string(item)
+      end
+
+      socket.flush
+
+      # Wait for Spark:
+      # the socket is closed before the exception is thrown,
+      # signaling that the Ruby exception was fully received
+      until socket.closed?
+        sleep(0.1)
+      end
+
+      # Depends on the type of worker
+      kill_worker
+    end
+
+    def successful_finish
+      # Finish
+      socket.write_int(WORKER_DONE)
+
+      # Send changed accumulators
+      changed = Spark::Accumulator.changed
+      socket.write_int(changed.size)
+      changed.each do |accumulator|
+        socket.write_data([accumulator.id, accumulator.value])
+      end
+
+      # Send it
+      socket.flush
+
+      before_end
+    end
+
+    def log(message=nil)
+      return if !$DEBUG
+
+      $stdout.puts %{==> #{Time.now.strftime('%H:%M:%S')} [#{id}] #{message}}
+      $stdout.flush
+    end
+
+  end
+
+  # ===============================================================================================
+  # Worker::Process
+  #
+  class Process < Base
+
+    def id
+      ::Process.pid
+    end
+
+    private
+
+    def before_start
+      $PROGRAM_NAME = 'RubySparkWorker'
+    end
+
+    def kill_worker
+      ::Process.exit(false) # top-level ::Process, not Worker::Process
+    end
+
+  end
+
+  # ===============================================================================================
+  # Worker::Thread
+  #
+  class Thread < Base
+
+    def id
+      ::Thread.current.object_id
+    end
+
+    private
+
+    def load_command
+      $mutex_for_command.synchronize { super }
+    end
+
+    # Switching threads while reading is very slow;
+    # it is faster to read one iterator at a time
+    def load_iterator
+      # Wait until the socket is readable to prevent a deadlock
+      if jruby?
+        socket.io_wait
+      else
+        socket.wait_readable
+      end
+
+      $mutex_for_iterator.synchronize { super }
+    end
+
+    def kill_worker
+      ::Thread.current.kill # top-level ::Thread, not Worker::Thread
+    end
+
+  end
+end
+
+# Worker is loaded as standalone
+if $PROGRAM_NAME == __FILE__
+  worker = Worker::Process.new(ARGV[0])
+  worker.run
+end
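The "Iterator is LAZY !!!" note and the comment above `compute` matter because a lazy enumerator defers all work, including exceptions, until the result is consumed; keeping loading, execution, and serialization in one method under one rescue is what lets `send_error` catch failures from any stage. A small self-contained illustration of that behavior:

# Exceptions inside a lazy chain surface only when it is forced.
records = [1, 2, 0, 4].lazy.map { |n| 10 / n }

records.class        # => Enumerator::Lazy; nothing computed, nothing raised yet

begin
  records.to_a       # forcing the iterator runs the map block
rescue ZeroDivisionError => e
  puts "raised only on consumption: #{e.message}"
end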
data/ruby-spark.gemspec
ADDED
@@ -0,0 +1,47 @@
+# coding: utf-8
+
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+
+require 'spark/version'
+
+Gem::Specification.new do |spec|
+  spec.name          = 'ruby-spark'
+  spec.version       = Spark::VERSION
+  spec.authors       = ['Ondřej Moravčík']
+  spec.email         = ['moravcik.ondrej@gmail.com']
+  spec.summary       = %q{Ruby wrapper for Apache Spark}
+  spec.description   = %q{}
+  spec.homepage      = ''
+  spec.license       = 'MIT'
+
+  spec.files         = `git ls-files -z`.split("\x0")
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ['lib']
+
+  if RUBY_PLATFORM =~ /java/
+    spec.platform = 'java'
+
+    extensions = ['ext/ruby_java/extconf.rb']
+  else
+    extensions = ['ext/ruby_c/extconf.rb']
+
+    spec.add_dependency 'rjb'
+  end
+
+  spec.extensions = extensions
+  spec.required_ruby_version = '>= 2.0'
+
+  spec.requirements << 'java, scala'
+
+  spec.add_dependency 'sourcify', '0.6.0.rc4'
+  spec.add_dependency 'method_source'
+  spec.add_dependency 'commander'
+  spec.add_dependency 'pry'
+  spec.add_dependency 'nio4r'
+  spec.add_dependency 'distribution'
+
+  spec.add_development_dependency 'bundler', '~> 1.6'
+  spec.add_development_dependency 'rake'
+end
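The gemspec picks the native extension at install time: on JRuby it builds the Java extension (ext/ruby_java) and skips rjb, while on MRI it builds the C extension (ext/ruby_c) and adds the rjb JVM bridge, matching the java_bridge/jruby.rb and java_bridge/rjb.rb adapters listed above. A quick way to see which branch the current interpreter would take:

# Mirrors the RUBY_PLATFORM branch in the gemspec above.
if RUBY_PLATFORM =~ /java/
  puts 'JRuby: builds ext/ruby_java/extconf.rb; no rjb dependency'
else
  puts 'MRI: builds ext/ruby_c/extconf.rb; adds the rjb dependency'
end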
data/spec/generator.rb
ADDED
@@ -0,0 +1,37 @@
+class Generator
+  def self.numbers(size=1000)
+    Array.new(size){ rand(1..1000) }
+  end
+
+  def self.numbers_with_zero(size=1000)
+    Array.new(size){ rand(0..1000) }
+  end
+
+  def self.words(size=1000)
+    Array.new(size) { word }
+  end
+
+  def self.word(size=10)
+    Array.new(rand(1..size)){ (97+rand(26)).chr }.join
+  end
+
+  def self.lines(size=1000, letters=3)
+    Array.new(size) do
+      Array.new(rand(50..100)){
+        (97+rand(letters)).chr + (" " * (rand(10) == 0 ? 1 : 0))
+      }.join
+    end
+  end
+
+  def self.hash(size=1000)
+    Array.new(size) do
+      [word(2), rand(1..10)]
+    end
+  end
+
+  def self.hash_with_values(size=1000, values_count=10)
+    Array.new(size) do
+      [word(2), Array.new(values_count) { rand(1..10) }]
+    end
+  end
+end
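For reference, a few example calls to this spec helper; outputs are random, so the values shown are only illustrative:

require_relative 'generator'

Generator.numbers(5)              # e.g. [413, 27, 990, 8, 554]  (integers in 1..1000)
Generator.word                    # e.g. "qhfzk"                 (1..10 lowercase letters)
Generator.lines(2)                # two strings of 50..100 letters with occasional spaces
Generator.hash(3)                 # e.g. [["ab", 4], ["zq", 9], ["mm", 1]]
Generator.hash_with_values(2, 3)  # e.g. [["cd", [2, 7, 1]], ["xy", [9, 9, 3]]]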