ruby-spark 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +37 -0
- data/Gemfile +47 -0
- data/Guardfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +185 -0
- data/Rakefile +35 -0
- data/TODO.md +7 -0
- data/benchmark/aggregate.rb +33 -0
- data/benchmark/bisect.rb +88 -0
- data/benchmark/custom_marshal.rb +94 -0
- data/benchmark/digest.rb +150 -0
- data/benchmark/enumerator.rb +88 -0
- data/benchmark/performance/prepare.sh +18 -0
- data/benchmark/performance/python.py +156 -0
- data/benchmark/performance/r.r +69 -0
- data/benchmark/performance/ruby.rb +167 -0
- data/benchmark/performance/run-all.sh +160 -0
- data/benchmark/performance/scala.scala +181 -0
- data/benchmark/serializer.rb +82 -0
- data/benchmark/sort.rb +43 -0
- data/benchmark/sort2.rb +164 -0
- data/benchmark/take.rb +28 -0
- data/bin/ruby-spark +8 -0
- data/example/pi.rb +28 -0
- data/ext/ruby_c/extconf.rb +3 -0
- data/ext/ruby_c/murmur.c +158 -0
- data/ext/ruby_c/murmur.h +9 -0
- data/ext/ruby_c/ruby-spark.c +18 -0
- data/ext/ruby_java/Digest.java +36 -0
- data/ext/ruby_java/Murmur2.java +98 -0
- data/ext/ruby_java/RubySparkExtService.java +28 -0
- data/ext/ruby_java/extconf.rb +3 -0
- data/ext/spark/build.sbt +73 -0
- data/ext/spark/project/plugins.sbt +9 -0
- data/ext/spark/sbt/sbt +34 -0
- data/ext/spark/src/main/scala/Exec.scala +91 -0
- data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
- data/ext/spark/src/main/scala/Marshal.scala +52 -0
- data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
- data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
- data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
- data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
- data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
- data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
- data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
- data/ext/spark/src/main/scala/RubyPage.scala +34 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
- data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
- data/ext/spark/src/main/scala/RubyTab.scala +11 -0
- data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
- data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
- data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
- data/lib/ruby-spark.rb +1 -0
- data/lib/spark.rb +198 -0
- data/lib/spark/accumulator.rb +260 -0
- data/lib/spark/broadcast.rb +98 -0
- data/lib/spark/build.rb +43 -0
- data/lib/spark/cli.rb +169 -0
- data/lib/spark/command.rb +86 -0
- data/lib/spark/command/base.rb +154 -0
- data/lib/spark/command/basic.rb +345 -0
- data/lib/spark/command/pair.rb +124 -0
- data/lib/spark/command/sort.rb +51 -0
- data/lib/spark/command/statistic.rb +144 -0
- data/lib/spark/command_builder.rb +141 -0
- data/lib/spark/command_validator.rb +34 -0
- data/lib/spark/config.rb +244 -0
- data/lib/spark/constant.rb +14 -0
- data/lib/spark/context.rb +304 -0
- data/lib/spark/error.rb +50 -0
- data/lib/spark/ext/hash.rb +41 -0
- data/lib/spark/ext/integer.rb +25 -0
- data/lib/spark/ext/io.rb +57 -0
- data/lib/spark/ext/ip_socket.rb +29 -0
- data/lib/spark/ext/module.rb +58 -0
- data/lib/spark/ext/object.rb +24 -0
- data/lib/spark/ext/string.rb +24 -0
- data/lib/spark/helper.rb +10 -0
- data/lib/spark/helper/logger.rb +40 -0
- data/lib/spark/helper/parser.rb +85 -0
- data/lib/spark/helper/serialize.rb +71 -0
- data/lib/spark/helper/statistic.rb +93 -0
- data/lib/spark/helper/system.rb +42 -0
- data/lib/spark/java_bridge.rb +19 -0
- data/lib/spark/java_bridge/base.rb +203 -0
- data/lib/spark/java_bridge/jruby.rb +23 -0
- data/lib/spark/java_bridge/rjb.rb +41 -0
- data/lib/spark/logger.rb +76 -0
- data/lib/spark/mllib.rb +100 -0
- data/lib/spark/mllib/classification/common.rb +31 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
- data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
- data/lib/spark/mllib/classification/svm.rb +135 -0
- data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
- data/lib/spark/mllib/clustering/kmeans.rb +118 -0
- data/lib/spark/mllib/matrix.rb +120 -0
- data/lib/spark/mllib/regression/common.rb +73 -0
- data/lib/spark/mllib/regression/labeled_point.rb +41 -0
- data/lib/spark/mllib/regression/lasso.rb +100 -0
- data/lib/spark/mllib/regression/linear.rb +124 -0
- data/lib/spark/mllib/regression/ridge.rb +97 -0
- data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
- data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
- data/lib/spark/mllib/stat/distribution.rb +12 -0
- data/lib/spark/mllib/vector.rb +185 -0
- data/lib/spark/rdd.rb +1328 -0
- data/lib/spark/sampler.rb +92 -0
- data/lib/spark/serializer.rb +24 -0
- data/lib/spark/serializer/base.rb +170 -0
- data/lib/spark/serializer/cartesian.rb +37 -0
- data/lib/spark/serializer/marshal.rb +19 -0
- data/lib/spark/serializer/message_pack.rb +25 -0
- data/lib/spark/serializer/oj.rb +25 -0
- data/lib/spark/serializer/pair.rb +27 -0
- data/lib/spark/serializer/utf8.rb +25 -0
- data/lib/spark/sort.rb +189 -0
- data/lib/spark/stat_counter.rb +125 -0
- data/lib/spark/storage_level.rb +39 -0
- data/lib/spark/version.rb +3 -0
- data/lib/spark/worker/master.rb +144 -0
- data/lib/spark/worker/spark_files.rb +15 -0
- data/lib/spark/worker/worker.rb +197 -0
- data/ruby-spark.gemspec +36 -0
- data/spec/generator.rb +37 -0
- data/spec/inputs/lorem_300.txt +316 -0
- data/spec/inputs/numbers/1.txt +50 -0
- data/spec/inputs/numbers/10.txt +50 -0
- data/spec/inputs/numbers/11.txt +50 -0
- data/spec/inputs/numbers/12.txt +50 -0
- data/spec/inputs/numbers/13.txt +50 -0
- data/spec/inputs/numbers/14.txt +50 -0
- data/spec/inputs/numbers/15.txt +50 -0
- data/spec/inputs/numbers/16.txt +50 -0
- data/spec/inputs/numbers/17.txt +50 -0
- data/spec/inputs/numbers/18.txt +50 -0
- data/spec/inputs/numbers/19.txt +50 -0
- data/spec/inputs/numbers/2.txt +50 -0
- data/spec/inputs/numbers/20.txt +50 -0
- data/spec/inputs/numbers/3.txt +50 -0
- data/spec/inputs/numbers/4.txt +50 -0
- data/spec/inputs/numbers/5.txt +50 -0
- data/spec/inputs/numbers/6.txt +50 -0
- data/spec/inputs/numbers/7.txt +50 -0
- data/spec/inputs/numbers/8.txt +50 -0
- data/spec/inputs/numbers/9.txt +50 -0
- data/spec/inputs/numbers_0_100.txt +101 -0
- data/spec/inputs/numbers_1_100.txt +100 -0
- data/spec/lib/collect_spec.rb +42 -0
- data/spec/lib/command_spec.rb +68 -0
- data/spec/lib/config_spec.rb +64 -0
- data/spec/lib/context_spec.rb +163 -0
- data/spec/lib/ext_spec.rb +72 -0
- data/spec/lib/external_apps_spec.rb +45 -0
- data/spec/lib/filter_spec.rb +80 -0
- data/spec/lib/flat_map_spec.rb +100 -0
- data/spec/lib/group_spec.rb +109 -0
- data/spec/lib/helper_spec.rb +19 -0
- data/spec/lib/key_spec.rb +41 -0
- data/spec/lib/manipulation_spec.rb +114 -0
- data/spec/lib/map_partitions_spec.rb +87 -0
- data/spec/lib/map_spec.rb +91 -0
- data/spec/lib/mllib/classification_spec.rb +54 -0
- data/spec/lib/mllib/clustering_spec.rb +35 -0
- data/spec/lib/mllib/matrix_spec.rb +32 -0
- data/spec/lib/mllib/regression_spec.rb +116 -0
- data/spec/lib/mllib/vector_spec.rb +77 -0
- data/spec/lib/reduce_by_key_spec.rb +118 -0
- data/spec/lib/reduce_spec.rb +131 -0
- data/spec/lib/sample_spec.rb +46 -0
- data/spec/lib/serializer_spec.rb +13 -0
- data/spec/lib/sort_spec.rb +58 -0
- data/spec/lib/statistic_spec.rb +168 -0
- data/spec/lib/whole_text_files_spec.rb +33 -0
- data/spec/spec_helper.rb +39 -0
- metadata +301 -0
@@ -0,0 +1,82 @@
|
|
1
|
+
require "benchmark"
|
2
|
+
require "yaml"
|
3
|
+
require "msgpack"
|
4
|
+
require "oj"
|
5
|
+
# require "thrift"
|
6
|
+
|
7
|
+
# Serializes +payload+ with +serializer+ (anything responding to .dump and
# .load), reads it back, and returns [serialized size, round-trip equality].
round_trip = lambda do |serializer, payload|
  serialized = serializer.dump(payload)
  deserialized = serializer.load(serialized)
  [serialized.size, deserialized == payload]
end

puts "Simple"

# Flat array of consecutive integers.
data = (0..100000).to_a

# bmbm runs every report twice (rehearsal + measured run).
Benchmark.bmbm do |x|
  {
    "YAML"        => YAML,
    "Marshal"     => Marshal,
    "MessagePack" => MessagePack,
    "Oj"          => Oj
  }.each do |label, serializer|
    x.report(label) do
      size, equal = round_trip.call(serializer, data)
      puts "Size: #{size}, Equal: #{equal}"
    end
  end

  # A Thrift report (Thrift::Serializer / Thrift::Deserializer) is disabled,
  # matching the commented-out `require "thrift"` at the top of the file.
end

puts ""
puts "More complex"

# Pairs of [single lowercase letter, random integer].
data = Array.new(10000000) {
  [rand(97..122).chr, rand(10000000)]
}

Benchmark.bm do |x|
  # YAML is left out of this round: it takes too long on this data set.
  {
    "Marshal"     => Marshal,
    "MessagePack" => MessagePack,
    "Oj"          => Oj
  }.each do |label, serializer|
    x.report(label) do
      size, equal = round_trip.call(serializer, data)
      puts " Size: #{size}, Equal: #{equal}"
    end
  end

  # Thrift report disabled here as well.
end
|
data/benchmark/sort.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
require "benchmark"
|
2
|
+
|
3
|
+
# ---------------------------------------------------------------------------
# Descending sort of hashes by their :bar value, five different ways.

array = []
1000.times {
  array << {:bar => rand(1000)}
}

# Number of repetitions of every measured operation (shared by all sections).
n = 500
Benchmark.bm(20) do |x|
  x.report("sort")               { n.times { array.sort{ |a,b| b[:bar] <=> a[:bar] } } }
  x.report("sort reverse")       { n.times { array.sort{ |a,b| a[:bar] <=> b[:bar] }.reverse } }
  x.report("sort_by -a[:bar]")   { n.times { array.sort_by{ |a| -a[:bar] } } }
  x.report("sort_by a[:bar]*-1") { n.times { array.sort_by{ |a| a[:bar]*-1 } } }
  # The label promises the in-place reverse!, so measure exactly that.
  # sort_by returns a fresh array, so reversing it in place never touches
  # `array` itself.
  x.report("sort_by.reverse!")   { n.times { array.sort_by{ |a| a[:bar] }.reverse! } }
end


# ---------------------------------------------------------------------------
# Sorting random lowercase words (1..10 characters each).

array = Array.new(10000) { Array.new(rand(1..10)){(97+rand(26)).chr}.join }

Benchmark.bm(20) do |x|
  x.report("sort asc")         { n.times { array.sort } }
  x.report("sort asc block")   { n.times { array.sort{|a,b| a <=> b} } }
  x.report("sort desc")        { n.times { array.sort{|a,b| b <=> a} } }
  x.report("sort asc reverse") { n.times { array.sort.reverse } }
end


# ---------------------------------------------------------------------------
# sort_by on two-element arrays versus sort on a Struct that compares by key.

key_value = Struct.new(:key, :value) do
  # Order instances by key only; value is ignored.
  def <=>(other)
    key <=> other.key
  end
end

count = 10000
item_range = 1000000
array1 = Array.new(count) { [rand(item_range), rand(item_range)] }
array2 = Array.new(count) { key_value.new rand(item_range), rand(item_range) }

Benchmark.bm(20) do |x|
  x.report("sort_by")     { n.times { array1.sort_by {|a| a[0]} } }
  x.report("sort struct") { n.times { array2.sort } }
end
|
43
|
+
|
data/benchmark/sort2.rb
ADDED
@@ -0,0 +1,164 @@
|
|
1
|
+
require "benchmark"
|
2
|
+
require "algorithms"
|
3
|
+
|
4
|
+
# Benchmark parameters.
NUMBER_OF_SORTING = 1       # repetitions of every measured report
NUMBER_OF_ARRAY   = 10      # how many pre-sorted word arrays get merged
WORDS_IN_ARRAY    = 100000  # words per array
MAX_WORD_SIZE     = 10      # longest generated word, in characters
EVAL_N_VALUES     = 10      # items sort1_2 buffers per source / emits per round

# Echo the configuration, one setting per line.
puts "NUMBER_OF_SORTING: #{NUMBER_OF_SORTING}",
     "NUMBER_OF_ARRAY: #{NUMBER_OF_ARRAY}",
     "WORDS_IN_ARRAY: #{WORDS_IN_ARRAY}",
     "MAX_WORD_SIZE: #{MAX_WORD_SIZE}",
     "EVAL_N_VALUES: #{EVAL_N_VALUES}"
|
15
|
+
|
16
|
+
# Returns a new array holding WORDS_IN_ARRAY freshly generated words
# (see #word), in generation order.
def words
  WORDS_IN_ARRAY.times.map { word }
end
|
19
|
+
|
20
|
+
# Returns one random word: between 1 and MAX_WORD_SIZE characters, each a
# lowercase ASCII letter (character codes 97..122).
def word
  length = rand(1..MAX_WORD_SIZE)
  letters = Array.new(length) { (97 + rand(26)).chr }
  letters.join
end
|
23
|
+
|
24
|
+
# NUMBER_OF_ARRAY independently generated word lists, each sorted ascending.
# The benchmark at the end of the file hands enumerators over these lists to
# the merge implementations.
@array = Array.new(NUMBER_OF_ARRAY) { words.sort }
|
25
|
+
|
26
|
+
|
27
|
+
# =================================================================================================
|
28
|
+
# Sort1
|
29
|
+
|
30
|
+
# Merges several already-sorted enumerators into a single ascending stream.
#
# data - Array of Enumerators; each one must yield its items in ascending
#        order. The array itself is not modified.
#
# Yields every item of every enumerator, smallest first. When called without
# a block, returns a new (not yet evaluated) Enumerator over the merged items.
def sort1(data)
  return to_enum(__callee__, data) unless block_given?

  heap = []

  # Seed the heap with the first item of every source and keep the source
  # alongside it so that .next can be called on it later. A source that is
  # empty from the start contributes nothing and is simply skipped.
  data.each do |source|
    begin
      heap << [source.next, source]
    rescue StopIteration
      # Source has no items at all
    end
  end

  # Every source that still has items owns exactly one heap entry, so the
  # merge is complete as soon as the heap runs dry.
  until heap.empty?
    # Order the pending entries by value
    heap.sort_by!{|(item,_)| item}
    # Take the smallest value together with its source
    item, enum = heap.shift
    # The value goes to the result
    yield item

    begin
      # The removed entry is replaced by the next item from the same source
      heap << [enum.next, enum]
    rescue StopIteration
      # Source is exhausted; it gets no new heap entry
    end
  end
end
|
58
|
+
|
59
|
+
|
60
|
+
# =================================================================================================
|
61
|
+
# Sort1_2
|
62
|
+
|
63
|
+
# Merges several already-sorted enumerators into a single ascending stream,
# buffering EVAL_N_VALUES items per source so that the buffer only needs to
# be re-sorted once per EVAL_N_VALUES emitted items.
#
# data - Array of Enumerators; each one must yield its items in ascending
#        order. The array itself is not modified.
#
# Yields every item of every enumerator, smallest first. When called without
# a block, returns a new (not yet evaluated) Enumerator over the merged items.
def sort1_2(data)
  return to_enum(__callee__, data) unless block_given?

  heap = []
  enums = []

  # Seed the heap with up to EVAL_N_VALUES leading items of every source and
  # keep the source alongside each item so that .next can be called later.
  data.each do |a|
    EVAL_N_VALUES.times {
      begin
        heap << [a.next, a]
      rescue StopIteration
        # Source holds fewer than EVAL_N_VALUES items; nothing more to buffer
        break
      end
    }
  end

  # Every source with items left always has at least one of them buffered,
  # so an empty heap means the merge is complete. (Looping on `data` instead
  # would never terminate for a source that was empty from the start.)
  until heap.empty?
    # Order the buffered entries by value
    heap.sort_by!{|(item,_)| item}

    # At least EVAL_N_VALUES entries can safely be taken per round
    EVAL_N_VALUES.times {
      break if heap.empty?

      # Take the smallest value together with its source
      item, enum = heap.shift
      # The value goes to the result
      yield item

      enums << enum
    }

    # Refill: one new item for every item just taken, from the same source
    while (enum = enums.shift)
      begin
        heap << [enum.next, enum]
      rescue StopIteration
        # Source is exhausted; drop its remaining refill requests
        enums.delete(enum)
      end
    end

  end
end
|
108
|
+
|
109
|
+
|
110
|
+
# =================================================================================================
|
111
|
+
# Sort 2
|
112
|
+
|
113
|
+
# Merges several enumerators into one stream, using a Containers::Heap as
# the priority queue instead of re-sorting a plain array.
#
# data - Array of Enumerators. Exhausted enumerators are removed from this
#        array (data.delete), so the caller's array is modified.
#        NOTE(review): presumably every enumerator must already be sorted
#        ascending and non-empty (the seeding loop does not rescue
#        StopIteration) - confirm.
#
# Yields the popped items one by one. When called without a block, returns a
# new (not yet evaluated) Enumerator.
#
# NOTE(review): the pop order depends on the default comparison of
# Containers::Heap, which is not visible in this file - confirm it pops the
# smallest key first.
def sort2(data)
  return to_enum(__callee__, data) unless block_given?

  heap = Containers::Heap.new

  # Seed the heap with the first item of every source; the item is the key,
  # and the [item, enumerator] pair is the stored value.
  data.each do |enum|
    item = enum.next
    heap.push(item, [item, enum])
  end

  while data.any?
    begin
      item, enum = heap.pop
      yield item

      # Replace the popped entry with the next item from the same source
      item = enum.next
      heap.push(item, [item, enum])
    rescue StopIteration
      # `enum` was bound by the pop above; that source is now exhausted
      data.delete(enum)
    end
  end
end
|
135
|
+
|
136
|
+
|
137
|
+
# =================================================================================================
|
138
|
+
# Benchmark
|
139
|
+
|
140
|
+
# Times a plain flatten+sort as the baseline, then each merge implementation,
# checking every merge result against the baseline stored in @result.
Benchmark.bm(10) do |bench|
  bench.report("sort") do
    NUMBER_OF_SORTING.times do
      @result = @array.flatten.sort
    end
  end

  bench.report("sort 1") do
    NUMBER_OF_SORTING.times do
      expected = @result
      raise "Bad sorting" unless expected == sort1(@array.map(&:each)).to_a
    end
  end

  bench.report("sort 1_2") do
    NUMBER_OF_SORTING.times do
      expected = @result
      raise "Bad sorting" unless expected == sort1_2(@array.map(&:each)).to_a
    end
  end

  # The "sort 2" report (sort2, backed by Containers::Heap) is disabled:
  # bench.report("sort 2") do
  #   NUMBER_OF_SORTING.times do
  #     raise "Bad sorting" if @result != sort2(@array.map(&:each)).to_a
  #   end
  # end
end
|
data/benchmark/take.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require "benchmark"
|
2
|
+
|
3
|
+
SIZE = 100_000_000

# One independent array per report, so that a destructive report cannot
# influence the others.
@array1 = (0..SIZE).to_a
@array2 = (0..SIZE).to_a
@array3 = (0..SIZE).to_a

# Number of leading elements every report extracts.
TAKE = 100_000

Benchmark.bm(15) do |x|
  # Fastest
  x.report("take"){
    @array1.take(TAKE)
  }

  # Slowest and take most memory
  x.report("reverse drop"){
    @array2.reverse!
    @array2.drop(@array2.size - TAKE)
    @array2.reverse!
  }

  # Least memory
  # slice! removes the elements from its receiver, so it runs on its own
  # array (@array3) rather than on the one the previous report worked on.
  x.report("splice"){
    @array3.slice!(0, TAKE)
  }
end
|
data/bin/ruby-spark
ADDED
data/example/pi.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Put the sibling lib/ directory on the load path (once, and only if it
# exists) before requiring the library.
lib = File.expand_path('../lib', File.dirname(__FILE__))
if File.directory?(lib) && !$LOAD_PATH.include?(lib)
  $LOAD_PATH.unshift(lib)
end

require 'ruby-spark'

Spark.logger.disable
Spark.start

# Number of partitions, and total number of random samples (100000 each).
slices = 3
n = 100000 * slices
|
13
|
+
|
14
|
+
# Draws one random point with both coordinates in [-1, 1) and reports
# whether it lies strictly inside the unit circle.
#
# _ - ignored (the element being mapped).
#
# Returns 1 for a point inside the circle, 0 otherwise.
def map(_)
  x = rand * 2 - 1
  y = rand * 2 - 1

  x**2 + y**2 < 1 ? 1 : 0
end
|
24
|
+
|
25
|
+
# Distribute 1..n over `slices` partitions and replace every element with
# the 0/1 result of #map.
hits = Spark.context
            .parallelize(1..n, slices, serializer: 'oj')
            .map(method(:map))

# 4 * (share of samples that landed inside the unit circle).
estimate = 4.0 * hits.sum / n
puts 'Pi is roughly %f' % estimate
|
data/ext/ruby_c/murmur.c
ADDED
@@ -0,0 +1,158 @@
|
|
1
|
+
#include "murmur.h"

#include <string.h>
|
2
|
+
|
3
|
+
#if defined(_MSC_VER)
#define BIG_CONSTANT(x) (x)
#else
#define BIG_CONSTANT(x) (x##LLU)
#endif

/*-----------------------------------------------------------------------------
// MurmurHash2, 64-bit versions, by Austin Appleby
//
// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
// and endian-ness issues if used across multiple platforms.
//
// 64-bit hash for 64-bit platforms
*/

/*
 * Computes the 64-bit MurmurHash2 ("64A" variant) of a byte buffer.
 *
 * key  - start of the buffer; may have any alignment.
 * len  - number of bytes to hash; must not be negative.
 * seed - initial hash state.
 *
 * Returns the 64-bit hash. Full 8-byte words are read in host byte order,
 * so the result for a given input differs between little- and big-endian
 * hosts.
 */
uint64_t MurmurHash64A(const void * key, int len, uint64_t seed)
{
  const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995);
  const int r = 47;

  uint64_t h = seed ^ (len * m);

  const unsigned char * data = (const unsigned char *)key;
  const unsigned char * end = data + (len / 8) * 8;

  while(data != end)
  {
    /* memcpy instead of dereferencing a cast pointer: the key carries no
       alignment guarantee, and the copy compiles down to a plain load. */
    uint64_t k;
    memcpy(&k, data, sizeof(k));
    data += sizeof(k);

    k *= m;
    k ^= k >> r;
    k *= m;

    h ^= k;
    h *= m;
  }

  /* Mix in the 0..7 trailing bytes, lowest address into the lowest bits. */
  switch(len & 7)
  {
    case 7: h ^= ((uint64_t) data[6]) << 48; /* fallthrough */
    case 6: h ^= ((uint64_t) data[5]) << 40; /* fallthrough */
    case 5: h ^= ((uint64_t) data[4]) << 32; /* fallthrough */
    case 4: h ^= ((uint64_t) data[3]) << 24; /* fallthrough */
    case 3: h ^= ((uint64_t) data[2]) << 16; /* fallthrough */
    case 2: h ^= ((uint64_t) data[1]) << 8;  /* fallthrough */
    case 1: h ^= ((uint64_t) data[0]);
            h *= m;
            break;
    default:
            break;
  }

  h ^= h >> r;
  h *= m;
  h ^= h >> r;

  return h;
}
|
60
|
+
|
61
|
+
/* 64-bit hash for 32-bit platforms */

/*
 * Computes the 64-bit MurmurHash2 ("64B" variant) of a byte buffer using
 * two interleaved 32-bit lanes.
 *
 * key  - start of the buffer; may have any alignment.
 * len  - number of bytes to hash; must not be negative.
 * seed - initial state; the low 32 bits seed the first lane, the high
 *        32 bits the second.
 *
 * Returns the first lane in the upper and the second lane in the lower
 * 32 bits. Full 4-byte words are read in host byte order.
 */
uint64_t MurmurHash64B(const void * key, int len, uint64_t seed)
{
  const uint32_t m = 0x5bd1e995;
  const int r = 24;

  uint32_t h1 = ((uint32_t) seed) ^ len;
  uint32_t h2 = ((uint32_t) (seed >> 32));

  const unsigned char * data = (const unsigned char *)key;

  /* Words are fetched with memcpy rather than through a cast pointer: the
     key carries no alignment guarantee. */
  while(len >= 8)
  {
    uint32_t k1;
    memcpy(&k1, data, sizeof(k1));
    data += sizeof(k1);
    k1 *= m; k1 ^= k1 >> r; k1 *= m;
    h1 *= m; h1 ^= k1;
    len -= 4;

    uint32_t k2;
    memcpy(&k2, data, sizeof(k2));
    data += sizeof(k2);
    k2 *= m; k2 ^= k2 >> r; k2 *= m;
    h2 *= m; h2 ^= k2;
    len -= 4;
  }

  if(len >= 4)
  {
    uint32_t k1;
    memcpy(&k1, data, sizeof(k1));
    data += sizeof(k1);
    k1 *= m; k1 ^= k1 >> r; k1 *= m;
    h1 *= m; h1 ^= k1;
    len -= 4;
  }

  /* Mix the 0..3 trailing bytes into the second lane. */
  switch(len)
  {
    case 3: h2 ^= ((uint32_t) data[2]) << 16; /* fallthrough */
    case 2: h2 ^= ((uint32_t) data[1]) << 8;  /* fallthrough */
    case 1: h2 ^= ((uint32_t) data[0]);
            h2 *= m;
            break;
    default:
            break;
  }

  h1 ^= h2 >> 18; h1 *= m;
  h2 ^= h1 >> 22; h2 *= m;
  h1 ^= h2 >> 17; h1 *= m;
  h2 ^= h1 >> 19; h2 *= m;

  uint64_t h = h1;

  h = (h << 32) | h2;

  return h;
}
|
113
|
+
|
114
|
+
|
115
|
+
|
116
|
+
// ================================================================================================
|
117
|
+
// Ruby methods
|
118
|
+
|
119
|
+
#define PORTABLE_HASH_SEED 16154832
|
120
|
+
|
121
|
+
|
122
|
+
/*
 * Hashes a Ruby string with MurmurHash64A and wraps the result as a Ruby
 * integer.
 *
 * rb_str - object to hash; StringValue() converts it to a String or raises
 *          TypeError.
 * seed   - seed handed to MurmurHash64A.
 *
 * Raises RangeError when the string is longer than an int can express
 * (MurmurHash64A takes its length as int).
 *
 * NOTE(review): LONG2FIX performs no range check, so the 64-bit hash is
 * packed into a Fixnum as-is and its topmost bit(s) do not survive; confirm
 * whether ULL2NUM was intended before relying on the full 64-bit value.
 */
VALUE murmur2_digest(VALUE rb_str, uint64_t seed)
{
  StringValue(rb_str);

  const void * key = RSTRING_PTR(rb_str);
  /* RSTRING_LENINT raises instead of silently narrowing long to int; a
     wrapped (negative) length would send the hash loop out of bounds. */
  int len = RSTRING_LENINT(rb_str);

  uint64_t result = MurmurHash64A(key, len, seed);

  return LONG2FIX(result);
}
|
133
|
+
|
134
|
+
// ------------------------------------------------------------------------------------------------
|
135
|
+
// Spark::Digest::Murmur2.digest
|
136
|
+
|
137
|
+
/*
 * Ruby entry point: digest(string [, seed]).
 *
 * argv[0] - the value to hash.
 * argv[1] - optional seed, converted with NUM2UINT; 0 when omitted.
 *
 * Raises ArgumentError unless one or two arguments are given.
 */
VALUE method_murmur2_digest(int argc, VALUE *argv, VALUE klass)
{
  const int bad_arity = (argc == 0) || (argc > 2);

  if(bad_arity){
    rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
  }

  uint64_t seed = 0;
  if(argc != 1){
    seed = NUM2UINT(argv[1]);
  }

  return murmur2_digest(argv[0], seed);
}
|
147
|
+
|
148
|
+
// ------------------------------------------------------------------------------------------------
|
149
|
+
// Spark::Digest.portable_hash
|
150
|
+
|
151
|
+
/*
 * Ruby entry point: portable_hash(string).
 *
 * Hashes argv[0] with the fixed seed PORTABLE_HASH_SEED.
 *
 * Raises ArgumentError unless exactly one argument is given.
 */
VALUE method_portable_hash(int argc, VALUE *argv, VALUE klass)
{
  const int arity_ok = (argc == 1);

  if(!arity_ok){
    rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
  }

  return murmur2_digest(argv[0], PORTABLE_HASH_SEED);
}
|