ruby-spark 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +185 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +7 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/custom_marshal.rb +94 -0
  12. data/benchmark/digest.rb +150 -0
  13. data/benchmark/enumerator.rb +88 -0
  14. data/benchmark/performance/prepare.sh +18 -0
  15. data/benchmark/performance/python.py +156 -0
  16. data/benchmark/performance/r.r +69 -0
  17. data/benchmark/performance/ruby.rb +167 -0
  18. data/benchmark/performance/run-all.sh +160 -0
  19. data/benchmark/performance/scala.scala +181 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/ext/ruby_c/extconf.rb +3 -0
  27. data/ext/ruby_c/murmur.c +158 -0
  28. data/ext/ruby_c/murmur.h +9 -0
  29. data/ext/ruby_c/ruby-spark.c +18 -0
  30. data/ext/ruby_java/Digest.java +36 -0
  31. data/ext/ruby_java/Murmur2.java +98 -0
  32. data/ext/ruby_java/RubySparkExtService.java +28 -0
  33. data/ext/ruby_java/extconf.rb +3 -0
  34. data/ext/spark/build.sbt +73 -0
  35. data/ext/spark/project/plugins.sbt +9 -0
  36. data/ext/spark/sbt/sbt +34 -0
  37. data/ext/spark/src/main/scala/Exec.scala +91 -0
  38. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  39. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  40. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  41. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  42. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  43. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  44. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  46. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  47. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  48. data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
  49. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  50. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  51. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  52. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  53. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  54. data/lib/ruby-spark.rb +1 -0
  55. data/lib/spark.rb +198 -0
  56. data/lib/spark/accumulator.rb +260 -0
  57. data/lib/spark/broadcast.rb +98 -0
  58. data/lib/spark/build.rb +43 -0
  59. data/lib/spark/cli.rb +169 -0
  60. data/lib/spark/command.rb +86 -0
  61. data/lib/spark/command/base.rb +154 -0
  62. data/lib/spark/command/basic.rb +345 -0
  63. data/lib/spark/command/pair.rb +124 -0
  64. data/lib/spark/command/sort.rb +51 -0
  65. data/lib/spark/command/statistic.rb +144 -0
  66. data/lib/spark/command_builder.rb +141 -0
  67. data/lib/spark/command_validator.rb +34 -0
  68. data/lib/spark/config.rb +244 -0
  69. data/lib/spark/constant.rb +14 -0
  70. data/lib/spark/context.rb +304 -0
  71. data/lib/spark/error.rb +50 -0
  72. data/lib/spark/ext/hash.rb +41 -0
  73. data/lib/spark/ext/integer.rb +25 -0
  74. data/lib/spark/ext/io.rb +57 -0
  75. data/lib/spark/ext/ip_socket.rb +29 -0
  76. data/lib/spark/ext/module.rb +58 -0
  77. data/lib/spark/ext/object.rb +24 -0
  78. data/lib/spark/ext/string.rb +24 -0
  79. data/lib/spark/helper.rb +10 -0
  80. data/lib/spark/helper/logger.rb +40 -0
  81. data/lib/spark/helper/parser.rb +85 -0
  82. data/lib/spark/helper/serialize.rb +71 -0
  83. data/lib/spark/helper/statistic.rb +93 -0
  84. data/lib/spark/helper/system.rb +42 -0
  85. data/lib/spark/java_bridge.rb +19 -0
  86. data/lib/spark/java_bridge/base.rb +203 -0
  87. data/lib/spark/java_bridge/jruby.rb +23 -0
  88. data/lib/spark/java_bridge/rjb.rb +41 -0
  89. data/lib/spark/logger.rb +76 -0
  90. data/lib/spark/mllib.rb +100 -0
  91. data/lib/spark/mllib/classification/common.rb +31 -0
  92. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  93. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  94. data/lib/spark/mllib/classification/svm.rb +135 -0
  95. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  96. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  97. data/lib/spark/mllib/matrix.rb +120 -0
  98. data/lib/spark/mllib/regression/common.rb +73 -0
  99. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  100. data/lib/spark/mllib/regression/lasso.rb +100 -0
  101. data/lib/spark/mllib/regression/linear.rb +124 -0
  102. data/lib/spark/mllib/regression/ridge.rb +97 -0
  103. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  104. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  105. data/lib/spark/mllib/stat/distribution.rb +12 -0
  106. data/lib/spark/mllib/vector.rb +185 -0
  107. data/lib/spark/rdd.rb +1328 -0
  108. data/lib/spark/sampler.rb +92 -0
  109. data/lib/spark/serializer.rb +24 -0
  110. data/lib/spark/serializer/base.rb +170 -0
  111. data/lib/spark/serializer/cartesian.rb +37 -0
  112. data/lib/spark/serializer/marshal.rb +19 -0
  113. data/lib/spark/serializer/message_pack.rb +25 -0
  114. data/lib/spark/serializer/oj.rb +25 -0
  115. data/lib/spark/serializer/pair.rb +27 -0
  116. data/lib/spark/serializer/utf8.rb +25 -0
  117. data/lib/spark/sort.rb +189 -0
  118. data/lib/spark/stat_counter.rb +125 -0
  119. data/lib/spark/storage_level.rb +39 -0
  120. data/lib/spark/version.rb +3 -0
  121. data/lib/spark/worker/master.rb +144 -0
  122. data/lib/spark/worker/spark_files.rb +15 -0
  123. data/lib/spark/worker/worker.rb +197 -0
  124. data/ruby-spark.gemspec +36 -0
  125. data/spec/generator.rb +37 -0
  126. data/spec/inputs/lorem_300.txt +316 -0
  127. data/spec/inputs/numbers/1.txt +50 -0
  128. data/spec/inputs/numbers/10.txt +50 -0
  129. data/spec/inputs/numbers/11.txt +50 -0
  130. data/spec/inputs/numbers/12.txt +50 -0
  131. data/spec/inputs/numbers/13.txt +50 -0
  132. data/spec/inputs/numbers/14.txt +50 -0
  133. data/spec/inputs/numbers/15.txt +50 -0
  134. data/spec/inputs/numbers/16.txt +50 -0
  135. data/spec/inputs/numbers/17.txt +50 -0
  136. data/spec/inputs/numbers/18.txt +50 -0
  137. data/spec/inputs/numbers/19.txt +50 -0
  138. data/spec/inputs/numbers/2.txt +50 -0
  139. data/spec/inputs/numbers/20.txt +50 -0
  140. data/spec/inputs/numbers/3.txt +50 -0
  141. data/spec/inputs/numbers/4.txt +50 -0
  142. data/spec/inputs/numbers/5.txt +50 -0
  143. data/spec/inputs/numbers/6.txt +50 -0
  144. data/spec/inputs/numbers/7.txt +50 -0
  145. data/spec/inputs/numbers/8.txt +50 -0
  146. data/spec/inputs/numbers/9.txt +50 -0
  147. data/spec/inputs/numbers_0_100.txt +101 -0
  148. data/spec/inputs/numbers_1_100.txt +100 -0
  149. data/spec/lib/collect_spec.rb +42 -0
  150. data/spec/lib/command_spec.rb +68 -0
  151. data/spec/lib/config_spec.rb +64 -0
  152. data/spec/lib/context_spec.rb +163 -0
  153. data/spec/lib/ext_spec.rb +72 -0
  154. data/spec/lib/external_apps_spec.rb +45 -0
  155. data/spec/lib/filter_spec.rb +80 -0
  156. data/spec/lib/flat_map_spec.rb +100 -0
  157. data/spec/lib/group_spec.rb +109 -0
  158. data/spec/lib/helper_spec.rb +19 -0
  159. data/spec/lib/key_spec.rb +41 -0
  160. data/spec/lib/manipulation_spec.rb +114 -0
  161. data/spec/lib/map_partitions_spec.rb +87 -0
  162. data/spec/lib/map_spec.rb +91 -0
  163. data/spec/lib/mllib/classification_spec.rb +54 -0
  164. data/spec/lib/mllib/clustering_spec.rb +35 -0
  165. data/spec/lib/mllib/matrix_spec.rb +32 -0
  166. data/spec/lib/mllib/regression_spec.rb +116 -0
  167. data/spec/lib/mllib/vector_spec.rb +77 -0
  168. data/spec/lib/reduce_by_key_spec.rb +118 -0
  169. data/spec/lib/reduce_spec.rb +131 -0
  170. data/spec/lib/sample_spec.rb +46 -0
  171. data/spec/lib/serializer_spec.rb +13 -0
  172. data/spec/lib/sort_spec.rb +58 -0
  173. data/spec/lib/statistic_spec.rb +168 -0
  174. data/spec/lib/whole_text_files_spec.rb +33 -0
  175. data/spec/spec_helper.rb +39 -0
  176. metadata +301 -0
data/benchmark/serializer.rb ADDED
@@ -0,0 +1,82 @@
+ require "benchmark"
+ require "yaml"
+ require "msgpack"
+ require "oj"
+ # require "thrift"
+
+ puts "Simple"
+
+ data = (0..100000).to_a
+
+ Benchmark.bmbm do |x|
+   x.report("YAML") do
+     serialized = YAML.dump(data)
+     deserialized = YAML.load(serialized)
+     puts "Size: #{serialized.size}, Equal: #{deserialized == data}"
+   end
+
+   x.report("Marshal") do
+     serialized = Marshal.dump(data)
+     deserialized = Marshal.load(serialized)
+     puts "Size: #{serialized.size}, Equal: #{deserialized == data}"
+   end
+
+   x.report("MessagePack") do
+     serialized = MessagePack.dump(data)
+     deserialized = MessagePack.load(serialized)
+     puts "Size: #{serialized.size}, Equal: #{deserialized == data}"
+   end
+
+   x.report("Oj") do
+     serialized = Oj.dump(data)
+     deserialized = Oj.load(serialized)
+     puts "Size: #{serialized.size}, Equal: #{deserialized == data}"
+   end
+
+   # x.report("Thrift") do
+   #   serializer = Thrift::Serializer.new
+   #   deserializer = Thrift::Deserializer.new
+
+   #   serialized = serializer.serialize(data)
+   # end
+ end
+
+ puts ""
+ puts "More complex"
+
+ data = Array.new(10000000) {
+   [rand(97..122).chr, rand(10000000)]
+ }
+
+ Benchmark.bm do |x|
+   # Takes too long
+   # x.report("YAML") do
+   #   serialized = YAML.dump(data)
+   #   YAML.load(serialized)
+   # end
+
+   x.report("Marshal") do
+     serialized = Marshal.dump(data)
+     deserialized = Marshal.load(serialized)
+     puts " Size: #{serialized.size}, Equal: #{deserialized == data}"
+   end
+
+   x.report("MessagePack") do
+     serialized = MessagePack.dump(data)
+     deserialized = MessagePack.load(serialized)
+     puts " Size: #{serialized.size}, Equal: #{deserialized == data}"
+   end
+
+   x.report("Oj") do
+     serialized = Oj.dump(data)
+     deserialized = Oj.load(serialized)
+     puts " Size: #{serialized.size}, Equal: #{deserialized == data}"
+   end
+
+   # x.report("Thrift") do
+   #   serializer = Thrift::Serializer.new
+   #   deserializer = Thrift::Deserializer.new
+
+   #   serialized = serializer.serialize(data)
+   # end
+ end
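
(Illustration, not part of the package diff.) The benchmark above motivates the serializer adapters shipped under data/lib/spark/serializer/ (marshal, message_pack, oj, utf8). A minimal sketch of picking one when distributing data, modelled directly on example/pi.rb later in this diff; serializer option values other than 'oj' are not shown in this excerpt and are an assumption:

    require 'ruby-spark'

    Spark.start
    # serializer: 'oj' is the value used in example/pi.rb.
    rdd = Spark.context.parallelize(1..1_000, 2, serializer: 'oj')
    puts rdd.sum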
data/benchmark/sort.rb ADDED
@@ -0,0 +1,43 @@
+ require "benchmark"
+
+ array = []
+ 1000.times {
+   array << {:bar => rand(1000)}
+ }
+
+ n = 500
+ Benchmark.bm(20) do |x|
+   x.report("sort")               { n.times { array.sort{ |a,b| b[:bar] <=> a[:bar] } } }
+   x.report("sort reverse")       { n.times { array.sort{ |a,b| a[:bar] <=> b[:bar] }.reverse } }
+   x.report("sort_by -a[:bar]")   { n.times { array.sort_by{ |a| -a[:bar] } } }
+   x.report("sort_by a[:bar]*-1") { n.times { array.sort_by{ |a| a[:bar]*-1 } } }
+   x.report("sort_by.reverse")    { n.times { array.sort_by{ |a| a[:bar] }.reverse } }
+ end
+
+
+ array = Array.new(10000) { Array.new(rand(1..10)){(97+rand(26)).chr}.join }
+
+ Benchmark.bm(20) do |x|
+   x.report("sort asc")         { n.times { array.sort } }
+   x.report("sort asc block")   { n.times { array.sort{|a,b| a <=> b} } }
+   x.report("sort desc")        { n.times { array.sort{|a,b| b <=> a} } }
+   x.report("sort asc reverse") { n.times { array.sort.reverse } }
+ end
+
+
+ key_value = Struct.new(:key, :value) do
+   def <=>(other)
+     key <=> other.key
+   end
+ end
+
+ count = 10000
+ item_range = 1000000
+ array1 = Array.new(count) { [rand(item_range), rand(item_range)] }
+ array2 = Array.new(count) { key_value.new rand(item_range), rand(item_range) }
+
+ Benchmark.bm(20) do |x|
+   x.report("sort_by")     { n.times { array1.sort_by {|a| a[0]} } }
+   x.report("sort struct") { n.times { array2.sort } }
+ end
+
data/benchmark/sort2.rb ADDED
@@ -0,0 +1,164 @@
+ require "benchmark"
+ require "algorithms"
+
+ NUMBER_OF_SORTING = 1
+ NUMBER_OF_ARRAY = 10
+ WORDS_IN_ARRAY = 100000
+ MAX_WORD_SIZE = 10
+ EVAL_N_VALUES = 10
+
+ puts "NUMBER_OF_SORTING: #{NUMBER_OF_SORTING}"
+ puts "NUMBER_OF_ARRAY: #{NUMBER_OF_ARRAY}"
+ puts "WORDS_IN_ARRAY: #{WORDS_IN_ARRAY}"
+ puts "MAX_WORD_SIZE: #{MAX_WORD_SIZE}"
+ puts "EVAL_N_VALUES: #{EVAL_N_VALUES}"
+
+ def words
+   Array.new(WORDS_IN_ARRAY) { word }
+ end
+
+ def word
+   Array.new(rand(1..MAX_WORD_SIZE)){(97+rand(26)).chr}.join
+ end
+
+ @array = Array.new(NUMBER_OF_ARRAY) { words.sort }
+
+
+ # =================================================================================================
+ # Sort1
+
+ # Returns a new (unevaluated) enumerator
+ def sort1(data)
+   return to_enum(__callee__, data) unless block_given?
+
+   heap = []
+
+   # Initialize the heap with the first items
+   # and keep the enumerators themselves so .next can be called
+   data.each do |a|
+     heap << [a.next, a]
+   end
+
+   while data.any?
+     begin
+       # Sort the heap by value
+       heap.sort_by!{|(item,_)| item}
+       # Take the value and its enumerator
+       item, enum = heap.shift
+       # The value goes into the result
+       yield item
+       # Replace the removed item with the next one from the same list
+       heap << [enum.next, enum]
+     rescue StopIteration
+       # The enumerator is exhausted
+       data.delete(enum)
+     end
+   end
+ end
+
+
+ # =================================================================================================
+ # Sort1_2
+
+ # Returns a new (unevaluated) enumerator
+ def sort1_2(data)
+   return to_enum(__callee__, data) unless block_given?
+
+   heap = []
+   enums = []
+
+   # Initialize the heap with the first items
+   # and keep the enumerators themselves so .next can be called
+   data.each do |a|
+     EVAL_N_VALUES.times {
+       begin
+         heap << [a.next, a]
+       rescue StopIteration
+       end
+     }
+   end
+
+   while data.any? || heap.any?
+     # Sort the heap by value
+     heap.sort_by!{|(item,_)| item}
+
+     # At least EVAL_N_VALUES items can be taken safely
+     EVAL_N_VALUES.times {
+       break if heap.empty?
+
+       # Take the value and its enumerator
+       item, enum = heap.shift
+       # The value goes into the result
+       yield item
+
+       enums << enum
+     }
+
+     while (enum = enums.shift)
+       begin
+         heap << [enum.next, enum]
+       rescue StopIteration
+         data.delete(enum)
+         enums.delete(enum)
+       end
+     end
+
+   end
+ end
+
+
+ # =================================================================================================
+ # Sort 2
+
+ def sort2(data)
+   return to_enum(__callee__, data) unless block_given?
+
+   heap = Containers::Heap.new
+
+   data.each do |enum|
+     item = enum.next
+     heap.push(item, [item, enum])
+   end
+
+   while data.any?
+     begin
+       item, enum = heap.pop
+       yield item
+
+       item = enum.next
+       heap.push(item, [item, enum])
+     rescue StopIteration
+       data.delete(enum)
+     end
+   end
+ end
+
+
+ # =================================================================================================
+ # Benchmark
+
+ Benchmark.bm(10) do |x|
+   x.report("sort") do
+     NUMBER_OF_SORTING.times {
+       @result = @array.flatten.sort
+     }
+   end
+
+   x.report("sort 1") do
+     NUMBER_OF_SORTING.times {
+       raise "Bad sorting" if @result != sort1(@array.map(&:each)).to_a
+     }
+   end
+
+   x.report("sort 1_2") do
+     NUMBER_OF_SORTING.times {
+       raise "Bad sorting" if @result != sort1_2(@array.map(&:each)).to_a
+     }
+   end
+
+   # x.report("sort 2") do
+   #   NUMBER_OF_SORTING.times {
+   #     raise "Bad sorting" if @result != sort2(@array.map(&:each)).to_a
+   #   }
+   # end
+ end
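
(Illustration, not part of the package diff.) sort1 above performs a lazy k-way merge: it keeps one head element per pre-sorted input and always emits the smallest, so the merged output is itself sorted. Reusing the sort1 defined in this file:

    sorted_lists = [[1, 4, 7], [2, 5, 8], [3, 6, 9]]
    merged = sort1(sorted_lists.map(&:each))  # returns a lazy Enumerator
    merged.to_a                               # => [1, 2, 3, 4, 5, 6, 7, 8, 9]

sort1_2 amortizes the repeated re-sorting by pulling up to EVAL_N_VALUES items per pass, while sort2 replaces the sorted array with Containers::Heap from the algorithms gem.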
data/benchmark/take.rb ADDED
@@ -0,0 +1,28 @@
+ require "benchmark"
+
+ SIZE = 100_000_000
+
+ @array1 = (0..SIZE).to_a;
+ @array2 = (0..SIZE).to_a;
+ @array3 = (0..SIZE).to_a;
+
+ TAKE = 100_000
+
+ Benchmark.bm(15) do |x|
+   # Fastest
+   x.report("take"){
+     a=@array1.take(TAKE)
+   }
+
+   # Slowest and takes the most memory
+   x.report("reverse drop"){
+     @array2.reverse!
+     @array2.drop(@array2.size - TAKE)
+     @array2.reverse!
+   }
+
+   # Least memory
+   x.report("slice!"){
+     a=@array3.slice!(0, TAKE)
+   }
+ end
data/bin/ruby-spark ADDED
@@ -0,0 +1,8 @@
+ #!/usr/bin/env ruby
+
+ lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
+ $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
+
+ require 'ruby-spark'
+
+ Spark::CLI.new.run
data/example/pi.rb ADDED
@@ -0,0 +1,28 @@
+ #!/usr/bin/env ruby
+
+ lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
+ $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
+
+ require 'ruby-spark'
+
+ Spark.logger.disable
+ Spark.start
+
+ slices = 3
+ n = 100000 * slices
+
+ def map(_)
+   x = rand * 2 - 1
+   y = rand * 2 - 1
+
+   if x**2 + y**2 < 1
+     return 1
+   else
+     return 0
+   end
+ end
+
+ rdd = Spark.context.parallelize(1..n, slices, serializer: 'oj')
+ rdd = rdd.map(method(:map))
+
+ puts 'Pi is roughly %f' % (4.0 * rdd.sum / n)
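
(Note, not part of the package diff.) The 4.0 factor in pi.rb follows from the standard Monte Carlo argument: a point drawn uniformly from the square [-1, 1] x [-1, 1] lands inside the unit circle with probability pi * 1^2 / (2 * 2) = pi / 4. Since map returns 1 for points inside the circle and 0 otherwise, rdd.sum counts the hits, and pi ≈ 4 * rdd.sum / n, which is exactly the final line above.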
data/ext/ruby_c/extconf.rb ADDED
@@ -0,0 +1,3 @@
+ require 'mkmf'
+
+ create_makefile("ruby_spark_ext")
data/ext/ruby_c/murmur.c ADDED
@@ -0,0 +1,158 @@
+ #include "murmur.h"
+
+ #if defined(_MSC_VER)
+ #define BIG_CONSTANT(x) (x)
+ #else
+ #define BIG_CONSTANT(x) (x##LLU)
+ #endif
+
+ /*-----------------------------------------------------------------------------
+ // MurmurHash2, 64-bit versions, by Austin Appleby
+ //
+ // The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
+ // and endian-ness issues if used across multiple platforms.
+ //
+ // 64-bit hash for 64-bit platforms
+ */
+
+ uint64_t MurmurHash64A(const void * key, int len, uint64_t seed)
+ {
+   const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995);
+   const int r = 47;
+
+   uint64_t h = seed ^ (len * m);
+
+   const uint64_t * data = (const uint64_t *)key;
+   const uint64_t * end = data + (len/8);
+
+   while(data != end)
+   {
+     uint64_t k = *data++;
+
+     k *= m;
+     k ^= k >> r;
+     k *= m;
+
+     h ^= k;
+     h *= m;
+   }
+
+   const unsigned char * data2 = (const unsigned char*)data;
+
+   switch(len & 7)
+   {
+   case 7: h ^= ((uint64_t) data2[6]) << 48;
+   case 6: h ^= ((uint64_t) data2[5]) << 40;
+   case 5: h ^= ((uint64_t) data2[4]) << 32;
+   case 4: h ^= ((uint64_t) data2[3]) << 24;
+   case 3: h ^= ((uint64_t) data2[2]) << 16;
+   case 2: h ^= ((uint64_t) data2[1]) << 8;
+   case 1: h ^= ((uint64_t) data2[0]);
+           h *= m;
+   };
+
+   h ^= h >> r;
+   h *= m;
+   h ^= h >> r;
+
+   return h;
+ }
+
+ /* 64-bit hash for 32-bit platforms */
+
+ uint64_t MurmurHash64B(const void * key, int len, uint64_t seed)
+ {
+   const uint32_t m = 0x5bd1e995;
+   const int r = 24;
+
+   uint32_t h1 = ((uint32_t) seed) ^ len;
+   uint32_t h2 = ((uint32_t) (seed >> 32));
+
+   const uint32_t * data = (const uint32_t *)key;
+
+   while(len >= 8)
+   {
+     uint32_t k1 = *data++;
+     k1 *= m; k1 ^= k1 >> r; k1 *= m;
+     h1 *= m; h1 ^= k1;
+     len -= 4;
+
+     uint32_t k2 = *data++;
+     k2 *= m; k2 ^= k2 >> r; k2 *= m;
+     h2 *= m; h2 ^= k2;
+     len -= 4;
+   }
+
+   if(len >= 4)
+   {
+     uint32_t k1 = *data++;
+     k1 *= m; k1 ^= k1 >> r; k1 *= m;
+     h1 *= m; h1 ^= k1;
+     len -= 4;
+   }
+
+   switch(len)
+   {
+   case 3: h2 ^= ((unsigned char*)data)[2] << 16;
+   case 2: h2 ^= ((unsigned char*)data)[1] << 8;
+   case 1: h2 ^= ((unsigned char*)data)[0];
+           h2 *= m;
+   };
+
+   h1 ^= h2 >> 18; h1 *= m;
+   h2 ^= h1 >> 22; h2 *= m;
+   h1 ^= h2 >> 17; h1 *= m;
+   h2 ^= h1 >> 19; h2 *= m;
+
+   uint64_t h = h1;
+
+   h = (h << 32) | h2;
+
+   return h;
+ }
+
+
+
+ // ================================================================================================
+ // Ruby methods
+
+ #define PORTABLE_HASH_SEED 16154832
+
+
+ VALUE murmur2_digest(VALUE rb_str, uint64_t seed)
+ {
+   StringValue(rb_str);
+
+   void * key = RSTRING_PTR(rb_str);
+   long len = RSTRING_LEN(rb_str);
+
+   uint64_t result = MurmurHash64A(key, len, seed);
+
+   return LONG2FIX(result);
+ }
+
+ // ------------------------------------------------------------------------------------------------
+ // Spark::Digest::Murmur2.digest
+
+ VALUE method_murmur2_digest(int argc, VALUE *argv, VALUE klass)
+ {
+   if(argc == 0 || argc > 2){
+     rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
+   }
+
+   uint64_t seed = (argc == 1 ? 0 : NUM2UINT(argv[1]));
+
+   return murmur2_digest(argv[0], seed);
+ }
+
+ // ------------------------------------------------------------------------------------------------
+ // Spark::Digest.portable_hash
+
+ VALUE method_portable_hash(int argc, VALUE *argv, VALUE klass)
+ {
+   if(argc != 1){
+     rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
+   }
+
+   return murmur2_digest(argv[0], PORTABLE_HASH_SEED);
+ }
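
(Illustration, not part of the package diff.) Per the comments above, these two entry points back Spark::Digest::Murmur2.digest and Spark::Digest.portable_hash; the actual bindings live in data/ext/ruby_c/ruby-spark.c, which is listed but not shown in this excerpt. A hypothetical usage sketch, assuming the compiled extension loads as ruby_spark_ext (the name passed to create_makefile in the extconf.rb above):

    require 'ruby_spark_ext'

    # 64-bit MurmurHash2; the second argument (seed) is optional and defaults to 0.
    Spark::Digest::Murmur2.digest('ruby', 42)

    # Same hash, seeded with the fixed PORTABLE_HASH_SEED defined above.
    Spark::Digest.portable_hash('ruby')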