ruby-spark 1.1.0.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +252 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +6 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/comparison/prepare.sh +18 -0
  12. data/benchmark/comparison/python.py +156 -0
  13. data/benchmark/comparison/r.r +69 -0
  14. data/benchmark/comparison/ruby.rb +167 -0
  15. data/benchmark/comparison/run-all.sh +160 -0
  16. data/benchmark/comparison/scala.scala +181 -0
  17. data/benchmark/custom_marshal.rb +94 -0
  18. data/benchmark/digest.rb +150 -0
  19. data/benchmark/enumerator.rb +88 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/example/website_search.rb +83 -0
  27. data/ext/ruby_c/extconf.rb +3 -0
  28. data/ext/ruby_c/murmur.c +158 -0
  29. data/ext/ruby_c/murmur.h +9 -0
  30. data/ext/ruby_c/ruby-spark.c +18 -0
  31. data/ext/ruby_java/Digest.java +36 -0
  32. data/ext/ruby_java/Murmur2.java +98 -0
  33. data/ext/ruby_java/RubySparkExtService.java +28 -0
  34. data/ext/ruby_java/extconf.rb +3 -0
  35. data/ext/spark/build.sbt +73 -0
  36. data/ext/spark/project/plugins.sbt +9 -0
  37. data/ext/spark/sbt/sbt +34 -0
  38. data/ext/spark/src/main/scala/Exec.scala +91 -0
  39. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  40. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  41. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  42. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  43. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  44. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  46. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  47. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  48. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  49. data/ext/spark/src/main/scala/RubyRDD.scala +392 -0
  50. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  51. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  52. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  53. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  54. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  55. data/lib/ruby-spark.rb +1 -0
  56. data/lib/spark.rb +198 -0
  57. data/lib/spark/accumulator.rb +260 -0
  58. data/lib/spark/broadcast.rb +98 -0
  59. data/lib/spark/build.rb +43 -0
  60. data/lib/spark/cli.rb +169 -0
  61. data/lib/spark/command.rb +86 -0
  62. data/lib/spark/command/base.rb +158 -0
  63. data/lib/spark/command/basic.rb +345 -0
  64. data/lib/spark/command/pair.rb +124 -0
  65. data/lib/spark/command/sort.rb +51 -0
  66. data/lib/spark/command/statistic.rb +144 -0
  67. data/lib/spark/command_builder.rb +141 -0
  68. data/lib/spark/command_validator.rb +34 -0
  69. data/lib/spark/config.rb +238 -0
  70. data/lib/spark/constant.rb +14 -0
  71. data/lib/spark/context.rb +322 -0
  72. data/lib/spark/error.rb +50 -0
  73. data/lib/spark/ext/hash.rb +41 -0
  74. data/lib/spark/ext/integer.rb +25 -0
  75. data/lib/spark/ext/io.rb +67 -0
  76. data/lib/spark/ext/ip_socket.rb +29 -0
  77. data/lib/spark/ext/module.rb +58 -0
  78. data/lib/spark/ext/object.rb +24 -0
  79. data/lib/spark/ext/string.rb +24 -0
  80. data/lib/spark/helper.rb +10 -0
  81. data/lib/spark/helper/logger.rb +40 -0
  82. data/lib/spark/helper/parser.rb +85 -0
  83. data/lib/spark/helper/serialize.rb +71 -0
  84. data/lib/spark/helper/statistic.rb +93 -0
  85. data/lib/spark/helper/system.rb +42 -0
  86. data/lib/spark/java_bridge.rb +19 -0
  87. data/lib/spark/java_bridge/base.rb +203 -0
  88. data/lib/spark/java_bridge/jruby.rb +23 -0
  89. data/lib/spark/java_bridge/rjb.rb +41 -0
  90. data/lib/spark/logger.rb +76 -0
  91. data/lib/spark/mllib.rb +100 -0
  92. data/lib/spark/mllib/classification/common.rb +31 -0
  93. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  94. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  95. data/lib/spark/mllib/classification/svm.rb +135 -0
  96. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  97. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  98. data/lib/spark/mllib/matrix.rb +120 -0
  99. data/lib/spark/mllib/regression/common.rb +73 -0
  100. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  101. data/lib/spark/mllib/regression/lasso.rb +100 -0
  102. data/lib/spark/mllib/regression/linear.rb +124 -0
  103. data/lib/spark/mllib/regression/ridge.rb +97 -0
  104. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  105. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  106. data/lib/spark/mllib/stat/distribution.rb +12 -0
  107. data/lib/spark/mllib/vector.rb +185 -0
  108. data/lib/spark/rdd.rb +1377 -0
  109. data/lib/spark/sampler.rb +92 -0
  110. data/lib/spark/serializer.rb +79 -0
  111. data/lib/spark/serializer/auto_batched.rb +59 -0
  112. data/lib/spark/serializer/base.rb +63 -0
  113. data/lib/spark/serializer/batched.rb +84 -0
  114. data/lib/spark/serializer/cartesian.rb +13 -0
  115. data/lib/spark/serializer/compressed.rb +27 -0
  116. data/lib/spark/serializer/marshal.rb +17 -0
  117. data/lib/spark/serializer/message_pack.rb +23 -0
  118. data/lib/spark/serializer/oj.rb +23 -0
  119. data/lib/spark/serializer/pair.rb +41 -0
  120. data/lib/spark/serializer/text.rb +25 -0
  121. data/lib/spark/sort.rb +189 -0
  122. data/lib/spark/stat_counter.rb +125 -0
  123. data/lib/spark/storage_level.rb +39 -0
  124. data/lib/spark/version.rb +3 -0
  125. data/lib/spark/worker/master.rb +144 -0
  126. data/lib/spark/worker/spark_files.rb +15 -0
  127. data/lib/spark/worker/worker.rb +200 -0
  128. data/ruby-spark.gemspec +47 -0
  129. data/spec/generator.rb +37 -0
  130. data/spec/inputs/lorem_300.txt +316 -0
  131. data/spec/inputs/numbers/1.txt +50 -0
  132. data/spec/inputs/numbers/10.txt +50 -0
  133. data/spec/inputs/numbers/11.txt +50 -0
  134. data/spec/inputs/numbers/12.txt +50 -0
  135. data/spec/inputs/numbers/13.txt +50 -0
  136. data/spec/inputs/numbers/14.txt +50 -0
  137. data/spec/inputs/numbers/15.txt +50 -0
  138. data/spec/inputs/numbers/16.txt +50 -0
  139. data/spec/inputs/numbers/17.txt +50 -0
  140. data/spec/inputs/numbers/18.txt +50 -0
  141. data/spec/inputs/numbers/19.txt +50 -0
  142. data/spec/inputs/numbers/2.txt +50 -0
  143. data/spec/inputs/numbers/20.txt +50 -0
  144. data/spec/inputs/numbers/3.txt +50 -0
  145. data/spec/inputs/numbers/4.txt +50 -0
  146. data/spec/inputs/numbers/5.txt +50 -0
  147. data/spec/inputs/numbers/6.txt +50 -0
  148. data/spec/inputs/numbers/7.txt +50 -0
  149. data/spec/inputs/numbers/8.txt +50 -0
  150. data/spec/inputs/numbers/9.txt +50 -0
  151. data/spec/inputs/numbers_0_100.txt +101 -0
  152. data/spec/inputs/numbers_1_100.txt +100 -0
  153. data/spec/lib/collect_spec.rb +42 -0
  154. data/spec/lib/command_spec.rb +68 -0
  155. data/spec/lib/config_spec.rb +64 -0
  156. data/spec/lib/context_spec.rb +165 -0
  157. data/spec/lib/ext_spec.rb +72 -0
  158. data/spec/lib/external_apps_spec.rb +45 -0
  159. data/spec/lib/filter_spec.rb +80 -0
  160. data/spec/lib/flat_map_spec.rb +100 -0
  161. data/spec/lib/group_spec.rb +109 -0
  162. data/spec/lib/helper_spec.rb +19 -0
  163. data/spec/lib/key_spec.rb +41 -0
  164. data/spec/lib/manipulation_spec.rb +122 -0
  165. data/spec/lib/map_partitions_spec.rb +87 -0
  166. data/spec/lib/map_spec.rb +91 -0
  167. data/spec/lib/mllib/classification_spec.rb +54 -0
  168. data/spec/lib/mllib/clustering_spec.rb +35 -0
  169. data/spec/lib/mllib/matrix_spec.rb +32 -0
  170. data/spec/lib/mllib/regression_spec.rb +116 -0
  171. data/spec/lib/mllib/vector_spec.rb +77 -0
  172. data/spec/lib/reduce_by_key_spec.rb +118 -0
  173. data/spec/lib/reduce_spec.rb +131 -0
  174. data/spec/lib/sample_spec.rb +46 -0
  175. data/spec/lib/serializer_spec.rb +88 -0
  176. data/spec/lib/sort_spec.rb +58 -0
  177. data/spec/lib/statistic_spec.rb +170 -0
  178. data/spec/lib/whole_text_files_spec.rb +33 -0
  179. data/spec/spec_helper.rb +38 -0
  180. metadata +389 -0
@@ -0,0 +1,160 @@
#!/usr/bin/env bash

# Runs the ruby-spark comparison benchmarks (Ruby/Marshal, Ruby/Oj, Python,
# Scala, optionally R) against a generated random text file, then prints each
# language's timing log. Parameters may be overridden via the options below.

# Work relative to this script's directory.
cd "$(dirname "$0")"

# Exit immediately if a pipeline returns a non-zero status.
set -e

# Settings (exported so the per-language benchmark scripts can read them)
export WORKERS=2
export MATRIX_SIZE=100
export NUMBERS_COUNT=1000000
export TEXT_FILE=$(mktemp)
export PI_DIGIT=1000
export RUBY_BATCH_SIZE=2048

text_file_rows=10
text_file_per_line=10
text_file_duplicates=50

# JVM heap limits
mx="4096m"
ms="4096m"


# Parse arguments
while (( "$#" )); do
  case $1 in
    --workers)
      WORKERS="$2"
      shift
      ;;
    --matrix-size)
      MATRIX_SIZE="$2"
      shift
      ;;
    --numbers-count)
      NUMBERS_COUNT="$2"
      shift
      ;;
    --random-file-rows)
      text_file_rows="$2"
      shift
      ;;
    --text-file-per-line)
      text_file_per_line="$2"
      shift
      ;;
    --text-file-duplicates)
      text_file_duplicates="$2"
      shift
      ;;
    --pi-digit)
      PI_DIGIT="$2"
      shift
      ;;
    --ruby-batch-size)
      RUBY_BATCH_SIZE="$2"
      shift
      ;;
    --mx)
      mx="$2"
      shift
      ;;
    --ms)
      ms="$2"
      shift
      ;;
    *)
      break
      ;;
  esac
  shift
done


# Generating: build a small random-word file, then duplicate it to size.
# Expansions are quoted to survive paths containing spaces.
file=$(mktemp)

for (( i=0; i<text_file_rows; i++ ))
do
  shuf -n "$text_file_per_line" /usr/share/dict/words | tr '\n' ' ' >> "$file"
  echo >> "$file"
done

for (( i=0; i<text_file_duplicates; i++ ))
do
  cat "$file" >> "$TEXT_FILE"
done


# Before run: locate the Spark and SparkR installations.
if [[ -z "$SPARK_HOME" ]]; then
  export SPARK_HOME=$(pwd)/spark
fi

if [[ -z "$RSPARK_HOME" ]]; then
  export RSPARK_HOME=$(pwd)/rspark
fi

export SPARK_RUBY_BATCH_SIZE="$RUBY_BATCH_SIZE"
SPARK_CLASSPATH=$("$SPARK_HOME"/bin/compute-classpath.sh 2>/dev/null)

export _JAVA_OPTIONS="$_JAVA_OPTIONS -Xms$ms -Xmx$mx"


# Log files (one per benchmarked runtime)
export RUBY_MARSHAL_LOG=$(mktemp)
export RUBY_OJ_LOG=$(mktemp)
export PYTHON_LOG=$(mktemp)
export SCALA_LOG=$(mktemp)
export R_LOG=$(mktemp)


# Run:
echo "Workers: $WORKERS"
echo "Matrix size: $MATRIX_SIZE"
echo "Numbers count: $NUMBERS_COUNT"
echo "Pi digits: $PI_DIGIT"
echo "File: rows = $(($text_file_rows * $text_file_duplicates))"
echo "      per line = $text_file_per_line"

# --- Ruby
export SPARK_RUBY_SERIALIZER='marshal'
export RUBY_LOG="$RUBY_MARSHAL_LOG"
/usr/bin/env ruby ruby.rb &>/dev/null

export SPARK_RUBY_SERIALIZER='oj'
export RUBY_LOG="$RUBY_OJ_LOG"
/usr/bin/env ruby ruby.rb &>/dev/null

# # --- Python
"$SPARK_HOME"/bin/spark-submit --master "local[*]" $(pwd)/python.py &>/dev/null

# # --- Scala
# The classpath is quoted: it must reach scalac as a single argument.
/usr/bin/env scalac -cp "$SPARK_CLASSPATH" scala.scala -d scala.jar &>/dev/null
"$SPARK_HOME"/bin/spark-submit --master "local[*]" $(pwd)/scala.jar &>/dev/null

# --- R
# "$RSPARK_HOME"/sparkR r.r #&>/dev/null


# Parse results
echo "# Ruby (Marshal)"
cat "$RUBY_MARSHAL_LOG"
echo ""

echo "# Ruby (Oj)"
cat "$RUBY_OJ_LOG"
echo ""

echo "# Python"
cat "$PYTHON_LOG"
echo ""

echo "# Scala"
cat "$SCALA_LOG"
echo ""

echo "# R"
cat "$R_LOG"
@@ -0,0 +1,181 @@
// Scala counterpart of the ruby-spark comparison benchmarks.
// Parameters arrive via environment variables (set by run-all.sh) and each
// measurement is appended to the SCALA_LOG file as a "name;seconds" record.
import java.io._
import scala.math
import scala.io.Source
import org.apache.spark._

object Scala {

  val logFile = new PrintWriter(new File(System.getenv("SCALA_LOG")))

  // Append one semicolon-separated result record.
  def log(args: Any*) {
    logFile.write(args.mkString(";"))
    logFile.write("\n")
  }

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Scala")
    val sc = new SparkContext(conf)

    val workers = System.getenv("WORKERS").toInt
    val numbersCount = System.getenv("NUMBERS_COUNT").toInt
    val textFile = System.getenv("TEXT_FILE")

    val numbers = 0 until numbersCount
    val floats = numbers.map(_.toDouble)
    // Close the file handle once the contents are read (it was leaked before).
    val source = Source.fromFile(textFile)
    val strings = try source.mkString.split("\n") finally source.close()


    // =============================================================================
    // Serialization
    // =============================================================================

    var time: Long = 0

    time = System.currentTimeMillis
    val rddNumbers = sc.parallelize(numbers, workers)
    time = System.currentTimeMillis - time

    log("NumbersSerialization", time/1000.0)


    time = System.currentTimeMillis
    val rddFloats = sc.parallelize(floats, workers)
    time = System.currentTimeMillis - time

    log("FloatsSerialization", time/1000.0)


    time = System.currentTimeMillis
    val rddStrings = sc.parallelize(strings, workers)
    time = System.currentTimeMillis - time

    log("StringsSerialization", time/1000.0)


    // =============================================================================
    // Computing
    // =============================================================================

    // --- Is prime? ---------------------------------------------------------------

    time = System.currentTimeMillis
    val primes = rddNumbers.map{ x =>
      if(x < 2){
        (x, false)
      }
      else if(x == 2){
        (x, true)
      }
      else if(x % 2 == 0){
        (x, false)
      }
      else{
        // Trial division by odd candidates up to sqrt(x).
        val upper = math.sqrt(x.toDouble).toInt
        var result = true

        var i = 3
        while(i <= upper && result == true){
          if(x % i == 0){
            result = false
          }
          else{
            i += 2
          }
        }

        (x, result)
      }
    }
    primes.collect()
    time = System.currentTimeMillis - time

    log("IsPrime", time/1000.0)


    // --- Matrix multiplication ---------------------------------------------------

    val matrixSize = System.getenv("MATRIX_SIZE").toInt

    val matrix = new Array[Array[Long]](matrixSize)

    for( row <- 0 until matrixSize ) {
      matrix(row) = new Array[Long](matrixSize)
      for( col <- 0 until matrixSize ) {
        matrix(row)(col) = row + col
      }
    }

    time = System.currentTimeMillis
    val rdd = sc.parallelize(matrix, 1)
    // collect() forces evaluation: mapPartitions alone is lazy, so without an
    // action the timed section would measure only the RDD definition.
    rdd.mapPartitions { it =>
      val matrix = it.toArray
      val size = matrix.size

      val newMatrix = new Array[Array[Long]](size)

      for( row <- 0 until size ) {
        newMatrix(row) = new Array[Long](size)
        for( col <- 0 until size ) {

          var result: Long = 0
          for( i <- 0 until size ) {
            result += matrix(row)(i) * matrix(col)(i)
          }
          newMatrix(row)(col) = result
        }
      }

      newMatrix.toIterator
    }.collect()
    time = System.currentTimeMillis - time

    log("MatrixMultiplication", time/1000.0)


    // --- Pi digits ---------------------------------------------------------------
    // Unbounded spigot algorithm: http://rosettacode.org/wiki/Pi#Scala

    val piDigit = System.getenv("PI_DIGIT").toInt

    time = System.currentTimeMillis
    val piDigits = sc.parallelize(Array(piDigit), 1)
    // As above, collect() is required to actually run the computation.
    piDigits.mapPartitions { it =>
      var size = it.toArray.asInstanceOf[Array[Int]](0)
      var result = ""

      var r: BigInt = 0
      var q, t, k: BigInt = 1
      var n, l: BigInt = 3
      var nr, nn: BigInt = 0

      while(size > 0){
        while((4*q+r-t) >= (n*t)){
          nr = (2*q+r)*l
          nn = (q*(7*k)+2+(r*l))/(t*l)
          q = q * k
          t = t * l
          l = l + 2
          k = k + 1
          n = nn
          r = nr
        }

        result += n.toString
        size -= 1
        nr = 10*(r-n*t)
        n = ((10*(3*q+r))/t)-(10*n)
        q = q * 10
        r = nr
      }

      Iterator(result)
    }.collect()
    time = System.currentTimeMillis - time

    log("PiDigit", time/1000.0)


    sc.stop()
    logFile.close()
  }
}
@@ -0,0 +1,94 @@
1
+ require 'benchmark'
2
+ require 'benchmark/ips'
3
+
# Serialize +data+ as a 4-byte big-endian signed integer string.
def pack_int(data)
  packed = [data]
  packed.pack('l>')
end
7
+
# Serialize +data+ as an 8-byte big-endian signed integer string.
def pack_long(data)
  packed = [data]
  packed.pack('q>')
end
11
+
# Serialize an array of numbers as big-endian IEEE-754 doubles.
def pack_doubles(data)
  data.pack('G' * data.size)
end
15
+
# Value objects that rely on Ruby's default marshalling hooks
# (marshal_dump / marshal_load) — the "standard" side of the benchmark.
module Standard
  # Wraps a raw array of numeric values.
  class Vector
    def initialize(array)
      @values = array
    end

    # Serialized as a one-element array holding the raw values.
    def marshal_dump
      [@values]
    end

    # Loading is a no-op: the benchmark only measures dumping.
    def marshal_load(*)
    end
  end

  # A labelled feature vector; the features array is wrapped in a Vector.
  class LabeledPoint
    def initialize(label, features)
      @label = label
      @features = Standard::Vector.new(features)
    end

    def marshal_dump
      [@label, @features]
    end

    def marshal_load(*)
    end
  end
end
44
+
# Value objects with hand-rolled binary marshalling via _dump / _load —
# the "custom" side of the benchmark.
module Custom
  # Serialized as: 'v' + int32(size) + big-endian doubles.
  class Vector
    def initialize(array)
      @values = array
    end

    def _dump(*)
      header = 'v'
      header << pack_int(@values.size)
      body = pack_doubles(@values)
      (header << body).encode(Encoding::ASCII_8BIT)
    end

    # Loading is a no-op: the benchmark only measures dumping.
    def self._load(*)
    end
  end

  # Serialized as: int64(label) + dumped feature vector.
  class LabeledPoint
    def initialize(label, features)
      @label = label
      @features = Custom::Vector.new(features)
    end

    def _dump(*)
      packed_label = pack_long(@label)
      packed_label + @features._dump
    end

    def self._load(*)
    end
  end
end
76
+
# Benchmark: Marshal.dump throughput of default marshalling (Standard)
# versus hand-rolled binary _dump methods (Custom).
data_size   = 10_000
vector_size = 1_000
# One shared array of random values, reused by every LabeledPoint.
# (Unused block parameter removed; rand takes no index.)
values = Array.new(vector_size) { rand(10_000..100_000) }

@data1 = Array.new(data_size) { |i| Standard::LabeledPoint.new(i, values) }
@data2 = Array.new(data_size) { |i| Custom::LabeledPoint.new(i, values) }

Benchmark.ips do |r|
  r.report('standard') do
    Marshal.dump(@data1)
  end

  r.report('custom') do
    Marshal.dump(@data2)
  end

  r.compare!
end
@@ -0,0 +1,150 @@
# Prefer the gem's local lib/ directory when running from a source checkout.
lib = File.expand_path('../lib', File.dirname(__FILE__))
unless !File.directory?(lib) || $LOAD_PATH.include?(lib)
  $LOAD_PATH.unshift(lib)
end
3
+
# True when running on JRuby (used to skip MRI-only C-extension gems).
# Returns a real boolean instead of the 0/nil that `=~` yields.
def java?
  RUBY_PLATFORM.include?('java')
end
7
+
# The murmurhash3 gem ships a C extension, so it is unavailable on JRuby.
require 'murmurhash3' unless java?
11
+
12
+ require 'digest'
13
+ require 'benchmark'
14
+ require 'ruby-spark'
15
+
# Number of benchmark iterations; each iteration hashes every word once.
TEST = 5_000_000
# Sample words of varying length to hash (frozen: constants should be immutable).
WORDS = ["wefwefwef", "rgwefiwefwe", "a", "rujfwgrethrzjrhgawf", "irncrnuggo"].freeze

puts "TEST COUNT = #{TEST*WORDS.size}"
20
+
21
+ # =================================================================================================
22
+ # Pure ruby murmur
23
+ # funny-falcon/murmurhash3-ruby
24
+
# Mask selecting the low 32 bits (murmur3 operates on 32-bit words).
MASK32 = 0xffffffff

# Rotate the 32-bit value +x+ left by +r+ bits.
def murmur3_32_rotl(x, r)
  shifted_up   = (x << r) & MASK32
  shifted_down = (x >> (32 - r)) & MASK32
  shifted_up | shifted_down
end
30
+
# Final avalanche mix of murmur3: spreads the bits of +h+ so that
# small input differences flip roughly half the output bits.
def murmur3_32_fmix(h)
  mixed = h & MASK32
  mixed ^= mixed >> 16
  mixed = (mixed * 0x85ebca6b) & MASK32
  mixed ^= mixed >> 13
  mixed = (mixed * 0xc2b2ae35) & MASK32
  mixed ^ (mixed >> 16)
end
39
+
# Pre-mix applied to each 32-bit input word before it is folded
# into the running hash (multiply, rotate, multiply).
def murmur3_32__mmix(k1)
  word = (k1 * 0xcc9e2d51) & MASK32
  word = murmur3_32_rotl(word, 15)
  (word * 0x1b873593) & MASK32
end
45
+
# Pure-Ruby MurmurHash3 x86 32-bit of +str+'s bytes (after
# funny-falcon/murmurhash3-ruby). Returns an unsigned 32-bit Integer.
def murmur3_32_str_hash(str, seed=0)
  h1 = seed
  # 'V*' takes as many complete 32-bit little-endian words as fit;
  # 'C*' then picks up the 0-3 leftover tail bytes as individual integers.
  numbers = str.unpack('V*C*')
  tailn = str.bytesize % 4
  tail = numbers.slice!(numbers.size - tailn, tailn)

  # Body: fold each full 32-bit word into the running hash.
  # (each instead of for: for leaks its loop variable into the method scope.)
  numbers.each do |k1|
    h1 ^= murmur3_32__mmix(k1)
    h1 = murmur3_32_rotl(h1, 13)
    h1 = (h1*5 + 0xe6546b64) & MASK32
  end

  # Tail: assemble the remaining bytes into one little-endian word and mix.
  unless tail.empty?
    k1 = 0
    tail.reverse_each do |c1|
      k1 = (k1 << 8) | c1
    end
    h1 ^= murmur3_32__mmix(k1)
  end

  # Finalization: incorporate the length, then avalanche.
  h1 ^= str.bytesize
  murmur3_32_fmix(h1)
end
68
+
69
+
70
+ # =================================================================================================
71
+ # Benchmark
72
+
# Compare hashing throughput: Ruby's built-in String#hash, the gem's native
# Spark::Digest extension and the murmurhash3 gem (C extension, MRI only).
# SHA256/MD5 and the pure-Ruby murmur (far too slow) were tried earlier and
# are left out of the timed run.
Benchmark.bm(18) do |bench|
  bench.report("ruby hash") do
    TEST.times do
      WORDS.each { |word| word.hash }
    end
  end

  bench.report("ext portable") do
    TEST.times do
      WORDS.each { |word| Spark::Digest.portable_hash(word) }
    end
  end

  unless java?
    bench.report("murmur3 32") do
      TEST.times do
        WORDS.each { |word| MurmurHash3::V32.str_hash(word) }
      end
    end

    bench.report("murmur3 128") do
      TEST.times do
        WORDS.each { |word| MurmurHash3::V128.str_hash(word) }
      end
    end
  end
end