ruby-spark 1.0.0

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (176)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +185 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +7 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/custom_marshal.rb +94 -0
  12. data/benchmark/digest.rb +150 -0
  13. data/benchmark/enumerator.rb +88 -0
  14. data/benchmark/performance/prepare.sh +18 -0
  15. data/benchmark/performance/python.py +156 -0
  16. data/benchmark/performance/r.r +69 -0
  17. data/benchmark/performance/ruby.rb +167 -0
  18. data/benchmark/performance/run-all.sh +160 -0
  19. data/benchmark/performance/scala.scala +181 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/ext/ruby_c/extconf.rb +3 -0
  27. data/ext/ruby_c/murmur.c +158 -0
  28. data/ext/ruby_c/murmur.h +9 -0
  29. data/ext/ruby_c/ruby-spark.c +18 -0
  30. data/ext/ruby_java/Digest.java +36 -0
  31. data/ext/ruby_java/Murmur2.java +98 -0
  32. data/ext/ruby_java/RubySparkExtService.java +28 -0
  33. data/ext/ruby_java/extconf.rb +3 -0
  34. data/ext/spark/build.sbt +73 -0
  35. data/ext/spark/project/plugins.sbt +9 -0
  36. data/ext/spark/sbt/sbt +34 -0
  37. data/ext/spark/src/main/scala/Exec.scala +91 -0
  38. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  39. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  40. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  41. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  42. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  43. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  44. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  46. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  47. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  48. data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
  49. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  50. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  51. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  52. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  53. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  54. data/lib/ruby-spark.rb +1 -0
  55. data/lib/spark.rb +198 -0
  56. data/lib/spark/accumulator.rb +260 -0
  57. data/lib/spark/broadcast.rb +98 -0
  58. data/lib/spark/build.rb +43 -0
  59. data/lib/spark/cli.rb +169 -0
  60. data/lib/spark/command.rb +86 -0
  61. data/lib/spark/command/base.rb +154 -0
  62. data/lib/spark/command/basic.rb +345 -0
  63. data/lib/spark/command/pair.rb +124 -0
  64. data/lib/spark/command/sort.rb +51 -0
  65. data/lib/spark/command/statistic.rb +144 -0
  66. data/lib/spark/command_builder.rb +141 -0
  67. data/lib/spark/command_validator.rb +34 -0
  68. data/lib/spark/config.rb +244 -0
  69. data/lib/spark/constant.rb +14 -0
  70. data/lib/spark/context.rb +304 -0
  71. data/lib/spark/error.rb +50 -0
  72. data/lib/spark/ext/hash.rb +41 -0
  73. data/lib/spark/ext/integer.rb +25 -0
  74. data/lib/spark/ext/io.rb +57 -0
  75. data/lib/spark/ext/ip_socket.rb +29 -0
  76. data/lib/spark/ext/module.rb +58 -0
  77. data/lib/spark/ext/object.rb +24 -0
  78. data/lib/spark/ext/string.rb +24 -0
  79. data/lib/spark/helper.rb +10 -0
  80. data/lib/spark/helper/logger.rb +40 -0
  81. data/lib/spark/helper/parser.rb +85 -0
  82. data/lib/spark/helper/serialize.rb +71 -0
  83. data/lib/spark/helper/statistic.rb +93 -0
  84. data/lib/spark/helper/system.rb +42 -0
  85. data/lib/spark/java_bridge.rb +19 -0
  86. data/lib/spark/java_bridge/base.rb +203 -0
  87. data/lib/spark/java_bridge/jruby.rb +23 -0
  88. data/lib/spark/java_bridge/rjb.rb +41 -0
  89. data/lib/spark/logger.rb +76 -0
  90. data/lib/spark/mllib.rb +100 -0
  91. data/lib/spark/mllib/classification/common.rb +31 -0
  92. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  93. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  94. data/lib/spark/mllib/classification/svm.rb +135 -0
  95. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  96. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  97. data/lib/spark/mllib/matrix.rb +120 -0
  98. data/lib/spark/mllib/regression/common.rb +73 -0
  99. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  100. data/lib/spark/mllib/regression/lasso.rb +100 -0
  101. data/lib/spark/mllib/regression/linear.rb +124 -0
  102. data/lib/spark/mllib/regression/ridge.rb +97 -0
  103. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  104. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  105. data/lib/spark/mllib/stat/distribution.rb +12 -0
  106. data/lib/spark/mllib/vector.rb +185 -0
  107. data/lib/spark/rdd.rb +1328 -0
  108. data/lib/spark/sampler.rb +92 -0
  109. data/lib/spark/serializer.rb +24 -0
  110. data/lib/spark/serializer/base.rb +170 -0
  111. data/lib/spark/serializer/cartesian.rb +37 -0
  112. data/lib/spark/serializer/marshal.rb +19 -0
  113. data/lib/spark/serializer/message_pack.rb +25 -0
  114. data/lib/spark/serializer/oj.rb +25 -0
  115. data/lib/spark/serializer/pair.rb +27 -0
  116. data/lib/spark/serializer/utf8.rb +25 -0
  117. data/lib/spark/sort.rb +189 -0
  118. data/lib/spark/stat_counter.rb +125 -0
  119. data/lib/spark/storage_level.rb +39 -0
  120. data/lib/spark/version.rb +3 -0
  121. data/lib/spark/worker/master.rb +144 -0
  122. data/lib/spark/worker/spark_files.rb +15 -0
  123. data/lib/spark/worker/worker.rb +197 -0
  124. data/ruby-spark.gemspec +36 -0
  125. data/spec/generator.rb +37 -0
  126. data/spec/inputs/lorem_300.txt +316 -0
  127. data/spec/inputs/numbers/1.txt +50 -0
  128. data/spec/inputs/numbers/10.txt +50 -0
  129. data/spec/inputs/numbers/11.txt +50 -0
  130. data/spec/inputs/numbers/12.txt +50 -0
  131. data/spec/inputs/numbers/13.txt +50 -0
  132. data/spec/inputs/numbers/14.txt +50 -0
  133. data/spec/inputs/numbers/15.txt +50 -0
  134. data/spec/inputs/numbers/16.txt +50 -0
  135. data/spec/inputs/numbers/17.txt +50 -0
  136. data/spec/inputs/numbers/18.txt +50 -0
  137. data/spec/inputs/numbers/19.txt +50 -0
  138. data/spec/inputs/numbers/2.txt +50 -0
  139. data/spec/inputs/numbers/20.txt +50 -0
  140. data/spec/inputs/numbers/3.txt +50 -0
  141. data/spec/inputs/numbers/4.txt +50 -0
  142. data/spec/inputs/numbers/5.txt +50 -0
  143. data/spec/inputs/numbers/6.txt +50 -0
  144. data/spec/inputs/numbers/7.txt +50 -0
  145. data/spec/inputs/numbers/8.txt +50 -0
  146. data/spec/inputs/numbers/9.txt +50 -0
  147. data/spec/inputs/numbers_0_100.txt +101 -0
  148. data/spec/inputs/numbers_1_100.txt +100 -0
  149. data/spec/lib/collect_spec.rb +42 -0
  150. data/spec/lib/command_spec.rb +68 -0
  151. data/spec/lib/config_spec.rb +64 -0
  152. data/spec/lib/context_spec.rb +163 -0
  153. data/spec/lib/ext_spec.rb +72 -0
  154. data/spec/lib/external_apps_spec.rb +45 -0
  155. data/spec/lib/filter_spec.rb +80 -0
  156. data/spec/lib/flat_map_spec.rb +100 -0
  157. data/spec/lib/group_spec.rb +109 -0
  158. data/spec/lib/helper_spec.rb +19 -0
  159. data/spec/lib/key_spec.rb +41 -0
  160. data/spec/lib/manipulation_spec.rb +114 -0
  161. data/spec/lib/map_partitions_spec.rb +87 -0
  162. data/spec/lib/map_spec.rb +91 -0
  163. data/spec/lib/mllib/classification_spec.rb +54 -0
  164. data/spec/lib/mllib/clustering_spec.rb +35 -0
  165. data/spec/lib/mllib/matrix_spec.rb +32 -0
  166. data/spec/lib/mllib/regression_spec.rb +116 -0
  167. data/spec/lib/mllib/vector_spec.rb +77 -0
  168. data/spec/lib/reduce_by_key_spec.rb +118 -0
  169. data/spec/lib/reduce_spec.rb +131 -0
  170. data/spec/lib/sample_spec.rb +46 -0
  171. data/spec/lib/serializer_spec.rb +13 -0
  172. data/spec/lib/sort_spec.rb +58 -0
  173. data/spec/lib/statistic_spec.rb +168 -0
  174. data/spec/lib/whole_text_files_spec.rb +33 -0
  175. data/spec/spec_helper.rb +39 -0
  176. metadata +301 -0
data/benchmark/performance/r.r
@@ -0,0 +1,69 @@
+ library(SparkR)
+ sc <- sparkR.init(master="local[*]")
+
+ logFile <- file(Sys.getenv("R_LOG"), "w")
+
+ logInfo <- function(...){
+   args <- list(...)
+   line <- paste(args, collapse = ";")
+   writeLines(line, logFile)
+ }
+
+ workers <- as.integer(Sys.getenv('WORKERS'))
+ numbersCount <- as.integer(Sys.getenv('NUMBERS_COUNT'))
+ textFile <- Sys.getenv('TEXT_FILE')
+
+
+ # =============================================================================
+ # Serialization
+ # =============================================================================
+
+ time <- proc.time()
+ rddNumbers <- parallelize(sc, as.numeric(seq(0, numbersCount)), workers)
+ time <- as.double(proc.time()-time)[3]
+
+ logInfo('NumbersSerialization', time)
+
+
+ # =============================================================================
+ # Computing
+ # =============================================================================
+
+ isPrime <- function(x) {
+   if(x < 2){
+     c(x, FALSE)
+   }
+   else if(x == 2){
+     c(x, TRUE)
+   }
+   else if(x %% 2 == 0){
+     c(x, FALSE)
+   }
+   else{
+     upper <- as.numeric(sqrt(as.double(x)))
+     result <- TRUE
+
+     i <- 3
+     while(i <= upper){
+       if(x %% i == 0){
+         result <- FALSE
+         break
+       }
+
+       i <- i+2
+     }
+
+     c(x, result)
+   }
+ }
+
+ time <- proc.time()
+ rdd <- map(rddNumbers, isPrime)
+ capture.output(collect(rdd), file='/dev/null')
+ time <- as.double(proc.time()-time)[3]
+
+ logInfo('IsPrime', time)
+
+
+ close(logFile)
+ sparkR.stop()
data/benchmark/performance/ruby.rb
@@ -0,0 +1,167 @@
+ #!/usr/bin/env ruby
+
+ lib = File.expand_path(File.dirname(__FILE__) + '/../../lib')
+ $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
+
+ require 'ruby-spark'
+ require 'benchmark'
+
+ Spark.start
+ sc = Spark.context
+
+ $log_file = File.open(ENV['RUBY_LOG'], 'w')
+
+ def log(*values)
+   $log_file.puts(values.join(';'))
+ end
+
+ workers = ENV['WORKERS'].to_i
+ numbers_count = ENV['NUMBERS_COUNT'].to_i
+ text_file = ENV['TEXT_FILE']
+
+ numbers = (0...numbers_count).to_a
+ floats = numbers.map(&:to_f)
+ strings = File.read(text_file).split("\n")
+
+
+ # =============================================================================
+ # Serialization
+ # =============================================================================
+
+ time = Benchmark.realtime do
+   @rdd_numbers = sc.parallelize(numbers, workers)
+ end
+
+ log('NumbersSerialization', time)
+
+
+ time = Benchmark.realtime do
+   @rdd_floats = sc.parallelize(floats, workers)
+ end
+
+ log('FloatsSerialization', time)
+
+
+ time = Benchmark.realtime do
+   @rdd_strings = sc.parallelize(strings, workers)
+ end
+
+ log('StringsSerialization', time)
+
+
+ # =============================================================================
+ # Computing
+ # =============================================================================
+
+
+ # --- Is prime? ---------------------------------------------------------------
+
+ is_prime = Proc.new do |x|
+   case
+   when x < 2
+     [x, false]
+   when x == 2
+     [x, true]
+   when x % 2 == 0
+     [x, false]
+   else
+     upper = Math.sqrt(x.to_f).to_i
+     result = true
+
+     i = 3
+     while i <= upper
+       if x % i == 0
+         result = false
+         break
+       end
+
+       i += 2
+     end
+
+     [x, result]
+   end
+ end
+
+ time = Benchmark.realtime do
+   @rdd_numbers.map(is_prime).collect
+ end
+
+ log('IsPrime', time)
+
+
+ # --- Matrix multiplication ---------------------------------------------------
+
+ matrix_size = ENV['MATRIX_SIZE'].to_i
+
+ matrix = Array.new(matrix_size) do |row|
+   Array.new(matrix_size) do |col|
+     row+col
+   end
+ end
+
+ multiplication_func = Proc.new do |matrix|
+   size = matrix.size
+
+   Array.new(size) do |row|
+     Array.new(size) do |col|
+       result = 0
+       size.times do |i|
+         result += matrix[row][i] * matrix[col][i]
+       end
+       result
+     end
+   end
+ end
+
+ time = Benchmark.realtime do
+   rdd = sc.parallelize(matrix, 1)
+   rdd.map_partitions(multiplication_func).collect
+ end
+
+ log('MatrixMultiplication', time)
+
+
+ # --- Pi digits ---------------------------------------------------------------
+ # http://rosettacode.org/wiki/Pi#Ruby
+
+ pi_digit = ENV['PI_DIGIT'].to_i
+
+ pi_func = Proc.new do |size|
+   size = size.first
+   result = ''
+
+   q, r, t, k, n, l = 1, 0, 1, 1, 3, 3
+   while size > 0
+     if 4*q+r-t < n*t
+       result << n.to_s
+       size -= 1
+       nr = 10*(r-n*t)
+       n = ((10*(3*q+r)) / t) - 10*n
+       q *= 10
+       r = nr
+     else
+       nr = (2*q+r) * l
+       nn = (q*(7*k+2)+r*l) / (t*l)
+       q *= k
+       t *= l
+       l += 2
+       k += 1
+       n = nn
+       r = nr
+     end
+   end
+
+   [result]
+ end
+
+ time = Benchmark.realtime do
+   rdd = sc.parallelize([pi_digit], 1)
+   rdd.map_partitions(pi_func).collect
+ end
+
+ log('PiDigit', time)
+
+
+ $log_file.close
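
For context, each benchmark script in this release reads its configuration from environment variables rather than command-line arguments; run-all.sh (next file) exports them before launching each runtime. As an illustrative sketch only, a stand-alone run of ruby.rb could look like the following, where words.txt and ruby.log are hypothetical placeholder paths and the numeric values mirror the defaults in run-all.sh:

  WORKERS=2 NUMBERS_COUNT=1000000 MATRIX_SIZE=100 PI_DIGIT=1000 \
  TEXT_FILE=words.txt RUBY_LOG=ruby.log SPARK_RUBY_SERIALIZER=marshal \
  ruby ruby.rb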
data/benchmark/performance/run-all.sh
@@ -0,0 +1,160 @@
+ #!/usr/bin/env bash
+
+ # Change to this script's directory
+ cd "$(dirname "$0")"
+
+ # Exit immediately if a pipeline returns a non-zero status.
+ set -e
+
+ # Settings
+ export WORKERS=2
+ export MATRIX_SIZE=100
+ export NUMBERS_COUNT=1000000
+ export TEXT_FILE=$(mktemp)
+ export PI_DIGIT=1000
+ export RUBY_BATCH_SIZE=2048
+
+ text_file_rows=10
+ text_file_per_line=10
+ text_file_duplicates=50
+
+ mx="4096m"
+ ms="4096m"
+
+
+ # Parse arguments
+ while (( "$#" )); do
+   case $1 in
+     --workers)
+       WORKERS="$2"
+       shift
+       ;;
+     --matrix-size)
+       MATRIX_SIZE="$2"
+       shift
+       ;;
+     --numbers-count)
+       NUMBERS_COUNT="$2"
+       shift
+       ;;
+     --random-file-rows)
+       text_file_rows="$2"
+       shift
+       ;;
+     --text-file-per-line)
+       text_file_per_line="$2"
+       shift
+       ;;
+     --text-file-duplicates)
+       text_file_duplicates="$2"
+       shift
+       ;;
+     --pi-digit)
+       PI_DIGIT="$2"
+       shift
+       ;;
+     --ruby-batch-size)
+       RUBY_BATCH_SIZE="$2"
+       shift
+       ;;
+     --mx)
+       mx="$2"
+       shift
+       ;;
+     --ms)
+       ms="$2"
+       shift
+       ;;
+     *)
+       break
+       ;;
+   esac
+   shift
+ done
+
+
+ # Generate the input text file
+ file=$(mktemp)
+
+ for (( i=0; i<$text_file_rows; i++ ))
+ do
+   shuf -n $text_file_per_line /usr/share/dict/words | tr '\n' ' ' >> $file
+   echo >> $file
+ done
+
+ for (( i=0; i<$text_file_duplicates; i++ ))
+ do
+   cat $file >> $TEXT_FILE
+ done
+
+
+ # Before run
+ if [[ -z "$SPARK_HOME" ]]; then
+   export SPARK_HOME=$(pwd)/spark
+ fi
+
+ if [[ -z "$RSPARK_HOME" ]]; then
+   export RSPARK_HOME=$(pwd)/rspark
+ fi
+
+ export SPARK_RUBY_BATCH_SIZE="$RUBY_BATCH_SIZE"
+ SPARK_CLASSPATH=$($SPARK_HOME/bin/compute-classpath.sh 2>/dev/null)
+
+ export _JAVA_OPTIONS="$_JAVA_OPTIONS -Xms$ms -Xmx$mx"
+
+
+ # Log files
+ export RUBY_MARSHAL_LOG=$(mktemp)
+ export RUBY_OJ_LOG=$(mktemp)
+ export PYTHON_LOG=$(mktemp)
+ export SCALA_LOG=$(mktemp)
+ export R_LOG=$(mktemp)
+
+
+ # Run:
+ echo "Workers: $WORKERS"
+ echo "Matrix size: $MATRIX_SIZE"
+ echo "Numbers count: $NUMBERS_COUNT"
+ echo "Pi digits: $PI_DIGIT"
+ echo "File: rows = $(($text_file_rows * $text_file_duplicates))"
+ echo "      per line = $text_file_per_line"
+
+ # --- Ruby
+ export SPARK_RUBY_SERIALIZER='marshal'
+ export RUBY_LOG="$RUBY_MARSHAL_LOG"
+ /usr/bin/env ruby ruby.rb &>/dev/null
+
+ export SPARK_RUBY_SERIALIZER='oj'
+ export RUBY_LOG="$RUBY_OJ_LOG"
+ /usr/bin/env ruby ruby.rb &>/dev/null
+
+ # --- Python
+ "$SPARK_HOME"/bin/spark-submit --master "local[*]" $(pwd)/python.py &>/dev/null
+
+ # --- Scala
+ /usr/bin/env scalac -cp $SPARK_CLASSPATH scala.scala -d scala.jar &>/dev/null
+ "$SPARK_HOME"/bin/spark-submit --master "local[*]" $(pwd)/scala.jar &>/dev/null
+
+ # --- R
+ # "$RSPARK_HOME"/sparkR r.r #&>/dev/null
+
+
+ # Parse results
+ echo "# Ruby (Marshal)"
+ cat $RUBY_MARSHAL_LOG
+ echo ""
+
+ echo "# Ruby (Oj)"
+ cat $RUBY_OJ_LOG
+ echo ""
+
+ echo "# Python"
+ cat $PYTHON_LOG
+ echo ""
+
+ echo "# Scala"
+ cat $SCALA_LOG
+ echo ""
+
+ echo "# R"
+ cat $R_LOG
data/benchmark/performance/scala.scala
@@ -0,0 +1,181 @@
+ import java.io._
+ import scala.math
+ import scala.io.Source
+ import org.apache.spark._
+
+ object Scala {
+
+   val logFile = new PrintWriter(new File(System.getenv("SCALA_LOG")))
+
+   def log(args: Any*) {
+     logFile.write(args.mkString(";"))
+     logFile.write("\n")
+   }
+
+   def main(args: Array[String]) {
+     val conf = new SparkConf().setAppName("Scala")
+     val sc = new SparkContext(conf)
+
+     val workers = System.getenv("WORKERS").toInt
+     val numbersCount = System.getenv("NUMBERS_COUNT").toInt
+     val textFile = System.getenv("TEXT_FILE")
+
+     val numbers = 0 until numbersCount
+     val floats = numbers.map(_.toDouble)
+     val strings = Source.fromFile(textFile).mkString.split("\n")
+
+
+     // =============================================================================
+     // Serialization
+     // =============================================================================
+
+     var time: Long = 0
+
+     time = System.currentTimeMillis
+     val rddNumbers = sc.parallelize(numbers, workers)
+     time = System.currentTimeMillis - time
+
+     log("NumbersSerialization", time/1000.0)
+
+
+     time = System.currentTimeMillis
+     val rddFloats = sc.parallelize(floats, workers)
+     time = System.currentTimeMillis - time
+
+     log("FloatsSerialization", time/1000.0)
+
+
+     time = System.currentTimeMillis
+     val rddStrings = sc.parallelize(strings, workers)
+     time = System.currentTimeMillis - time
+
+     log("StringsSerialization", time/1000.0)
+
+
+     // =============================================================================
+     // Computing
+     // =============================================================================
+
+     // --- Is prime? ---------------------------------------------------------------
+
+     time = System.currentTimeMillis
+     val primes = rddNumbers.map{ x =>
+       if(x < 2){
+         (x, false)
+       }
+       else if(x == 2){
+         (x, true)
+       }
+       else if(x % 2 == 0){
+         (x, false)
+       }
+       else{
+         val upper = math.sqrt(x.toDouble).toInt
+         var result = true
+
+         var i = 3
+         while(i <= upper && result){
+           if(x % i == 0){
+             result = false
+           }
+           else{
+             i += 2
+           }
+         }
+
+         (x, result)
+       }
+     }
+     primes.collect()
+     time = System.currentTimeMillis - time
+
+     log("IsPrime", time/1000.0)
+
+
+     // --- Matrix multiplication ---------------------------------------------------
+
+     val matrixSize = System.getenv("MATRIX_SIZE").toInt
+
+     val matrix = new Array[Array[Long]](matrixSize)
+
+     for( row <- 0 until matrixSize ) {
+       matrix(row) = new Array[Long](matrixSize)
+       for( col <- 0 until matrixSize ) {
+         matrix(row)(col) = row + col
+       }
+     }
+
+     time = System.currentTimeMillis
+     val rdd = sc.parallelize(matrix, 1)
+     rdd.mapPartitions { it =>
+       val matrix = it.toArray
+       val size = matrix.size
+
+       val newMatrix = new Array[Array[Long]](size)
+
+       for( row <- 0 until size ) {
+         newMatrix(row) = new Array[Long](size)
+         for( col <- 0 until size ) {
+
+           var result: Long = 0
+           for( i <- 0 until size ) {
+             result += matrix(row)(i) * matrix(col)(i)
+           }
+           newMatrix(row)(col) = result
+         }
+       }
+
+       newMatrix.toIterator
+     }.collect()
+     time = System.currentTimeMillis - time
+
+     log("MatrixMultiplication", time/1000.0)
+
+
+     // --- Pi digits ---------------------------------------------------------------
+     // http://rosettacode.org/wiki/Pi#Scala
+
+     val piDigit = System.getenv("PI_DIGIT").toInt
+
+     time = System.currentTimeMillis
+     val piDigits = sc.parallelize(Array(piDigit), 1)
+     piDigits.mapPartitions { it =>
+       var size = it.toArray.asInstanceOf[Array[Int]](0)
+       var result = ""
+
+       var r: BigInt = 0
+       var q, t, k: BigInt = 1
+       var n, l: BigInt = 3
+       var nr, nn: BigInt = 0
+
+       while(size > 0){
+         while((4*q+r-t) >= (n*t)){
+           nr = (2*q+r)*l
+           nn = (q*(7*k+2)+r*l)/(t*l)
+           q = q * k
+           t = t * l
+           l = l + 2
+           k = k + 1
+           n = nn
+           r = nr
+         }
+
+         result += n.toString
+         size -= 1
+         nr = 10*(r-n*t)
+         n = ((10*(3*q+r))/t)-(10*n)
+         q = q * 10
+         r = nr
+       }
+
+       Iterator(result)
+     }.collect()
+     time = System.currentTimeMillis - time
+
+     log("PiDigit", time/1000.0)
+
+
+     sc.stop()
+     logFile.close()
+   }
+ }