ruby-spark 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +185 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +7 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/custom_marshal.rb +94 -0
  12. data/benchmark/digest.rb +150 -0
  13. data/benchmark/enumerator.rb +88 -0
  14. data/benchmark/performance/prepare.sh +18 -0
  15. data/benchmark/performance/python.py +156 -0
  16. data/benchmark/performance/r.r +69 -0
  17. data/benchmark/performance/ruby.rb +167 -0
  18. data/benchmark/performance/run-all.sh +160 -0
  19. data/benchmark/performance/scala.scala +181 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/ext/ruby_c/extconf.rb +3 -0
  27. data/ext/ruby_c/murmur.c +158 -0
  28. data/ext/ruby_c/murmur.h +9 -0
  29. data/ext/ruby_c/ruby-spark.c +18 -0
  30. data/ext/ruby_java/Digest.java +36 -0
  31. data/ext/ruby_java/Murmur2.java +98 -0
  32. data/ext/ruby_java/RubySparkExtService.java +28 -0
  33. data/ext/ruby_java/extconf.rb +3 -0
  34. data/ext/spark/build.sbt +73 -0
  35. data/ext/spark/project/plugins.sbt +9 -0
  36. data/ext/spark/sbt/sbt +34 -0
  37. data/ext/spark/src/main/scala/Exec.scala +91 -0
  38. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  39. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  40. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  41. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  42. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  43. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  44. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  46. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  47. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  48. data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
  49. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  50. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  51. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  52. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  53. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  54. data/lib/ruby-spark.rb +1 -0
  55. data/lib/spark.rb +198 -0
  56. data/lib/spark/accumulator.rb +260 -0
  57. data/lib/spark/broadcast.rb +98 -0
  58. data/lib/spark/build.rb +43 -0
  59. data/lib/spark/cli.rb +169 -0
  60. data/lib/spark/command.rb +86 -0
  61. data/lib/spark/command/base.rb +154 -0
  62. data/lib/spark/command/basic.rb +345 -0
  63. data/lib/spark/command/pair.rb +124 -0
  64. data/lib/spark/command/sort.rb +51 -0
  65. data/lib/spark/command/statistic.rb +144 -0
  66. data/lib/spark/command_builder.rb +141 -0
  67. data/lib/spark/command_validator.rb +34 -0
  68. data/lib/spark/config.rb +244 -0
  69. data/lib/spark/constant.rb +14 -0
  70. data/lib/spark/context.rb +304 -0
  71. data/lib/spark/error.rb +50 -0
  72. data/lib/spark/ext/hash.rb +41 -0
  73. data/lib/spark/ext/integer.rb +25 -0
  74. data/lib/spark/ext/io.rb +57 -0
  75. data/lib/spark/ext/ip_socket.rb +29 -0
  76. data/lib/spark/ext/module.rb +58 -0
  77. data/lib/spark/ext/object.rb +24 -0
  78. data/lib/spark/ext/string.rb +24 -0
  79. data/lib/spark/helper.rb +10 -0
  80. data/lib/spark/helper/logger.rb +40 -0
  81. data/lib/spark/helper/parser.rb +85 -0
  82. data/lib/spark/helper/serialize.rb +71 -0
  83. data/lib/spark/helper/statistic.rb +93 -0
  84. data/lib/spark/helper/system.rb +42 -0
  85. data/lib/spark/java_bridge.rb +19 -0
  86. data/lib/spark/java_bridge/base.rb +203 -0
  87. data/lib/spark/java_bridge/jruby.rb +23 -0
  88. data/lib/spark/java_bridge/rjb.rb +41 -0
  89. data/lib/spark/logger.rb +76 -0
  90. data/lib/spark/mllib.rb +100 -0
  91. data/lib/spark/mllib/classification/common.rb +31 -0
  92. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  93. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  94. data/lib/spark/mllib/classification/svm.rb +135 -0
  95. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  96. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  97. data/lib/spark/mllib/matrix.rb +120 -0
  98. data/lib/spark/mllib/regression/common.rb +73 -0
  99. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  100. data/lib/spark/mllib/regression/lasso.rb +100 -0
  101. data/lib/spark/mllib/regression/linear.rb +124 -0
  102. data/lib/spark/mllib/regression/ridge.rb +97 -0
  103. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  104. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  105. data/lib/spark/mllib/stat/distribution.rb +12 -0
  106. data/lib/spark/mllib/vector.rb +185 -0
  107. data/lib/spark/rdd.rb +1328 -0
  108. data/lib/spark/sampler.rb +92 -0
  109. data/lib/spark/serializer.rb +24 -0
  110. data/lib/spark/serializer/base.rb +170 -0
  111. data/lib/spark/serializer/cartesian.rb +37 -0
  112. data/lib/spark/serializer/marshal.rb +19 -0
  113. data/lib/spark/serializer/message_pack.rb +25 -0
  114. data/lib/spark/serializer/oj.rb +25 -0
  115. data/lib/spark/serializer/pair.rb +27 -0
  116. data/lib/spark/serializer/utf8.rb +25 -0
  117. data/lib/spark/sort.rb +189 -0
  118. data/lib/spark/stat_counter.rb +125 -0
  119. data/lib/spark/storage_level.rb +39 -0
  120. data/lib/spark/version.rb +3 -0
  121. data/lib/spark/worker/master.rb +144 -0
  122. data/lib/spark/worker/spark_files.rb +15 -0
  123. data/lib/spark/worker/worker.rb +197 -0
  124. data/ruby-spark.gemspec +36 -0
  125. data/spec/generator.rb +37 -0
  126. data/spec/inputs/lorem_300.txt +316 -0
  127. data/spec/inputs/numbers/1.txt +50 -0
  128. data/spec/inputs/numbers/10.txt +50 -0
  129. data/spec/inputs/numbers/11.txt +50 -0
  130. data/spec/inputs/numbers/12.txt +50 -0
  131. data/spec/inputs/numbers/13.txt +50 -0
  132. data/spec/inputs/numbers/14.txt +50 -0
  133. data/spec/inputs/numbers/15.txt +50 -0
  134. data/spec/inputs/numbers/16.txt +50 -0
  135. data/spec/inputs/numbers/17.txt +50 -0
  136. data/spec/inputs/numbers/18.txt +50 -0
  137. data/spec/inputs/numbers/19.txt +50 -0
  138. data/spec/inputs/numbers/2.txt +50 -0
  139. data/spec/inputs/numbers/20.txt +50 -0
  140. data/spec/inputs/numbers/3.txt +50 -0
  141. data/spec/inputs/numbers/4.txt +50 -0
  142. data/spec/inputs/numbers/5.txt +50 -0
  143. data/spec/inputs/numbers/6.txt +50 -0
  144. data/spec/inputs/numbers/7.txt +50 -0
  145. data/spec/inputs/numbers/8.txt +50 -0
  146. data/spec/inputs/numbers/9.txt +50 -0
  147. data/spec/inputs/numbers_0_100.txt +101 -0
  148. data/spec/inputs/numbers_1_100.txt +100 -0
  149. data/spec/lib/collect_spec.rb +42 -0
  150. data/spec/lib/command_spec.rb +68 -0
  151. data/spec/lib/config_spec.rb +64 -0
  152. data/spec/lib/context_spec.rb +163 -0
  153. data/spec/lib/ext_spec.rb +72 -0
  154. data/spec/lib/external_apps_spec.rb +45 -0
  155. data/spec/lib/filter_spec.rb +80 -0
  156. data/spec/lib/flat_map_spec.rb +100 -0
  157. data/spec/lib/group_spec.rb +109 -0
  158. data/spec/lib/helper_spec.rb +19 -0
  159. data/spec/lib/key_spec.rb +41 -0
  160. data/spec/lib/manipulation_spec.rb +114 -0
  161. data/spec/lib/map_partitions_spec.rb +87 -0
  162. data/spec/lib/map_spec.rb +91 -0
  163. data/spec/lib/mllib/classification_spec.rb +54 -0
  164. data/spec/lib/mllib/clustering_spec.rb +35 -0
  165. data/spec/lib/mllib/matrix_spec.rb +32 -0
  166. data/spec/lib/mllib/regression_spec.rb +116 -0
  167. data/spec/lib/mllib/vector_spec.rb +77 -0
  168. data/spec/lib/reduce_by_key_spec.rb +118 -0
  169. data/spec/lib/reduce_spec.rb +131 -0
  170. data/spec/lib/sample_spec.rb +46 -0
  171. data/spec/lib/serializer_spec.rb +13 -0
  172. data/spec/lib/sort_spec.rb +58 -0
  173. data/spec/lib/statistic_spec.rb +168 -0
  174. data/spec/lib/whole_text_files_spec.rb +33 -0
  175. data/spec/spec_helper.rb +39 -0
  176. metadata +301 -0
data/benchmark/custom_marshal.rb
@@ -0,0 +1,94 @@
+ require 'benchmark'
+ require 'benchmark/ips'
+
+ def pack_int(data)
+   [data].pack('l>')
+ end
+
+ def pack_long(data)
+   [data].pack('q>')
+ end
+
+ def pack_doubles(data)
+   data.pack('G*')
+ end
+
+ module Standard
+   class LabeledPoint
+     def initialize(label, features)
+       @label = label
+       @features = Standard::Vector.new(features)
+     end
+
+     def marshal_dump
+       [@label, @features]
+     end
+
+     def marshal_load(*)
+     end
+   end
+
+   class Vector
+     def initialize(array)
+       @values = array
+     end
+
+     def marshal_dump
+       [@values]
+     end
+
+     def marshal_load(*)
+     end
+   end
+ end
+
+ module Custom
+   class LabeledPoint
+     def initialize(label, features)
+       @label = label
+       @features = Custom::Vector.new(features)
+     end
+
+     def _dump(*)
+       pack_long(@label) + @features._dump
+     end
+
+     def self._load(*)
+     end
+   end
+
+   class Vector
+     def initialize(array)
+       @values = array
+     end
+
+     def _dump(*)
+       result = 'v'
+       result << pack_int(@values.size)
+       result << pack_doubles(@values)
+       result.encode(Encoding::ASCII_8BIT)
+     end
+
+     def self._load(*)
+     end
+   end
+ end
+
+ data_size = 10_000
+ vector_size = 1_000
+ values = Array.new(vector_size) { |x| rand(10_000..100_000) }
+
+ @data1 = Array.new(data_size) {|i| Standard::LabeledPoint.new(i, values)}
+ @data2 = Array.new(data_size) {|i| Custom::LabeledPoint.new(i, values)}
+
+ Benchmark.ips do |r|
+   r.report('standard') do
+     Marshal.dump(@data1)
+   end
+
+   r.report('custom') do
+     Marshal.dump(@data2)
+   end
+
+   r.compare!
+ end
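
The file above benchmarks Ruby's two serialization hooks against each other: `marshal_dump`/`marshal_load`, where Marshal still encodes the returned array generically, and `_dump`/`self._load`, where the class emits its own packed byte string. A minimal, self-contained sketch of the same idea (illustrative only, not code from the gem):

    require 'benchmark/ips'  # same benchmarking gem the file above uses

    # Generic hook: Marshal encodes the returned array itself.
    class GenericPoint
      def initialize(values)
        @values = values
      end

      def marshal_dump
        [@values]
      end

      def marshal_load(data)
        @values = data.first
      end
    end

    # Custom hook: the class produces and parses its own packed bytes.
    class PackedPoint
      def initialize(values)
        @values = values
      end

      def _dump(_level)
        [@values.size].pack('l>') + @values.pack('G*')
      end

      def self._load(bytes)
        size = bytes[0, 4].unpack('l>').first
        new(bytes[4, size * 8].unpack('G*'))
      end
    end

    values  = Array.new(1_000) { rand * 100 }
    generic = GenericPoint.new(values)
    packed  = PackedPoint.new(values)

    # Round-trip check: the custom format restores the same number of values.
    p Marshal.load(Marshal.dump(packed)).instance_variable_get(:@values).size  # => 1000

    Benchmark.ips do |x|
      x.report('marshal_dump') { Marshal.dump(generic) }
      x.report('_dump')        { Marshal.dump(packed) }
      x.compare!
    end

The point being measured is that the packed form writes one contiguous byte string instead of letting Marshal tag every Float in the vector individually.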
data/benchmark/digest.rb
@@ -0,0 +1,150 @@
+ lib = File.expand_path(File.dirname(__FILE__) + '/../lib')
+ $LOAD_PATH.unshift(lib) if File.directory?(lib) && !$LOAD_PATH.include?(lib)
+
+ def java?
+   RUBY_PLATFORM =~ /java/
+ end
+
+ unless java?
+   require 'murmurhash3'
+ end
+
+ require 'digest'
+ require 'benchmark'
+ require 'ruby-spark'
+
+ TEST = 5_000_000
+ WORDS = ["wefwefwef", "rgwefiwefwe", "a", "rujfwgrethrzjrhgawf", "irncrnuggo"]
+
+ puts "TEST COUNT = #{TEST*WORDS.size}"
+
+ # =================================================================================================
+ # Pure ruby murmur
+ # funny-falcon/murmurhash3-ruby
+
+ MASK32 = 0xffffffff
+
+ def murmur3_32_rotl(x, r)
+   ((x << r) | (x >> (32 - r))) & MASK32
+ end
+
+ def murmur3_32_fmix(h)
+   h &= MASK32
+   h ^= h >> 16
+   h = (h * 0x85ebca6b) & MASK32
+   h ^= h >> 13
+   h = (h * 0xc2b2ae35) & MASK32
+   h ^ (h >> 16)
+ end
+
+ def murmur3_32__mmix(k1)
+   k1 = (k1 * 0xcc9e2d51) & MASK32
+   k1 = murmur3_32_rotl(k1, 15)
+   (k1 * 0x1b873593) & MASK32
+ end
+
+ def murmur3_32_str_hash(str, seed=0)
+   h1 = seed
+   numbers = str.unpack('V*C*')
+   tailn = str.bytesize % 4
+   tail = numbers.slice!(numbers.size - tailn, tailn)
+   for k1 in numbers
+     h1 ^= murmur3_32__mmix(k1)
+     h1 = murmur3_32_rotl(h1, 13)
+     h1 = (h1*5 + 0xe6546b64) & MASK32
+   end
+
+   unless tail.empty?
+     k1 = 0
+     tail.reverse_each do |c1|
+       k1 = (k1 << 8) | c1
+     end
+     h1 ^= murmur3_32__mmix(k1)
+   end
+
+   h1 ^= str.bytesize
+   murmur3_32_fmix(h1)
+ end
+
+
+ # =================================================================================================
+ # Benchmark
+
+ Benchmark.bm(18) do |x|
+
+   x.report("ruby hash"){
+     TEST.times{
+       WORDS.each{ |word|
+         word.hash
+       }
+     }
+   }
+
+   x.report("ext portable"){
+     TEST.times{
+       WORDS.each{ |word|
+         Spark::Digest.portable_hash(word)
+       }
+     }
+   }
+
+   x.report("murmur3 32"){
+     TEST.times{
+       WORDS.each{ |word|
+         # MurmurHash3::V128.str_hash(word)
+         # [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>")
+         # MurmurHash3::V128.str_hash(word)
+         # a = MurmurHash3::V32.str_hash(word).to_s
+         # a.slice!(0,8)
+
+         MurmurHash3::V32.str_hash(word)
+       }
+     }
+   } unless java?
+
+   # Too slow
+   # x.report("murmur3 32 (ruby)"){
+   #   TEST.times{
+   #     WORDS.each{ |word|
+   #       # MurmurHash3::V128.str_hash(word)
+   #       # [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>")
+   #       # MurmurHash3::V128.str_hash(word)
+   #       # a = murmur3_32_str_hash(word).to_s
+   #       # a.slice!(0,8)
+
+   #       murmur3_32_str_hash(word)
+   #     }
+   #   }
+   # }
+
+   x.report("murmur3 128"){
+     TEST.times{
+       WORDS.each{ |word|
+         # MurmurHash3::V128.str_hash(word)
+         # [MurmurHash3::V128.str_hash(word).join.to_i].pack("q>")
+         # a = MurmurHash3::V128.str_hash(word).to_s
+         # a.slice!(0,8)
+
+         MurmurHash3::V128.str_hash(word)
+       }
+     }
+   } unless java?
+
+   # x.report("sha256"){
+   #   TEST.times{
+   #     WORDS.each{ |word|
+   #       a = Digest::SHA256.digest(word)
+   #       # a.slice!(0,8)
+   #     }
+   #   }
+   # }
+
+   # x.report("md5"){
+   #   TEST.times{
+   #     WORDS.each{ |word|
+   #       a = Digest::MD5.digest(word)
+   #       # a.slice!(0,8)
+   #     }
+   #   }
+   # }
+ end
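
A likely motivation for this comparison: plain `Object#hash` is randomly seeded per Ruby process, so separate worker processes do not agree on `word.hash`, while `Spark::Digest.portable_hash` (backed by the Murmur code in ext/ruby_c and ext/ruby_java) is process-stable. A small sketch of why that matters for partitioning, using the stdlib `Digest` only as a stand-in for any deterministic hash; the helper below is hypothetical, not part of the gem:

    require 'digest'

    # Hypothetical helper: assign a key to one of `num_partitions` buckets
    # using a hash that every process computes identically. Object#hash would
    # not work here because its seed changes on every Ruby process start.
    def partition_for(key, num_partitions)
      stable = Digest::MD5.digest(key.to_s)[0, 8].unpack('q>').first
      stable % num_partitions  # Ruby's % is non-negative for a positive modulus
    end

    words = %w[wefwefwef rgwefiwefwe a rujfwgrethrzjrhgawf irncrnuggo]

    words.each do |word|
      puts "#{word.ljust(20)} -> partition #{partition_for(word, 4)}"
      # `word.hash % 4` could place the same word in different buckets
      # across separate runs or separate worker processes.
    end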
data/benchmark/enumerator.rb
@@ -0,0 +1,88 @@
+ require "benchmark"
+
+ class Enumerator
+   def defer(&blk)
+     self.class.new do |y|
+       each do |*input|
+         blk.call(y, *input)
+       end
+     end
+   end
+ end
+
+ ARRAY_SIZE = 50_000_000
+
+ def type_yield
+   return to_enum(__callee__) unless block_given?
+
+   ARRAY_SIZE.times { |i|
+     yield i
+   }
+ end
+
+ def yield_map_x2(enum)
+   return to_enum(__callee__, enum) unless block_given?
+
+   enum.each do |item|
+     yield item*2
+   end
+ end
+
+ def type_enumerator_new
+   Enumerator.new do |e|
+     ARRAY_SIZE.times { |i|
+       e << i
+     }
+   end
+ end
+
+ def enumerator_new_map_x2(enum)
+   Enumerator.new do |e|
+     enum.each do |item|
+       e << item*2
+     end
+   end
+ end
+
+ def enumerator_defer_x2(enum)
+   enum.defer do |out, inp|
+     out << inp*2
+   end
+ end
+
+ Benchmark.bm(26) do |x|
+   x.report("yield max") do
+     type_yield.max
+   end
+
+   x.report("yield sum") do
+     type_yield.reduce(:+)
+   end
+
+   x.report("yield map x*2 sum") do
+     yield_map_x2(type_yield).reduce(:+)
+   end
+
+   x.report("yield defer map x*2 sum") do
+     enumerator_defer_x2(type_yield).reduce(:+)
+   end
+
+   x.report("-----"){}
+
+   x.report("Enum.new max") do
+     type_enumerator_new.max
+   end
+
+   x.report("Enum.new sum") do
+     type_enumerator_new.reduce(:+)
+   end
+
+   x.report("Enum.new map x*2 sum") do
+     enumerator_new_map_x2(type_enumerator_new).reduce(:+)
+   end
+
+   x.report("Enum.new defer map x*2 sum") do
+     enumerator_defer_x2(type_enumerator_new).reduce(:+)
+   end
+
+ end
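
The `Enumerator#defer` patch benchmarked above is a lazy-pipeline building block: it wraps one enumerator in another so each element flows through a transformation as it is produced, with no intermediate array. A short sketch of how such deferred stages compose (the `defer` method is copied from the file above; the chaining example itself is illustrative, not from the gem):

    # Same patch as in the benchmark: `blk` decides what gets pushed
    # downstream for every element yielded by the wrapped enumerator.
    class Enumerator
      def defer(&blk)
        self.class.new do |y|
          each do |*input|
            blk.call(y, *input)
          end
        end
      end
    end

    numbers = (1..10).each  # a plain Enumerator over the range

    # Two deferred stages: double every element, then keep multiples of 4.
    # Nothing runs until a terminal call such as to_a or reduce.
    doubled  = numbers.defer { |out, x| out << x * 2 }
    filtered = doubled.defer { |out, x| out << x if x % 4 == 0 }

    p filtered.to_a        # => [4, 8, 12, 16, 20]
    p filtered.reduce(:+)  # => 60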
data/benchmark/performance/prepare.sh
@@ -0,0 +1,18 @@
+ #!/usr/bin/env bash
+
+ # Current dir
+ cd "$(dirname "$0")"
+
+ # Exit immediately if a pipeline returns a non-zero status.
+ set -e
+
+ # Spark
+ wget "http://d3kbcqa49mib13.cloudfront.net/spark-1.3.0-bin-hadoop2.4.tgz" -O spark.tgz
+ tar xvzf spark.tgz
+ mv spark-1.3.0-bin-hadoop2.4 spark
+ rm spark.tgz
+
+ # RSpark (only for 1.3.0)
+ git clone git@github.com:amplab-extras/SparkR-pkg.git rspark
+ cd rspark
+ SPARK_VERSION=1.3.0 ./install-dev.sh
data/benchmark/performance/python.py
@@ -0,0 +1,156 @@
+ import os
+ import math
+ from time import time
+ from random import random
+ from operator import add
+ from pyspark import SparkContext
+
+ sc = SparkContext(appName="Python", master="local[*]")
+
+ log_file = open(os.environ.get('PYTHON_LOG'), 'w')
+
+ def log(*values):
+     values = map(lambda x: str(x), values)
+     log_file.write(';'.join(values))
+     log_file.write('\n')
+
+ workers = int(os.environ.get('WORKERS'))
+ numbers_count = int(os.environ.get('NUMBERS_COUNT'))
+ text_file = os.environ.get('TEXT_FILE')
+
+ numbers = range(numbers_count)
+ floats = [float(i) for i in numbers]
+ with open(text_file) as t:
+     strings = t.read().split("\n")
+
+
+ # =============================================================================
+ # Serialization
+ # =============================================================================
+
+ t = time()
+ rdd_numbers = sc.parallelize(numbers, workers)
+ t = time() - t
+ log('NumbersSerialization', t)
+
+
+ t = time()
+ rdd_floats = sc.parallelize(floats, workers)
+ t = time() - t
+ log('FloatsSerialization', t)
+
+
+ t = time()
+ rdd_strings = sc.parallelize(strings, workers)
+ t = time() - t
+ log('StringsSerialization', t)
+
+
+ # =============================================================================
+ # Computing
+ # =============================================================================
+
+
+ # --- Is prime? ---------------------------------------------------------------
+
+ def is_prime(x):
+     if x < 2:
+         return [x, False]
+     elif x == 2:
+         return [x, True]
+     elif x % 2 == 0:
+         return [x, False]
+     else:
+         upper = int(math.sqrt(float(x)))
+         result = True
+
+         i = 3
+         while i <= upper:
+             if x % i == 0:
+                 result = False
+                 break
+
+             i += 2
+
+         return [x, result]
+
+ t = time()
+ rdd_numbers.map(is_prime).collect()
+ t = time() - t
+
+ log('IsPrime', t)
+
+
+ # --- Matrix multiplication ---------------------------------------------------
+
+ matrix_size = int(os.environ.get('MATRIX_SIZE'))
+
+ matrix = []
+ for row in range(matrix_size):
+     matrix.append([])
+     for col in range(matrix_size):
+         matrix[row].append(row+col)
+
+ def multiplication_func(matrix):
+     matrix = list(matrix)
+     size = len(matrix)
+
+     new_matrix = []
+     for row in range(size):
+         new_matrix.append([])
+         for col in range(size):
+
+             result = 0
+             for i in range(size):
+                 result += matrix[row][i] * matrix[col][i]
+             new_matrix[row].append(result)
+
+     return new_matrix
+
+ t = time()
+ rdd = sc.parallelize(matrix, 1)
+ rdd.mapPartitions(multiplication_func).collect()
+ t = time() - t
+
+ log('MatrixMultiplication', t)
+
+
+ # --- Pi digits ---------------------------------------------------------------
+ # http://rosettacode.org/wiki/Pi#Python
+
+ pi_digit = int(os.environ.get('PI_DIGIT'))
+
+ def pi_func(size):
+     size = size.next()
+     result = ''
+
+     q, r, t, k, n, l = 1, 0, 1, 1, 3, 3
+     while size > 0:
+         if 4*q+r-t < n*t:
+             result += str(n)
+             size -= 1
+             nr = 10*(r-n*t)
+             n = ((10*(3*q+r))//t)-10*n
+             q *= 10
+             r = nr
+         else:
+             nr = (2*q+r)*l
+             nn = (q*(7*k)+2+(r*l))//(t*l)
+             q *= k
+             t *= l
+             l += 2
+             k += 1
+             n = nn
+             r = nr
+
+     return [result]
+
+ t = time()
+ rdd = sc.parallelize([pi_digit], 1)
+ rdd.mapPartitions(pi_func).collect()
+ t = time() - t
+
+ log('PiDigit', t)
+
+
+ log_file.close()
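
This file is the PySpark side of the language comparison driven by data/benchmark/performance/run-all.sh; the gem's own counterpart is data/benchmark/performance/ruby.rb, which is not reproduced in this excerpt. For reference, a framework-free Ruby transliteration of the `is_prime` predicate benchmarked above (illustrative only, not the gem's code):

    # Straight port of the Python is_prime above: returns a [number, boolean]
    # pair so results collect the same way the RDD map's results do.
    def is_prime(x)
      return [x, false] if x < 2
      return [x, true]  if x == 2
      return [x, false] if x.even?

      upper = Math.sqrt(x).to_i
      i = 3
      while i <= upper
        return [x, false] if (x % i).zero?
        i += 2
      end
      [x, true]
    end

    primes = (0..20).map { |n| is_prime(n) }.select { |_, prime| prime }.map(&:first)
    p primes  # => [2, 3, 5, 7, 11, 13, 17, 19]

In the benchmark itself the predicate is applied to a parallelized range of numbers and collected, mirroring the `rdd_numbers.map(is_prime).collect()` call above.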