ruby-spark 1.0.0

Files changed (176)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +185 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +7 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/custom_marshal.rb +94 -0
  12. data/benchmark/digest.rb +150 -0
  13. data/benchmark/enumerator.rb +88 -0
  14. data/benchmark/performance/prepare.sh +18 -0
  15. data/benchmark/performance/python.py +156 -0
  16. data/benchmark/performance/r.r +69 -0
  17. data/benchmark/performance/ruby.rb +167 -0
  18. data/benchmark/performance/run-all.sh +160 -0
  19. data/benchmark/performance/scala.scala +181 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/ext/ruby_c/extconf.rb +3 -0
  27. data/ext/ruby_c/murmur.c +158 -0
  28. data/ext/ruby_c/murmur.h +9 -0
  29. data/ext/ruby_c/ruby-spark.c +18 -0
  30. data/ext/ruby_java/Digest.java +36 -0
  31. data/ext/ruby_java/Murmur2.java +98 -0
  32. data/ext/ruby_java/RubySparkExtService.java +28 -0
  33. data/ext/ruby_java/extconf.rb +3 -0
  34. data/ext/spark/build.sbt +73 -0
  35. data/ext/spark/project/plugins.sbt +9 -0
  36. data/ext/spark/sbt/sbt +34 -0
  37. data/ext/spark/src/main/scala/Exec.scala +91 -0
  38. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  39. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  40. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  41. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  42. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  43. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  44. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  46. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  47. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  48. data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
  49. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  50. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  51. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  52. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  53. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  54. data/lib/ruby-spark.rb +1 -0
  55. data/lib/spark.rb +198 -0
  56. data/lib/spark/accumulator.rb +260 -0
  57. data/lib/spark/broadcast.rb +98 -0
  58. data/lib/spark/build.rb +43 -0
  59. data/lib/spark/cli.rb +169 -0
  60. data/lib/spark/command.rb +86 -0
  61. data/lib/spark/command/base.rb +154 -0
  62. data/lib/spark/command/basic.rb +345 -0
  63. data/lib/spark/command/pair.rb +124 -0
  64. data/lib/spark/command/sort.rb +51 -0
  65. data/lib/spark/command/statistic.rb +144 -0
  66. data/lib/spark/command_builder.rb +141 -0
  67. data/lib/spark/command_validator.rb +34 -0
  68. data/lib/spark/config.rb +244 -0
  69. data/lib/spark/constant.rb +14 -0
  70. data/lib/spark/context.rb +304 -0
  71. data/lib/spark/error.rb +50 -0
  72. data/lib/spark/ext/hash.rb +41 -0
  73. data/lib/spark/ext/integer.rb +25 -0
  74. data/lib/spark/ext/io.rb +57 -0
  75. data/lib/spark/ext/ip_socket.rb +29 -0
  76. data/lib/spark/ext/module.rb +58 -0
  77. data/lib/spark/ext/object.rb +24 -0
  78. data/lib/spark/ext/string.rb +24 -0
  79. data/lib/spark/helper.rb +10 -0
  80. data/lib/spark/helper/logger.rb +40 -0
  81. data/lib/spark/helper/parser.rb +85 -0
  82. data/lib/spark/helper/serialize.rb +71 -0
  83. data/lib/spark/helper/statistic.rb +93 -0
  84. data/lib/spark/helper/system.rb +42 -0
  85. data/lib/spark/java_bridge.rb +19 -0
  86. data/lib/spark/java_bridge/base.rb +203 -0
  87. data/lib/spark/java_bridge/jruby.rb +23 -0
  88. data/lib/spark/java_bridge/rjb.rb +41 -0
  89. data/lib/spark/logger.rb +76 -0
  90. data/lib/spark/mllib.rb +100 -0
  91. data/lib/spark/mllib/classification/common.rb +31 -0
  92. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  93. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  94. data/lib/spark/mllib/classification/svm.rb +135 -0
  95. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  96. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  97. data/lib/spark/mllib/matrix.rb +120 -0
  98. data/lib/spark/mllib/regression/common.rb +73 -0
  99. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  100. data/lib/spark/mllib/regression/lasso.rb +100 -0
  101. data/lib/spark/mllib/regression/linear.rb +124 -0
  102. data/lib/spark/mllib/regression/ridge.rb +97 -0
  103. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  104. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  105. data/lib/spark/mllib/stat/distribution.rb +12 -0
  106. data/lib/spark/mllib/vector.rb +185 -0
  107. data/lib/spark/rdd.rb +1328 -0
  108. data/lib/spark/sampler.rb +92 -0
  109. data/lib/spark/serializer.rb +24 -0
  110. data/lib/spark/serializer/base.rb +170 -0
  111. data/lib/spark/serializer/cartesian.rb +37 -0
  112. data/lib/spark/serializer/marshal.rb +19 -0
  113. data/lib/spark/serializer/message_pack.rb +25 -0
  114. data/lib/spark/serializer/oj.rb +25 -0
  115. data/lib/spark/serializer/pair.rb +27 -0
  116. data/lib/spark/serializer/utf8.rb +25 -0
  117. data/lib/spark/sort.rb +189 -0
  118. data/lib/spark/stat_counter.rb +125 -0
  119. data/lib/spark/storage_level.rb +39 -0
  120. data/lib/spark/version.rb +3 -0
  121. data/lib/spark/worker/master.rb +144 -0
  122. data/lib/spark/worker/spark_files.rb +15 -0
  123. data/lib/spark/worker/worker.rb +197 -0
  124. data/ruby-spark.gemspec +36 -0
  125. data/spec/generator.rb +37 -0
  126. data/spec/inputs/lorem_300.txt +316 -0
  127. data/spec/inputs/numbers/1.txt +50 -0
  128. data/spec/inputs/numbers/10.txt +50 -0
  129. data/spec/inputs/numbers/11.txt +50 -0
  130. data/spec/inputs/numbers/12.txt +50 -0
  131. data/spec/inputs/numbers/13.txt +50 -0
  132. data/spec/inputs/numbers/14.txt +50 -0
  133. data/spec/inputs/numbers/15.txt +50 -0
  134. data/spec/inputs/numbers/16.txt +50 -0
  135. data/spec/inputs/numbers/17.txt +50 -0
  136. data/spec/inputs/numbers/18.txt +50 -0
  137. data/spec/inputs/numbers/19.txt +50 -0
  138. data/spec/inputs/numbers/2.txt +50 -0
  139. data/spec/inputs/numbers/20.txt +50 -0
  140. data/spec/inputs/numbers/3.txt +50 -0
  141. data/spec/inputs/numbers/4.txt +50 -0
  142. data/spec/inputs/numbers/5.txt +50 -0
  143. data/spec/inputs/numbers/6.txt +50 -0
  144. data/spec/inputs/numbers/7.txt +50 -0
  145. data/spec/inputs/numbers/8.txt +50 -0
  146. data/spec/inputs/numbers/9.txt +50 -0
  147. data/spec/inputs/numbers_0_100.txt +101 -0
  148. data/spec/inputs/numbers_1_100.txt +100 -0
  149. data/spec/lib/collect_spec.rb +42 -0
  150. data/spec/lib/command_spec.rb +68 -0
  151. data/spec/lib/config_spec.rb +64 -0
  152. data/spec/lib/context_spec.rb +163 -0
  153. data/spec/lib/ext_spec.rb +72 -0
  154. data/spec/lib/external_apps_spec.rb +45 -0
  155. data/spec/lib/filter_spec.rb +80 -0
  156. data/spec/lib/flat_map_spec.rb +100 -0
  157. data/spec/lib/group_spec.rb +109 -0
  158. data/spec/lib/helper_spec.rb +19 -0
  159. data/spec/lib/key_spec.rb +41 -0
  160. data/spec/lib/manipulation_spec.rb +114 -0
  161. data/spec/lib/map_partitions_spec.rb +87 -0
  162. data/spec/lib/map_spec.rb +91 -0
  163. data/spec/lib/mllib/classification_spec.rb +54 -0
  164. data/spec/lib/mllib/clustering_spec.rb +35 -0
  165. data/spec/lib/mllib/matrix_spec.rb +32 -0
  166. data/spec/lib/mllib/regression_spec.rb +116 -0
  167. data/spec/lib/mllib/vector_spec.rb +77 -0
  168. data/spec/lib/reduce_by_key_spec.rb +118 -0
  169. data/spec/lib/reduce_spec.rb +131 -0
  170. data/spec/lib/sample_spec.rb +46 -0
  171. data/spec/lib/serializer_spec.rb +13 -0
  172. data/spec/lib/sort_spec.rb +58 -0
  173. data/spec/lib/statistic_spec.rb +168 -0
  174. data/spec/lib/whole_text_files_spec.rb +33 -0
  175. data/spec/spec_helper.rb +39 -0
  176. metadata +301 -0
data/lib/spark/command_validator.rb
@@ -0,0 +1,34 @@
+ module Spark
+   module CommandValidator
+
+     def validate(value, options)
+       validate_type(value, options[:type])
+     end
+
+     def valid?(value, options)
+       begin
+         validate(value, options)
+         return true
+       rescue
+         return false
+       end
+     end
+
+     def validate_type(value, types)
+       types = [types] if !types.is_a?(Array)
+
+       types.each do |type|
+         return if value.is_a?(type)
+       end
+
+       error "Value: #{value} should be a #{types.join(' or ')} but is #{value.class}."
+     end
+
+     def validate_size(array1, array2)
+       if array1.size != array2.size
+         error "Wrong number of arguments (#{array1.size} for #{array2.size})"
+       end
+     end
+
+   end
+ end
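
A minimal usage sketch of the mixin above. Note that `error` is not defined in this module (in the gem it is expected to come from the including command class), so the host class below supplies a hypothetical one:

  class MyCommand
    include Spark::CommandValidator

    # Hypothetical error handler; the real one lives in the including command class.
    def error(message)
      raise ArgumentError, message
    end
  end

  cmd = MyCommand.new
  cmd.valid?(42, type: Integer)            # => true
  cmd.valid?('42', type: [Integer, Float]) # => false (validate_type calls error, which raises)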
data/lib/spark/config.rb
@@ -0,0 +1,244 @@
+ # Necessary libraries
+ Spark.load_lib
+
+ module Spark
+   # Common configuration for RubySpark and Spark
+   class Config
+
+     include Spark::Helper::System
+
+     TYPES = {
+       'spark.shuffle.spill' => :boolean,
+       'spark.ruby.batch_size' => :integer
+     }
+
+     # Initialize java SparkConf and load default configuration.
+     def initialize
+       @spark_conf = SparkConf.new(true)
+       set_default
+     end
+
+     def from_file(file)
+       check_read_only
+
+       if file && File.exist?(file)
+         file = File.expand_path(file)
+         RubyUtils.loadPropertiesFile(spark_conf, file)
+       end
+     end
+
+     def [](key)
+       get(key)
+     end
+
+     def []=(key, value)
+       set(key, value)
+     end
+
+     def spark_conf
+       if Spark.started?
+         # Get latest configuration
+         Spark.context.jcontext.conf
+       else
+         @spark_conf
+       end
+     end
+
+     def valid!
+       errors = []
+
+       if !contains?('spark.app.name')
+         errors << 'An application name must be set in your configuration.'
+       end
+
+       if !contains?('spark.master')
+         errors << 'A master URL must be set in your configuration.'
+       end
+
+       if Spark::Serializer.get(get('spark.ruby.serializer')).nil?
+         errors << 'Default serializer must be set in your configuration.'
+       end
+
+       scanned = get('spark.ruby.executor.command').scan('%s')
+
+       if scanned.size == 0
+         errors << "Executor command must contain '%s'."
+       end
+
+       if scanned.size > 1
+         errors << "Executor command can contain only one '%s'."
+       end
+
+       if errors.any?
+         errors.map!{|error| "- #{error}"}
+
+         raise Spark::ConfigurationError, "Configuration is not valid:\r\n#{errors.join("\r\n")}"
+       end
+     end
+
+     def read_only?
+       Spark.started?
+     end
+
+     # Rescue from NoSuchElementException
+     def get(key)
+       value = spark_conf.get(key.to_s)
+
+       case TYPES[key]
+       when :boolean
+         parse_boolean(value)
+       when :integer
+         parse_integer(value)
+       else
+         value
+       end
+     rescue
+       nil
+     end
+
+     def get_all
+       Hash[spark_conf.getAll.map{|tuple| [tuple._1, tuple._2]}]
+     end
+
+     def contains?(key)
+       spark_conf.contains(key.to_s)
+     end
+
+     def set(key, value)
+       check_read_only
+       spark_conf.set(key.to_s, value.to_s)
+     end
+
+     def set_app_name(name)
+       set('spark.app.name', name)
+     end
+
+     def set_master(master)
+       set('spark.master', master)
+     end
+
+     def parse_boolean(value)
+       case value
+       when 'true'
+         true
+       when 'false'
+         false
+       end
+     end
+
+     def parse_integer(value)
+       value.to_i
+     end
+
+     # =============================================================================
+     # Defaults
+
+     def set_default
+       set_app_name('RubySpark')
+       set_master('local[*]')
+       set('spark.ruby.driver_home', Spark.home)
+       set('spark.ruby.parallelize_strategy', default_parallelize_strategy)
+       set('spark.ruby.serializer', default_serializer)
+       set('spark.ruby.batch_size', default_batch_size)
+       set('spark.ruby.executor.uri', default_executor_uri)
+       set('spark.ruby.executor.command', default_executor_command)
+       set('spark.ruby.executor.options', default_executor_options)
+       set('spark.ruby.worker.type', default_worker_type)
+       load_executor_envs
+     end
+
+     # How to handle data in the parallelize method.
+     #
+     # == Possible options:
+     # inplace::   data are changed directly to save memory
+     # deep_copy:: data are cloned first
+     #
+     def default_parallelize_strategy
+       ENV['SPARK_RUBY_PARALLELIZE_STRATEGY'] || 'inplace'
+     end
+
+     def default_serializer
+       ENV['SPARK_RUBY_SERIALIZER'] || Spark::Serializer::DEFAULT_SERIALIZER_NAME
+     end
+
+     def default_batch_size
+       ENV['SPARK_RUBY_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE.to_s
+     end
+
+     # Ruby executor.
+     #
+     # == Options:
+     # nil::
+     #   The system gem (ruby-spark) is loaded.
+     #
+     # other::
+     #   Path of the library which will be used.
+     #   The current ruby-spark gem is used.
+     #   (default)
+     #
+     def default_executor_uri
+       ENV['SPARK_RUBY_EXECUTOR_URI'] || ''
+     end
+
+     # Command template which is applied when the Scala side wants to create a Ruby
+     # process (e.g. master, home request). The command is represented by '%s'.
+     #
+     # == Example:
+     #   bash --norc -i -c "export HOME=/home/user; cd; source .bashrc; %s"
+     #
+     def default_executor_command
+       ENV['SPARK_RUBY_EXECUTOR_COMMAND'] || '%s'
+     end
+
+     # Options for every worker.
+     #
+     # == Examples:
+     #   -J-Xmx512m
+     #
+     def default_executor_options
+       ENV['SPARK_RUBY_EXECUTOR_OPTIONS'] || ''
+     end
+
+     # Type of worker.
+     #
+     # == Options:
+     # process:: (default)
+     # thread:: (experimental)
+     #
+     def default_worker_type
+       ENV['SPARK_RUBY_WORKER_TYPE'] || 'process'
+     end
+
+     # Load environment variables for executor from ENV.
+     #
+     # == Examples:
+     #   SPARK_RUBY_EXECUTOR_ENV_KEY1="1"
+     #   SPARK_RUBY_EXECUTOR_ENV_KEY2="2"
+     #
+     def load_executor_envs
+       prefix = 'SPARK_RUBY_EXECUTOR_ENV_'
+
+       envs = ENV.select{|key, _| key.start_with?(prefix)}
+       envs.each do |key, value|
+         key = key.dup # ENV keys are frozen
+         key.slice!(0, prefix.size)
+
+         set("spark.ruby.executor.env.#{key}", value)
+       end
+     end
+
+     # Aliases
+     alias_method :getAll, :get_all
+     alias_method :setAppName, :set_app_name
+     alias_method :setMaster, :set_master
+
+     private
+
+     def check_read_only
+       if read_only?
+         raise Spark::ConfigurationError, 'Configuration is read only'
+       end
+     end
+
+   end
+ end
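
A short, hedged sketch of driving this Config API before the context starts. It assumes `Spark.config` (defined in lib/spark.rb, not shown in this hunk) returns the shared Spark::Config instance that Spark::Context validates in its constructor:

  config = Spark.config

  config['spark.app.name'] = 'MyApp'   # []= delegates to #set
  config.set_master('local[2]')        # same as config['spark.master'] = 'local[2]'
  config['spark.ruby.batch_size']      # returned as an Integer via the TYPES coercion in #get
  config.valid!                        # raises Spark::ConfigurationError if required keys are missing

Once `Spark.started?` is true the configuration becomes read-only: any further #set goes through check_read_only and raises Spark::ConfigurationError.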
data/lib/spark/constant.rb
@@ -0,0 +1,14 @@
+ module Spark
+   # Common constants for Ruby and Spark
+   module Constant
+     DATA_EOF = -2
+     WORKER_ERROR = -1
+     WORKER_DONE = 0
+     CREATE_WORKER = 1
+     KILL_WORKER = 2
+     KILL_WORKER_AND_WAIT = 3
+     SUCCESSFULLY_KILLED = 4
+     UNSUCCESSFUL_KILLING = 5
+     ACCUMULATOR_ACK = 6
+   end
+ end
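
These constants appear to form the small control protocol shared by the Scala side (RubyWorker.scala) and the Ruby worker master. A hypothetical dispatch sketch, not taken from the gem (the real handling lives in lib/spark/worker/master.rb; spawn_worker and kill_worker are made-up helpers):

  def handle_signal(signal)
    case signal
    when Spark::Constant::CREATE_WORKER        then spawn_worker              # made-up helper
    when Spark::Constant::KILL_WORKER          then kill_worker(wait: false)  # made-up helper
    when Spark::Constant::KILL_WORKER_AND_WAIT then kill_worker(wait: true)
    when Spark::Constant::ACCUMULATOR_ACK      then nil                       # acknowledgement only
    else
      raise "Unknown signal: #{signal}"
    end
  end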
data/lib/spark/context.rb
@@ -0,0 +1,304 @@
+ # Necessary libraries
+ Spark.load_lib
+
+ module Spark
+   # Main entry point for Spark functionality. A SparkContext represents the connection to a Spark
+   # cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster.
+   #
+   class Context
+
+     include Spark::Helper::System
+     include Spark::Helper::Parser
+     include Spark::Helper::Logger
+
+     attr_reader :jcontext, :jaccumulator, :temp_dir
+
+     # Constructor for the Ruby context. Configuration is taken automatically
+     # from Spark. The config is set to the defaults if the user starts the
+     # context first.
+     #
+     def initialize
+       Spark.config.valid!
+       @jcontext = JavaSparkContext.new(Spark.config.spark_conf)
+       @jcontext.addJar(Spark.ruby_spark_jar)
+
+       # Does not work on 1.2
+       # ui.attachTab(RubyTab.new(ui, to_java_hash(RbConfig::CONFIG)))
+
+       spark_local_dir = JUtils.getLocalDir(sc.conf)
+       @temp_dir = JUtils.createTempDir(spark_local_dir, 'ruby').getAbsolutePath
+
+       accum_server = Spark::Accumulator::Server
+       accum_server.start
+       @jaccumulator = @jcontext.accumulator(ArrayList.new, RubyAccumulatorParam.new(accum_server.host, accum_server.port))
+
+       log_info("Ruby accumulator server is running on port #{accum_server.port}")
+
+       set_call_site('Ruby') # description of stage
+     end
+
+     def stop
+       Spark::Accumulator::Server.stop
+       log_info('Ruby accumulator server was stopped')
+       @jcontext.stop
+     end
+
+     def sc
+       @jcontext.sc
+     end
+
+     def ui
+       sc.ui
+     end
+
+     # Default level of parallelism to use when not given by the user (e.g. parallelize and makeRDD)
+     #
+     def default_parallelism
+       sc.defaultParallelism
+     end
+
+     def get_serializer(serializer, *args)
+       serializer = Spark::Serializer.get(serializer)
+       serializer ||= Spark::Serializer.get(config['spark.ruby.serializer'])
+       serializer.new(config['spark.ruby.batch_size']).set(*args)
+     end
+
+     # Set a local property that affects jobs submitted from this thread, such as the
+     # Spark fair scheduler pool.
+     #
+     def set_local_property(key, value)
+       jcontext.setLocalProperty(key, value)
+     end
+
+     # Get a local property set in this thread, or nil if it is missing
+     #
+     def get_local_property(key)
+       jcontext.getLocalProperty(key)
+     end
+
+     # Support function for API backtraces.
+     #
+     def set_call_site(site)
+       set_local_property('externalCallSite', site)
+     end
+
+     # Capture the current user callsite and return a formatted version for printing. If the user
+     # has overridden the call site, this will return the user's version.
+     #
+     def get_call_site
+       jcontext.getCallSite
+     end
+
+     # Return a copy of this SparkContext's configuration. The configuration *cannot*
+     # be changed at runtime.
+     #
+     def config(key=nil)
+       # if key
+       #   Spark.config[key]
+       # else
+       #   Spark.config.get_all
+       # end
+       Spark.config
+     end
+
+     # Add a file to be downloaded with this Spark job on every node.
+     # The path passed can be either a local file, a file in HDFS
+     # (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI.
+     #
+     # To access the file in Spark jobs, use `SparkFiles.get(file_name)` with the
+     # filename to find its download location.
+     #
+     # == Example:
+     #   `echo 10 > test.txt`
+     #
+     #   $sc.add_file('test.txt')
+     #   $sc.parallelize(0..5).map(lambda{|x| x * SparkFiles.get_content('test.txt').to_i}).collect
+     #   # => [0, 10, 20, 30, 40, 50]
+     #
+     def add_file(*files)
+       files.each do |file|
+         sc.addFile(file)
+       end
+     end
+
+     # Broadcast a read-only variable to the cluster, returning a Spark::Broadcast
+     # object for reading it in distributed functions. The variable will
+     # be sent to each cluster only once.
+     #
+     # == Example:
+     #   broadcast1 = $sc.broadcast('a')
+     #   broadcast2 = $sc.broadcast('b')
+     #
+     #   rdd = $sc.parallelize(0..5, 4)
+     #   rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2)
+     #   rdd = rdd.map_partitions_with_index(lambda{|part, index| [broadcast1.value * index, broadcast2.value * index] })
+     #   rdd.collect
+     #   # => ["", "", "a", "b", "aa", "bb", "aaa", "bbb"]
+     #
+     def broadcast(value)
+       Spark::Broadcast.new(self, value)
+     end
+
+     # Create an Accumulator with the given initial value, using a given
+     # accum_param helper object to define how to add values of the
+     # data type if provided.
+     #
+     # == Example:
+     #   accum = $sc.accumulator(7)
+     #
+     #   rdd = $sc.parallelize(0..5, 4)
+     #   rdd = rdd.bind(accum: accum)
+     #   rdd = rdd.map_partitions(lambda{|_| accum.add(1) })
+     #   rdd = rdd.collect
+     #
+     #   accum.value
+     #   # => 11
+     #
+     def accumulator(value, accum_param=:+, zero_value=0)
+       Spark::Accumulator.new(value, accum_param, zero_value)
+     end
+
+     # Distribute a local Ruby collection to form an RDD.
+     # The direct method can be slow, so be careful: this method updates data in place.
+     #
+     # == Parameters:
+     # data:: Range or Array
+     # num_slices:: number of slices
+     # options::
+     #   - use
+     #   - serializer
+     #   - batch_size
+     #
+     # == Examples:
+     #   $sc.parallelize(["1", "2", "3"]).map(lambda{|x| x.to_i}).collect
+     #   #=> [1, 2, 3]
+     #
+     #   $sc.parallelize(1..3).map(:to_s).collect
+     #   #=> ["1", "2", "3"]
+     #
+     def parallelize(data, num_slices=nil, options={})
+       num_slices ||= default_parallelism
+
+       # use = jruby? ? (options[:use] || :direct) : :file
+       use = :file
+       serializer = get_serializer(options[:serializer], options[:batch_size])
+
+       if data.is_a?(Array) && config['spark.ruby.parallelize_strategy'] == 'deep_copy'
+         data = data.deep_copy
+       else
+         # For enumerator or range
+         data = data.to_a
+       end
+
+       case use
+       when :direct
+         serializer.dump_to_java(data)
+         jrdd = jcontext.parallelize(data, num_slices)
+       when :file
+         file = Tempfile.new('to_parallelize', temp_dir)
+         serializer.dump(data, file)
+         file.close # not unlink
+         jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)
+         file.unlink
+       end
+
+       Spark::RDD.new(jrdd, self, serializer)
+     end
+
+     # Read a text file from HDFS, a local file system (available on all nodes), or any
+     # Hadoop-supported file system URI, and return it as an RDD of Strings.
+     #
+     # == Example:
+     #   f = Tempfile.new("test")
+     #   f.puts("1")
+     #   f.puts("2")
+     #   f.close
+     #
+     #   $sc.text_file(f.path).map(lambda{|x| x.to_i}).collect
+     #   # => [1, 2]
+     #
+     def text_file(path, min_partitions=nil, options={})
+       min_partitions ||= default_parallelism
+       serializer = get_serializer(options[:serializer], options[:batch_size])
+
+       Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, get_serializer('UTF8'))
+     end
+
+     # Read a directory of text files from HDFS, a local file system (available on all nodes), or any
+     # Hadoop-supported file system URI. Each file is read as a single record and returned in a
+     # key-value pair, where the key is the path of each file and the value is the content of each file.
+     #
+     # == Example:
+     #   dir = Dir.mktmpdir
+     #   f1 = Tempfile.new("test1", dir)
+     #   f2 = Tempfile.new("test2", dir)
+     #   f1.puts("1"); f1.puts("2");
+     #   f2.puts("3"); f2.puts("4");
+     #   f1.close
+     #   f2.close
+     #
+     #   $sc.whole_text_files(dir).flat_map(lambda{|key, value| value.split}).collect
+     #   # => ["1", "2", "3", "4"]
+     #
+     def whole_text_files(path, min_partitions=nil, options={})
+       min_partitions ||= default_parallelism
+       serializer = get_serializer(options[:serializer], options[:batch_size])
+       deserializer = get_serializer('Pair', get_serializer('UTF8'), get_serializer('UTF8'))
+
+       Spark::RDD.new(@jcontext.wholeTextFiles(path, min_partitions), self, serializer, deserializer)
+     end
+
+     # Executes the given partition function f on the specified set of partitions,
+     # returning the result as an array of elements.
+     #
+     # If partitions is not specified, this will run over all partitions.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..10, 5, batch_size: 1)
+     #   $sc.run_job(rdd, lambda{|x| x.to_s}, [0,2])
+     #   # => ["[0, 1]", "[4, 5]"]
+     #
+     def run_job(rdd, f, partitions=nil, allow_local=false)
+       run_job_with_command(rdd, partitions, allow_local, Spark::Command::MapPartitions, f)
+     end
+
+     # Execute the given command on a specific set of partitions.
+     #
+     def run_job_with_command(rdd, partitions, allow_local, command, *args)
+       if !partitions.nil? && !partitions.is_a?(Array)
+         raise Spark::ContextError, 'Partitions must be nil or Array'
+       end
+
+       partitions_size = rdd.partitions_size
+
+       # Execute all parts
+       if partitions.nil?
+         partitions = (0...partitions_size).to_a
+       end
+
+       # Can happen when you use coalesce
+       partitions.delete_if {|part| part >= partitions_size}
+
+       # Rjb represents Fixnum as Integer but JRuby as Long
+       partitions = to_java_array_list(convert_to_java_int(partitions))
+
+       mapped = rdd.new_rdd_from_command(command, *args)
+       iterator = PythonRDD.runJob(rdd.context.sc, mapped.jrdd, partitions, allow_local)
+       mapped.collect_from_iterator(iterator)
+     end
+
+
+     # Aliases
+     alias_method :textFile, :text_file
+     alias_method :wholeTextFiles, :whole_text_files
+     alias_method :defaultParallelism, :default_parallelism
+     alias_method :setLocalProperty, :set_local_property
+     alias_method :getLocalProperty, :get_local_property
+     alias_method :setCallSite, :set_call_site
+     alias_method :getCallSite, :get_call_site
+     alias_method :runJob, :run_job
+     alias_method :runJobWithCommand, :run_job_with_command
+     alias_method :addFile, :add_file
+
+   end
+ end
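
A hedged end-to-end sketch of the Context API above. `Spark.start` and `Spark.sc` come from lib/spark.rb, which is not shown in this diff, so treat the bootstrap lines as assumptions; the RDD calls (map, flat_map, reduce_by_key, collect) are taken from the doc examples in this file and the gem's spec file names:

  require 'ruby-spark'

  Spark.start          # assumed to validate the config and build a Spark::Context
  sc = Spark.sc        # assumed accessor for that context

  sc.parallelize(1..5, 2)
    .map(lambda{|x| x * x })
    .collect
  # => [1, 4, 9, 16, 25]

  sc.text_file('data.txt')
    .flat_map(lambda{|line| line.split })
    .map(lambda{|word| [word, 1] })
    .reduce_by_key(lambda{|a, b| a + b })
    .collect

  sc.stop              # stops the accumulator server and the underlying JavaSparkContext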