ruby-spark 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +185 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +7 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/custom_marshal.rb +94 -0
  12. data/benchmark/digest.rb +150 -0
  13. data/benchmark/enumerator.rb +88 -0
  14. data/benchmark/performance/prepare.sh +18 -0
  15. data/benchmark/performance/python.py +156 -0
  16. data/benchmark/performance/r.r +69 -0
  17. data/benchmark/performance/ruby.rb +167 -0
  18. data/benchmark/performance/run-all.sh +160 -0
  19. data/benchmark/performance/scala.scala +181 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/ext/ruby_c/extconf.rb +3 -0
  27. data/ext/ruby_c/murmur.c +158 -0
  28. data/ext/ruby_c/murmur.h +9 -0
  29. data/ext/ruby_c/ruby-spark.c +18 -0
  30. data/ext/ruby_java/Digest.java +36 -0
  31. data/ext/ruby_java/Murmur2.java +98 -0
  32. data/ext/ruby_java/RubySparkExtService.java +28 -0
  33. data/ext/ruby_java/extconf.rb +3 -0
  34. data/ext/spark/build.sbt +73 -0
  35. data/ext/spark/project/plugins.sbt +9 -0
  36. data/ext/spark/sbt/sbt +34 -0
  37. data/ext/spark/src/main/scala/Exec.scala +91 -0
  38. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  39. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  40. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  41. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  42. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  43. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  44. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  46. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  47. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  48. data/ext/spark/src/main/scala/RubyRDD.scala +364 -0
  49. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  50. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  51. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  52. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  53. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  54. data/lib/ruby-spark.rb +1 -0
  55. data/lib/spark.rb +198 -0
  56. data/lib/spark/accumulator.rb +260 -0
  57. data/lib/spark/broadcast.rb +98 -0
  58. data/lib/spark/build.rb +43 -0
  59. data/lib/spark/cli.rb +169 -0
  60. data/lib/spark/command.rb +86 -0
  61. data/lib/spark/command/base.rb +154 -0
  62. data/lib/spark/command/basic.rb +345 -0
  63. data/lib/spark/command/pair.rb +124 -0
  64. data/lib/spark/command/sort.rb +51 -0
  65. data/lib/spark/command/statistic.rb +144 -0
  66. data/lib/spark/command_builder.rb +141 -0
  67. data/lib/spark/command_validator.rb +34 -0
  68. data/lib/spark/config.rb +244 -0
  69. data/lib/spark/constant.rb +14 -0
  70. data/lib/spark/context.rb +304 -0
  71. data/lib/spark/error.rb +50 -0
  72. data/lib/spark/ext/hash.rb +41 -0
  73. data/lib/spark/ext/integer.rb +25 -0
  74. data/lib/spark/ext/io.rb +57 -0
  75. data/lib/spark/ext/ip_socket.rb +29 -0
  76. data/lib/spark/ext/module.rb +58 -0
  77. data/lib/spark/ext/object.rb +24 -0
  78. data/lib/spark/ext/string.rb +24 -0
  79. data/lib/spark/helper.rb +10 -0
  80. data/lib/spark/helper/logger.rb +40 -0
  81. data/lib/spark/helper/parser.rb +85 -0
  82. data/lib/spark/helper/serialize.rb +71 -0
  83. data/lib/spark/helper/statistic.rb +93 -0
  84. data/lib/spark/helper/system.rb +42 -0
  85. data/lib/spark/java_bridge.rb +19 -0
  86. data/lib/spark/java_bridge/base.rb +203 -0
  87. data/lib/spark/java_bridge/jruby.rb +23 -0
  88. data/lib/spark/java_bridge/rjb.rb +41 -0
  89. data/lib/spark/logger.rb +76 -0
  90. data/lib/spark/mllib.rb +100 -0
  91. data/lib/spark/mllib/classification/common.rb +31 -0
  92. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  93. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  94. data/lib/spark/mllib/classification/svm.rb +135 -0
  95. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  96. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  97. data/lib/spark/mllib/matrix.rb +120 -0
  98. data/lib/spark/mllib/regression/common.rb +73 -0
  99. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  100. data/lib/spark/mllib/regression/lasso.rb +100 -0
  101. data/lib/spark/mllib/regression/linear.rb +124 -0
  102. data/lib/spark/mllib/regression/ridge.rb +97 -0
  103. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  104. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  105. data/lib/spark/mllib/stat/distribution.rb +12 -0
  106. data/lib/spark/mllib/vector.rb +185 -0
  107. data/lib/spark/rdd.rb +1328 -0
  108. data/lib/spark/sampler.rb +92 -0
  109. data/lib/spark/serializer.rb +24 -0
  110. data/lib/spark/serializer/base.rb +170 -0
  111. data/lib/spark/serializer/cartesian.rb +37 -0
  112. data/lib/spark/serializer/marshal.rb +19 -0
  113. data/lib/spark/serializer/message_pack.rb +25 -0
  114. data/lib/spark/serializer/oj.rb +25 -0
  115. data/lib/spark/serializer/pair.rb +27 -0
  116. data/lib/spark/serializer/utf8.rb +25 -0
  117. data/lib/spark/sort.rb +189 -0
  118. data/lib/spark/stat_counter.rb +125 -0
  119. data/lib/spark/storage_level.rb +39 -0
  120. data/lib/spark/version.rb +3 -0
  121. data/lib/spark/worker/master.rb +144 -0
  122. data/lib/spark/worker/spark_files.rb +15 -0
  123. data/lib/spark/worker/worker.rb +197 -0
  124. data/ruby-spark.gemspec +36 -0
  125. data/spec/generator.rb +37 -0
  126. data/spec/inputs/lorem_300.txt +316 -0
  127. data/spec/inputs/numbers/1.txt +50 -0
  128. data/spec/inputs/numbers/10.txt +50 -0
  129. data/spec/inputs/numbers/11.txt +50 -0
  130. data/spec/inputs/numbers/12.txt +50 -0
  131. data/spec/inputs/numbers/13.txt +50 -0
  132. data/spec/inputs/numbers/14.txt +50 -0
  133. data/spec/inputs/numbers/15.txt +50 -0
  134. data/spec/inputs/numbers/16.txt +50 -0
  135. data/spec/inputs/numbers/17.txt +50 -0
  136. data/spec/inputs/numbers/18.txt +50 -0
  137. data/spec/inputs/numbers/19.txt +50 -0
  138. data/spec/inputs/numbers/2.txt +50 -0
  139. data/spec/inputs/numbers/20.txt +50 -0
  140. data/spec/inputs/numbers/3.txt +50 -0
  141. data/spec/inputs/numbers/4.txt +50 -0
  142. data/spec/inputs/numbers/5.txt +50 -0
  143. data/spec/inputs/numbers/6.txt +50 -0
  144. data/spec/inputs/numbers/7.txt +50 -0
  145. data/spec/inputs/numbers/8.txt +50 -0
  146. data/spec/inputs/numbers/9.txt +50 -0
  147. data/spec/inputs/numbers_0_100.txt +101 -0
  148. data/spec/inputs/numbers_1_100.txt +100 -0
  149. data/spec/lib/collect_spec.rb +42 -0
  150. data/spec/lib/command_spec.rb +68 -0
  151. data/spec/lib/config_spec.rb +64 -0
  152. data/spec/lib/context_spec.rb +163 -0
  153. data/spec/lib/ext_spec.rb +72 -0
  154. data/spec/lib/external_apps_spec.rb +45 -0
  155. data/spec/lib/filter_spec.rb +80 -0
  156. data/spec/lib/flat_map_spec.rb +100 -0
  157. data/spec/lib/group_spec.rb +109 -0
  158. data/spec/lib/helper_spec.rb +19 -0
  159. data/spec/lib/key_spec.rb +41 -0
  160. data/spec/lib/manipulation_spec.rb +114 -0
  161. data/spec/lib/map_partitions_spec.rb +87 -0
  162. data/spec/lib/map_spec.rb +91 -0
  163. data/spec/lib/mllib/classification_spec.rb +54 -0
  164. data/spec/lib/mllib/clustering_spec.rb +35 -0
  165. data/spec/lib/mllib/matrix_spec.rb +32 -0
  166. data/spec/lib/mllib/regression_spec.rb +116 -0
  167. data/spec/lib/mllib/vector_spec.rb +77 -0
  168. data/spec/lib/reduce_by_key_spec.rb +118 -0
  169. data/spec/lib/reduce_spec.rb +131 -0
  170. data/spec/lib/sample_spec.rb +46 -0
  171. data/spec/lib/serializer_spec.rb +13 -0
  172. data/spec/lib/sort_spec.rb +58 -0
  173. data/spec/lib/statistic_spec.rb +168 -0
  174. data/spec/lib/whole_text_files_spec.rb +33 -0
  175. data/spec/spec_helper.rb +39 -0
  176. metadata +301 -0
data/lib/spark/command_validator.rb
@@ -0,0 +1,34 @@
+ module Spark
+   module CommandValidator
+
+     def validate(value, options)
+       validate_type(value, options[:type])
+     end
+
+     def valid?(value, options)
+       begin
+         validate(value, options)
+         return true
+       rescue
+         return false
+       end
+     end
+
+     def validate_type(value, types)
+       types = [types] if !types.is_a?(Array)
+
+       types.each do |type|
+         return if value.is_a?(type)
+       end
+
+       error "Value: #{value} should be a #{types.join(' or ')} but is #{value.class}."
+     end
+
+     def validate_size(array1, array2)
+       if array1.size != array2.size
+         error "Wrong number of arguments (#{array1.size} for #{array2.size})"
+       end
+     end
+
+   end
+ end
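
The validator above only checks types and argument counts; it delegates failure reporting to an `error` method that the including class is expected to provide (presumably Spark::Command::Base in data/lib/spark/command/base.rb). A minimal sketch of how the mixin might be exercised, using a hypothetical including class that simply raises:

  require 'ruby-spark'

  class FakeCommand
    include Spark::CommandValidator

    # The mixin reports problems through `error`; raise for illustration.
    def error(message)
      raise ArgumentError, message
    end
  end

  cmd = FakeCommand.new
  cmd.valid?(10, type: Integer)           # => true
  cmd.valid?(10, type: [String, Symbol])  # => false
  cmd.validate_size([1, 2], [1, 2, 3])    # raises ArgumentError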
data/lib/spark/config.rb
@@ -0,0 +1,244 @@
+ # Necessary libraries
+ Spark.load_lib
+
+ module Spark
+   # Common configuration for RubySpark and Spark
+   class Config
+
+     include Spark::Helper::System
+
+     TYPES = {
+       'spark.shuffle.spill' => :boolean,
+       'spark.ruby.batch_size' => :integer
+     }
+
+     # Initialize the Java SparkConf and load the default configuration.
+     def initialize
+       @spark_conf = SparkConf.new(true)
+       set_default
+     end
+
+     def from_file(file)
+       check_read_only
+
+       if file && File.exist?(file)
+         file = File.expand_path(file)
+         RubyUtils.loadPropertiesFile(spark_conf, file)
+       end
+     end
+
+     def [](key)
+       get(key)
+     end
+
+     def []=(key, value)
+       set(key, value)
+     end
+
+     def spark_conf
+       if Spark.started?
+         # Get the latest configuration
+         Spark.context.jcontext.conf
+       else
+         @spark_conf
+       end
+     end
+
+     def valid!
+       errors = []
+
+       if !contains?('spark.app.name')
+         errors << 'An application name must be set in your configuration.'
+       end
+
+       if !contains?('spark.master')
+         errors << 'A master URL must be set in your configuration.'
+       end
+
+       if Spark::Serializer.get(get('spark.ruby.serializer')).nil?
+         errors << 'Default serializer must be set in your configuration.'
+       end
+
+       scanned = get('spark.ruby.executor.command').scan('%s')
+
+       if scanned.size == 0
+         errors << "Executor command must contain '%s'."
+       end
+
+       if scanned.size > 1
+         errors << "Executor command can contain only one '%s'."
+       end
+
+       if errors.any?
+         errors.map!{|error| "- #{error}"}
+
+         raise Spark::ConfigurationError, "Configuration is not valid:\r\n#{errors.join("\r\n")}"
+       end
+     end
+
+     def read_only?
+       Spark.started?
+     end
+
+     # Rescue from NoSuchElementException
+     def get(key)
+       value = spark_conf.get(key.to_s)
+
+       case TYPES[key]
+       when :boolean
+         parse_boolean(value)
+       when :integer
+         parse_integer(value)
+       else
+         value
+       end
+     rescue
+       nil
+     end
+
+     def get_all
+       Hash[spark_conf.getAll.map{|tuple| [tuple._1, tuple._2]}]
+     end
+
+     def contains?(key)
+       spark_conf.contains(key.to_s)
+     end
+
+     def set(key, value)
+       check_read_only
+       spark_conf.set(key.to_s, value.to_s)
+     end
+
+     def set_app_name(name)
+       set('spark.app.name', name)
+     end
+
+     def set_master(master)
+       set('spark.master', master)
+     end
+
+     def parse_boolean(value)
+       case value
+       when 'true'
+         true
+       when 'false'
+         false
+       end
+     end
+
+     def parse_integer(value)
+       value.to_i
+     end
+
+     # =============================================================================
+     # Defaults
+
+     def set_default
+       set_app_name('RubySpark')
+       set_master('local[*]')
+       set('spark.ruby.driver_home', Spark.home)
+       set('spark.ruby.parallelize_strategy', default_parallelize_strategy)
+       set('spark.ruby.serializer', default_serializer)
+       set('spark.ruby.batch_size', default_batch_size)
+       set('spark.ruby.executor.uri', default_executor_uri)
+       set('spark.ruby.executor.command', default_executor_command)
+       set('spark.ruby.executor.options', default_executor_options)
+       set('spark.ruby.worker.type', default_worker_type)
+       load_executor_envs
+     end
+
+     # How data are handled in the parallelize method.
+     #
+     # == Possible options:
+     # inplace:: data are changed directly to save memory
+     # deep_copy:: data are cloned first
+     #
+     def default_parallelize_strategy
+       ENV['SPARK_RUBY_PARALLELIZE_STRATEGY'] || 'inplace'
+     end
+
+     def default_serializer
+       ENV['SPARK_RUBY_SERIALIZER'] || Spark::Serializer::DEFAULT_SERIALIZER_NAME
+     end
+
+     def default_batch_size
+       ENV['SPARK_RUBY_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE.to_s
+     end
+
+     # Ruby executor.
+     #
+     # == Options:
+     # nil::
+     #   The system's gem is loaded (ruby-spark).
+     #
+     # other::
+     #   Path of the library which will be used.
+     #   The current ruby-spark gem is used.
+     #   (default)
+     #
+     def default_executor_uri
+       ENV['SPARK_RUBY_EXECUTOR_URI'] || ''
+     end
+
+     # Command template which is applied when Scala wants to create a Ruby
+     # process (e.g. master, home request). The command is represented by '%s'.
+     #
+     # == Example:
+     #   bash --norc -i -c "export HOME=/home/user; cd; source .bashrc; %s"
+     #
+     def default_executor_command
+       ENV['SPARK_RUBY_EXECUTOR_COMMAND'] || '%s'
+     end
+
+     # Options for every worker.
+     #
+     # == Examples:
+     #   -J-Xmx512m
+     #
+     def default_executor_options
+       ENV['SPARK_RUBY_EXECUTOR_OPTIONS'] || ''
+     end
+
+     # Type of worker.
+     #
+     # == Options:
+     # process:: (default)
+     # thread:: (experimental)
+     #
+     def default_worker_type
+       ENV['SPARK_RUBY_WORKER_TYPE'] || 'process'
+     end
+
+     # Load environment variables for the executor from ENV.
+     #
+     # == Examples:
+     #   SPARK_RUBY_EXECUTOR_ENV_KEY1="1"
+     #   SPARK_RUBY_EXECUTOR_ENV_KEY2="2"
+     #
+     def load_executor_envs
+       prefix = 'SPARK_RUBY_EXECUTOR_ENV_'
+
+       envs = ENV.select{|key, _| key.start_with?(prefix)}
+       envs.each do |key, value|
+         key = key.dup # ENV keys are frozen
+         key.slice!(0, prefix.size)
+
+         set("spark.ruby.executor.env.#{key}", value)
+       end
+     end
+
+     # Aliases
+     alias_method :getAll, :get_all
+     alias_method :setAppName, :set_app_name
+     alias_method :setMaster, :set_master
+
+     private
+
+     def check_read_only
+       if read_only?
+         raise Spark::ConfigurationError, 'Configuration is read only'
+       end
+     end
+
+   end
+ end
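
A short driver-side sketch of the accessors defined above. It assumes Spark.config returns this Spark::Config instance (the Context constructor below calls Spark.config.valid! in exactly this way) and that no context has been started yet, since set refuses to run once the configuration becomes read-only:

  require 'ruby-spark'

  config = Spark.config
  config.set_app_name('WordCount')        # spark.app.name
  config.set_master('local[2]')           # spark.master
  config['spark.ruby.batch_size'] = 1024  # stored as a String
  config['spark.ruby.batch_size']         # => 1024, cast back via TYPES
  config.valid!                           # raises Spark::ConfigurationError when required keys are missing

Defaults can also come from the environment (SPARK_RUBY_SERIALIZER, SPARK_RUBY_WORKER_TYPE, any SPARK_RUBY_EXECUTOR_ENV_* variable, and so on), which set_default and load_executor_envs read when the object is created.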
data/lib/spark/constant.rb
@@ -0,0 +1,14 @@
+ module Spark
+   # Common constants for Ruby and Spark
+   module Constant
+     DATA_EOF = -2
+     WORKER_ERROR = -1
+     WORKER_DONE = 0
+     CREATE_WORKER = 1
+     KILL_WORKER = 2
+     KILL_WORKER_AND_WAIT = 3
+     SUCCESSFULLY_KILLED = 4
+     UNSUCCESSFUL_KILLING = 5
+     ACCUMULATOR_ACK = 6
+   end
+ end
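
Judging by the file list, these codes are the control protocol shared by the Scala side (RubyWorker.scala) and the Ruby master/worker processes (data/lib/spark/worker/*). Purely as an illustration of how the values are meant to be matched on, not the gem's actual socket loop:

  require 'ruby-spark'
  include Spark::Constant

  # Hypothetical dispatcher for signals arriving from the Scala side.
  def handle_signal(code)
    case code
    when CREATE_WORKER        then :spawn_worker
    when KILL_WORKER          then :kill_worker
    when KILL_WORKER_AND_WAIT then :kill_worker_and_wait
    when DATA_EOF             then :finish_stream
    else :ignore
    end
  end

  handle_signal(1) # => :spawn_worker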
data/lib/spark/context.rb
@@ -0,0 +1,304 @@
+ # Necessary libraries
+ Spark.load_lib
+
+ module Spark
+   # Main entry point for Spark functionality. A SparkContext represents the connection to a Spark
+   # cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster.
+   #
+   class Context
+
+     include Spark::Helper::System
+     include Spark::Helper::Parser
+     include Spark::Helper::Logger
+
+     attr_reader :jcontext, :jaccumulator, :temp_dir
+
+     # Constructor for the Ruby context. Configuration is taken automatically
+     # from Spark. The config is set to defaults if the user starts the
+     # context first.
+     #
+     def initialize
+       Spark.config.valid!
+       @jcontext = JavaSparkContext.new(Spark.config.spark_conf)
+       @jcontext.addJar(Spark.ruby_spark_jar)
+
+       # Does not work on Spark 1.2
+       # ui.attachTab(RubyTab.new(ui, to_java_hash(RbConfig::CONFIG)))
+
+       spark_local_dir = JUtils.getLocalDir(sc.conf)
+       @temp_dir = JUtils.createTempDir(spark_local_dir, 'ruby').getAbsolutePath
+
+       accum_server = Spark::Accumulator::Server
+       accum_server.start
+       @jaccumulator = @jcontext.accumulator(ArrayList.new, RubyAccumulatorParam.new(accum_server.host, accum_server.port))
+
+       log_info("Ruby accumulator server is running on port #{accum_server.port}")
+
+       set_call_site('Ruby') # description of stage
+     end
+
+     def stop
+       Spark::Accumulator::Server.stop
+       log_info('Ruby accumulator server was stopped')
+       @jcontext.stop
+     end
+
+     def sc
+       @jcontext.sc
+     end
+
+     def ui
+       sc.ui
+     end
+
+     # Default level of parallelism to use when not given by the user (e.g. parallelize and makeRDD)
+     #
+     def default_parallelism
+       sc.defaultParallelism
+     end
+
+     def get_serializer(serializer, *args)
+       serializer = Spark::Serializer.get(serializer)
+       serializer ||= Spark::Serializer.get(config['spark.ruby.serializer'])
+       serializer.new(config['spark.ruby.batch_size']).set(*args)
+     end
+
+     # Set a local property that affects jobs submitted from this thread, such as the
+     # Spark fair scheduler pool.
+     #
+     def set_local_property(key, value)
+       jcontext.setLocalProperty(key, value)
+     end
+
+     # Get a local property set in this thread, or nil if it is missing
+     #
+     def get_local_property(key)
+       jcontext.getLocalProperty(key)
+     end
+
+     # Support function for API backtraces.
+     #
+     def set_call_site(site)
+       set_local_property('externalCallSite', site)
+     end
+
+     # Capture the current user callsite and return a formatted version for printing. If the user
+     # has overridden the call site, this will return the user's version.
+     #
+     def get_call_site
+       jcontext.getCallSite
+     end
+
+     # Return a copy of this SparkContext's configuration. The configuration *cannot*
+     # be changed at runtime.
+     #
+     def config(key=nil)
+       # if key
+       #   Spark.config[key]
+       # else
+       #   Spark.config.get_all
+       # end
+       Spark.config
+     end
+
+     # Add a file to be downloaded with this Spark job on every node.
+     # The path passed can be either a local file, a file in HDFS
+     # (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI.
+     #
+     # To access the file in Spark jobs, use `SparkFiles.get(file_name)` with the
+     # filename to find its download location.
+     #
+     # == Example:
+     #   `echo 10 > test.txt`
+     #
+     #   $sc.add_file('test.txt')
+     #   $sc.parallelize(0..5).map(lambda{|x| x * SparkFiles.get_content('test.txt').to_i}).collect
+     #   # => [0, 10, 20, 30, 40, 50]
+     #
+     def add_file(*files)
+       files.each do |file|
+         sc.addFile(file)
+       end
+     end
+
+     # Broadcast a read-only variable to the cluster, returning a Spark::Broadcast
+     # object for reading it in distributed functions. The variable will
+     # be sent to each cluster only once.
+     #
+     # == Example:
+     #   broadcast1 = $sc.broadcast('a')
+     #   broadcast2 = $sc.broadcast('b')
+     #
+     #   rdd = $sc.parallelize(0..5, 4)
+     #   rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2)
+     #   rdd = rdd.map_partitions_with_index(lambda{|part, index| [broadcast1.value * index, broadcast2.value * index] })
+     #   rdd.collect
+     #   # => ["", "", "a", "b", "aa", "bb", "aaa", "bbb"]
+     #
+     def broadcast(value)
+       Spark::Broadcast.new(self, value)
+     end
+
+     # Create an Accumulator with the given initial value, using a given
+     # accum_param helper object to define how to add values of the
+     # data type if provided.
+     #
+     # == Example:
+     #   accum = $sc.accumulator(7)
+     #
+     #   rdd = $sc.parallelize(0..5, 4)
+     #   rdd = rdd.bind(accum: accum)
+     #   rdd = rdd.map_partitions(lambda{|_| accum.add(1) })
+     #   rdd = rdd.collect
+     #
+     #   accum.value
+     #   # => 11
+     #
+     def accumulator(value, accum_param=:+, zero_value=0)
+       Spark::Accumulator.new(value, accum_param, zero_value)
+     end
+
+     # Distribute a local Ruby collection to form an RDD.
+     # The direct method can be slow, so be careful: this method updates the data in place.
+     #
+     # == Parameters:
+     # data:: Range or Array
+     # num_slices:: number of slices
+     # options::
+     #   - use
+     #   - serializer
+     #   - batch_size
+     #
+     # == Examples:
+     #   $sc.parallelize(["1", "2", "3"]).map(lambda{|x| x.to_i}).collect
+     #   #=> [1, 2, 3]
+     #
+     #   $sc.parallelize(1..3).map(:to_s).collect
+     #   #=> ["1", "2", "3"]
+     #
+     def parallelize(data, num_slices=nil, options={})
+       num_slices ||= default_parallelism
+
+       # use = jruby? ? (options[:use] || :direct) : :file
+       use = :file
+       serializer = get_serializer(options[:serializer], options[:batch_size])
+
+       if data.is_a?(Array) && config['spark.ruby.parallelize_strategy'] == 'deep_copy'
+         data = data.deep_copy
+       else
+         # For enumerator or range
+         data = data.to_a
+       end
+
+       case use
+       when :direct
+         serializer.dump_to_java(data)
+         jrdd = jcontext.parallelize(data, num_slices)
+       when :file
+         file = Tempfile.new('to_parallelize', temp_dir)
+         serializer.dump(data, file)
+         file.close # not unlink
+         jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)
+         file.unlink
+       end
+
+       Spark::RDD.new(jrdd, self, serializer)
+     end
+
+     # Read a text file from HDFS, a local file system (available on all nodes), or any
+     # Hadoop-supported file system URI, and return it as an RDD of Strings.
+     #
+     # == Example:
+     #   f = Tempfile.new("test")
+     #   f.puts("1")
+     #   f.puts("2")
+     #   f.close
+     #
+     #   $sc.text_file(f.path).map(lambda{|x| x.to_i}).collect
+     #   # => [1, 2]
+     #
+     def text_file(path, min_partitions=nil, options={})
+       min_partitions ||= default_parallelism
+       serializer = get_serializer(options[:serializer], options[:batch_size])
+
+       Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, get_serializer('UTF8'))
+     end
+
+     # Read a directory of text files from HDFS, a local file system (available on all nodes), or any
+     # Hadoop-supported file system URI. Each file is read as a single record and returned in a
+     # key-value pair, where the key is the path of each file and the value is the content of each file.
+     #
+     # == Example:
+     #   dir = Dir.mktmpdir
+     #   f1 = Tempfile.new("test1", dir)
+     #   f2 = Tempfile.new("test2", dir)
+     #   f1.puts("1"); f1.puts("2");
+     #   f2.puts("3"); f2.puts("4");
+     #   f1.close
+     #   f2.close
+     #
+     #   $sc.whole_text_files(dir).flat_map(lambda{|key, value| value.split}).collect
+     #   # => ["1", "2", "3", "4"]
+     #
+     def whole_text_files(path, min_partitions=nil, options={})
+       min_partitions ||= default_parallelism
+       serializer = get_serializer(options[:serializer], options[:batch_size])
+       deserializer = get_serializer('Pair', get_serializer('UTF8'), get_serializer('UTF8'))
+
+       Spark::RDD.new(@jcontext.wholeTextFiles(path, min_partitions), self, serializer, deserializer)
+     end
+
+     # Executes the given partition function f on the specified set of partitions,
+     # returning the result as an array of elements.
+     #
+     # If partitions is not specified, this will run over all partitions.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..10, 5, batch_size: 1)
+     #   $sc.run_job(rdd, lambda{|x| x.to_s}, [0,2])
+     #   # => ["[0, 1]", "[4, 5]"]
+     #
+     def run_job(rdd, f, partitions=nil, allow_local=false)
+       run_job_with_command(rdd, partitions, allow_local, Spark::Command::MapPartitions, f)
+     end
+
+     # Execute the given command on a specific set of partitions.
+     #
+     def run_job_with_command(rdd, partitions, allow_local, command, *args)
+       if !partitions.nil? && !partitions.is_a?(Array)
+         raise Spark::ContextError, 'Partitions must be nil or Array'
+       end
+
+       partitions_size = rdd.partitions_size
+
+       # Execute all parts
+       if partitions.nil?
+         partitions = (0...partitions_size).to_a
+       end
+
+       # Can happen when you use coalesce
+       partitions.delete_if {|part| part >= partitions_size}
+
+       # Rjb represents Fixnum as Integer but JRuby as Long
+       partitions = to_java_array_list(convert_to_java_int(partitions))
+
+       mapped = rdd.new_rdd_from_command(command, *args)
+       iterator = PythonRDD.runJob(rdd.context.sc, mapped.jrdd, partitions, allow_local)
+       mapped.collect_from_iterator(iterator)
+     end
+
+
+     # Aliases
+     alias_method :textFile, :text_file
+     alias_method :wholeTextFiles, :whole_text_files
+     alias_method :defaultParallelism, :default_parallelism
+     alias_method :setLocalProperty, :set_local_property
+     alias_method :getLocalProperty, :get_local_property
+     alias_method :setCallSite, :set_call_site
+     alias_method :getCallSite, :get_call_site
+     alias_method :runJob, :run_job
+     alias_method :runJobWithCommand, :run_job_with_command
+     alias_method :addFile, :add_file
+
+   end
+ end
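
To tie the pieces together, here is a sketch of driver-side usage assembled from the inline doc comments above. Spark.start is assumed to be the gem's entry point that boots the JVM bridge and builds the context; the inline examples use a global $sc for the Spark::Context instance:

  require 'ruby-spark'

  Spark.start
  $sc = Spark.context

  # Distribute a local collection and transform it (see #parallelize).
  $sc.parallelize(["1", "2", "3"]).map(lambda{|x| x.to_i}).collect
  # => [1, 2, 3]

  # Read a text file back as integers (see #text_file).
  $sc.text_file('numbers.txt').map(lambda{|x| x.to_i}).collect

  # Ship a read-only value to every worker (see #broadcast).
  hundred = $sc.broadcast(100)
  rdd = $sc.parallelize(0..3, 2).bind(hundred: hundred)
  rdd.map(lambda{|x| x * hundred.value}).collect
  # => [0, 100, 200, 300]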