ruby-spark 1.1.0.1-java

Files changed (180)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +252 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +6 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/comparison/prepare.sh +18 -0
  12. data/benchmark/comparison/python.py +156 -0
  13. data/benchmark/comparison/r.r +69 -0
  14. data/benchmark/comparison/ruby.rb +167 -0
  15. data/benchmark/comparison/run-all.sh +160 -0
  16. data/benchmark/comparison/scala.scala +181 -0
  17. data/benchmark/custom_marshal.rb +94 -0
  18. data/benchmark/digest.rb +150 -0
  19. data/benchmark/enumerator.rb +88 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/example/website_search.rb +83 -0
  27. data/ext/ruby_c/extconf.rb +3 -0
  28. data/ext/ruby_c/murmur.c +158 -0
  29. data/ext/ruby_c/murmur.h +9 -0
  30. data/ext/ruby_c/ruby-spark.c +18 -0
  31. data/ext/ruby_java/Digest.java +36 -0
  32. data/ext/ruby_java/Murmur2.java +98 -0
  33. data/ext/ruby_java/RubySparkExtService.java +28 -0
  34. data/ext/ruby_java/extconf.rb +3 -0
  35. data/ext/spark/build.sbt +73 -0
  36. data/ext/spark/project/plugins.sbt +9 -0
  37. data/ext/spark/sbt/sbt +34 -0
  38. data/ext/spark/src/main/scala/Exec.scala +91 -0
  39. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  40. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  41. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  42. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  43. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  44. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  46. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  47. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  48. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  49. data/ext/spark/src/main/scala/RubyRDD.scala +392 -0
  50. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  51. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  52. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  53. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  54. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  55. data/lib/ruby-spark.rb +1 -0
  56. data/lib/spark.rb +198 -0
  57. data/lib/spark/accumulator.rb +260 -0
  58. data/lib/spark/broadcast.rb +98 -0
  59. data/lib/spark/build.rb +43 -0
  60. data/lib/spark/cli.rb +169 -0
  61. data/lib/spark/command.rb +86 -0
  62. data/lib/spark/command/base.rb +158 -0
  63. data/lib/spark/command/basic.rb +345 -0
  64. data/lib/spark/command/pair.rb +124 -0
  65. data/lib/spark/command/sort.rb +51 -0
  66. data/lib/spark/command/statistic.rb +144 -0
  67. data/lib/spark/command_builder.rb +141 -0
  68. data/lib/spark/command_validator.rb +34 -0
  69. data/lib/spark/config.rb +238 -0
  70. data/lib/spark/constant.rb +14 -0
  71. data/lib/spark/context.rb +322 -0
  72. data/lib/spark/error.rb +50 -0
  73. data/lib/spark/ext/hash.rb +41 -0
  74. data/lib/spark/ext/integer.rb +25 -0
  75. data/lib/spark/ext/io.rb +67 -0
  76. data/lib/spark/ext/ip_socket.rb +29 -0
  77. data/lib/spark/ext/module.rb +58 -0
  78. data/lib/spark/ext/object.rb +24 -0
  79. data/lib/spark/ext/string.rb +24 -0
  80. data/lib/spark/helper.rb +10 -0
  81. data/lib/spark/helper/logger.rb +40 -0
  82. data/lib/spark/helper/parser.rb +85 -0
  83. data/lib/spark/helper/serialize.rb +71 -0
  84. data/lib/spark/helper/statistic.rb +93 -0
  85. data/lib/spark/helper/system.rb +42 -0
  86. data/lib/spark/java_bridge.rb +19 -0
  87. data/lib/spark/java_bridge/base.rb +203 -0
  88. data/lib/spark/java_bridge/jruby.rb +23 -0
  89. data/lib/spark/java_bridge/rjb.rb +41 -0
  90. data/lib/spark/logger.rb +76 -0
  91. data/lib/spark/mllib.rb +100 -0
  92. data/lib/spark/mllib/classification/common.rb +31 -0
  93. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  94. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  95. data/lib/spark/mllib/classification/svm.rb +135 -0
  96. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  97. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  98. data/lib/spark/mllib/matrix.rb +120 -0
  99. data/lib/spark/mllib/regression/common.rb +73 -0
  100. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  101. data/lib/spark/mllib/regression/lasso.rb +100 -0
  102. data/lib/spark/mllib/regression/linear.rb +124 -0
  103. data/lib/spark/mllib/regression/ridge.rb +97 -0
  104. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  105. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  106. data/lib/spark/mllib/stat/distribution.rb +12 -0
  107. data/lib/spark/mllib/vector.rb +185 -0
  108. data/lib/spark/rdd.rb +1377 -0
  109. data/lib/spark/sampler.rb +92 -0
  110. data/lib/spark/serializer.rb +79 -0
  111. data/lib/spark/serializer/auto_batched.rb +59 -0
  112. data/lib/spark/serializer/base.rb +63 -0
  113. data/lib/spark/serializer/batched.rb +84 -0
  114. data/lib/spark/serializer/cartesian.rb +13 -0
  115. data/lib/spark/serializer/compressed.rb +27 -0
  116. data/lib/spark/serializer/marshal.rb +17 -0
  117. data/lib/spark/serializer/message_pack.rb +23 -0
  118. data/lib/spark/serializer/oj.rb +23 -0
  119. data/lib/spark/serializer/pair.rb +41 -0
  120. data/lib/spark/serializer/text.rb +25 -0
  121. data/lib/spark/sort.rb +189 -0
  122. data/lib/spark/stat_counter.rb +125 -0
  123. data/lib/spark/storage_level.rb +39 -0
  124. data/lib/spark/version.rb +3 -0
  125. data/lib/spark/worker/master.rb +144 -0
  126. data/lib/spark/worker/spark_files.rb +15 -0
  127. data/lib/spark/worker/worker.rb +200 -0
  128. data/ruby-spark.gemspec +47 -0
  129. data/spec/generator.rb +37 -0
  130. data/spec/inputs/lorem_300.txt +316 -0
  131. data/spec/inputs/numbers/1.txt +50 -0
  132. data/spec/inputs/numbers/10.txt +50 -0
  133. data/spec/inputs/numbers/11.txt +50 -0
  134. data/spec/inputs/numbers/12.txt +50 -0
  135. data/spec/inputs/numbers/13.txt +50 -0
  136. data/spec/inputs/numbers/14.txt +50 -0
  137. data/spec/inputs/numbers/15.txt +50 -0
  138. data/spec/inputs/numbers/16.txt +50 -0
  139. data/spec/inputs/numbers/17.txt +50 -0
  140. data/spec/inputs/numbers/18.txt +50 -0
  141. data/spec/inputs/numbers/19.txt +50 -0
  142. data/spec/inputs/numbers/2.txt +50 -0
  143. data/spec/inputs/numbers/20.txt +50 -0
  144. data/spec/inputs/numbers/3.txt +50 -0
  145. data/spec/inputs/numbers/4.txt +50 -0
  146. data/spec/inputs/numbers/5.txt +50 -0
  147. data/spec/inputs/numbers/6.txt +50 -0
  148. data/spec/inputs/numbers/7.txt +50 -0
  149. data/spec/inputs/numbers/8.txt +50 -0
  150. data/spec/inputs/numbers/9.txt +50 -0
  151. data/spec/inputs/numbers_0_100.txt +101 -0
  152. data/spec/inputs/numbers_1_100.txt +100 -0
  153. data/spec/lib/collect_spec.rb +42 -0
  154. data/spec/lib/command_spec.rb +68 -0
  155. data/spec/lib/config_spec.rb +64 -0
  156. data/spec/lib/context_spec.rb +165 -0
  157. data/spec/lib/ext_spec.rb +72 -0
  158. data/spec/lib/external_apps_spec.rb +45 -0
  159. data/spec/lib/filter_spec.rb +80 -0
  160. data/spec/lib/flat_map_spec.rb +100 -0
  161. data/spec/lib/group_spec.rb +109 -0
  162. data/spec/lib/helper_spec.rb +19 -0
  163. data/spec/lib/key_spec.rb +41 -0
  164. data/spec/lib/manipulation_spec.rb +122 -0
  165. data/spec/lib/map_partitions_spec.rb +87 -0
  166. data/spec/lib/map_spec.rb +91 -0
  167. data/spec/lib/mllib/classification_spec.rb +54 -0
  168. data/spec/lib/mllib/clustering_spec.rb +35 -0
  169. data/spec/lib/mllib/matrix_spec.rb +32 -0
  170. data/spec/lib/mllib/regression_spec.rb +116 -0
  171. data/spec/lib/mllib/vector_spec.rb +77 -0
  172. data/spec/lib/reduce_by_key_spec.rb +118 -0
  173. data/spec/lib/reduce_spec.rb +131 -0
  174. data/spec/lib/sample_spec.rb +46 -0
  175. data/spec/lib/serializer_spec.rb +88 -0
  176. data/spec/lib/sort_spec.rb +58 -0
  177. data/spec/lib/statistic_spec.rb +170 -0
  178. data/spec/lib/whole_text_files_spec.rb +33 -0
  179. data/spec/spec_helper.rb +38 -0
  180. metadata +389 -0
data/lib/spark/command_validator.rb
@@ -0,0 +1,34 @@
+ module Spark
+   module CommandValidator
+
+     def validate(value, options)
+       validate_type(value, options[:type])
+     end
+
+     def valid?(value, options)
+       begin
+         validate(value, options)
+         return true
+       rescue
+         return false
+       end
+     end
+
+     def validate_type(value, types)
+       types = [types] if !types.is_a?(Array)
+
+       types.each do |type|
+         return if value.is_a?(type)
+       end
+
+       error "Value: #{value} should be a #{types.join(' or ')} but is #{value.class}."
+     end
+
+     def validate_size(array1, array2)
+       if array1.size != array2.size
+         error "Wrong number of arguments (#{array1.size} for #{array2.size})"
+       end
+     end
+
+   end
+ end
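
For context, a minimal sketch of how a command class might mix this module in. The including class and its error helper are hypothetical stand-ins (in the gem, error is provided elsewhere, e.g. by the command base class); only the validator calls come from the module above.

    # Hypothetical including class, for illustration only.
    class MapCommand
      include Spark::CommandValidator

      # Stand-in for the gem's real error helper.
      def error(message)
        raise ArgumentError, message
      end

      def initialize(func)
        # Accept either a lambda or a symbol (e.g. :to_s).
        validate(func, type: [Proc, Symbol])
        @func = func
      end
    end

    MapCommand.new(lambda { |x| x * 2 })  # passes validation
    MapCommand.new(42)                    # raises with the message built in validate_type
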
data/lib/spark/config.rb
@@ -0,0 +1,238 @@
+ # Necessary libraries
+ Spark.load_lib
+
+ module Spark
+   # Common configuration for RubySpark and Spark
+   class Config
+
+     include Spark::Helper::System
+
+     TYPES = {
+       'spark.shuffle.spill' => :boolean,
+       'spark.ruby.serializer.compress' => :boolean
+     }
+
+     # Initialize java SparkConf and load default configuration.
+     def initialize
+       @spark_conf = SparkConf.new(true)
+       set_default
+     end
+
+     def from_file(file)
+       check_read_only
+
+       if file && File.exist?(file)
+         file = File.expand_path(file)
+         RubyUtils.loadPropertiesFile(spark_conf, file)
+       end
+     end
+
+     def [](key)
+       get(key)
+     end
+
+     def []=(key, value)
+       set(key, value)
+     end
+
+     def spark_conf
+       if Spark.started?
+         # Get latest configuration
+         Spark.context.jcontext.conf
+       else
+         @spark_conf
+       end
+     end
+
+     def valid!
+       errors = []
+
+       if !contains?('spark.app.name')
+         errors << 'An application name must be set in your configuration.'
+       end
+
+       if !contains?('spark.master')
+         errors << 'A master URL must be set in your configuration.'
+       end
+
+       if Spark::Serializer.find(get('spark.ruby.serializer')).nil?
+         errors << 'Unknown serializer.'
+       end
+
+       scanned = get('spark.ruby.executor.command').scan('%s')
+
+       if scanned.size == 0
+         errors << "Executor command must contain '%s'."
+       end
+
+       if scanned.size > 1
+         errors << "Executor command can contain only one '%s'."
+       end
+
+       if errors.any?
+         errors.map!{|error| "- #{error}"}
+
+         raise Spark::ConfigurationError, "Configuration is not valid:\r\n#{errors.join("\r\n")}"
+       end
+     end
+
+     def read_only?
+       Spark.started?
+     end
+
+     # Rescue from NoSuchElementException
+     def get(key)
+       value = spark_conf.get(key.to_s)
+
+       case TYPES[key]
+       when :boolean
+         parse_boolean(value)
+       when :integer
+         parse_integer(value)
+       else
+         value
+       end
+     rescue
+       nil
+     end
+
+     def get_all
+       Hash[spark_conf.getAll.map{|tuple| [tuple._1, tuple._2]}]
+     end
+
+     def contains?(key)
+       spark_conf.contains(key.to_s)
+     end
+
+     def set(key, value)
+       check_read_only
+       spark_conf.set(key.to_s, value.to_s)
+     end
+
+     def set_app_name(name)
+       set('spark.app.name', name)
+     end
+
+     def set_master(master)
+       set('spark.master', master)
+     end
+
+     def parse_boolean(value)
+       case value
+       when 'true'
+         true
+       when 'false'
+         false
+       end
+     end
+
+     def parse_integer(value)
+       value.to_i
+     end
+
+     # =============================================================================
+     # Defaults
+
+     def set_default
+       set_app_name('RubySpark')
+       set_master('local[*]')
+       set('spark.ruby.driver_home', Spark.home)
+       set('spark.ruby.serializer', default_serializer)
+       set('spark.ruby.serializer.compress', default_serializer_compress)
+       set('spark.ruby.serializer.batch_size', default_serializer_batch_size)
+       set('spark.ruby.executor.uri', default_executor_uri)
+       set('spark.ruby.executor.command', default_executor_command)
+       set('spark.ruby.executor.options', default_executor_options)
+       set('spark.ruby.worker.type', default_worker_type)
+       load_executor_envs
+     end
+
+     def default_serializer
+       ENV['SPARK_RUBY_SERIALIZER'] || Spark::Serializer::DEFAULT_SERIALIZER_NAME
+     end
+
+     def default_serializer_compress
+       ENV['SPARK_RUBY_SERIALIZER_COMPRESS'] || Spark::Serializer::DEFAULT_COMPRESS
+     end
+
+     def default_serializer_batch_size
+       ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE
+     end
+
+     # Ruby executor.
+     #
+     # == Options:
+     # nil::
+     #   System's gem is loaded (ruby-spark).
+     #
+     # other::
+     #   Path of the library which will be used.
+     #   Current ruby-spark gem is used.
+     #   (default)
+     #
+     def default_executor_uri
+       ENV['SPARK_RUBY_EXECUTOR_URI'] || ''
+     end
+
+     # Command template which is applied when Scala wants to create a Ruby
+     # process (e.g. master, home request). The command is represented by '%s'.
+     #
+     # == Example:
+     #   bash --norc -i -c "export HOME=/home/user; cd; source .bashrc; %s"
+     #
+     def default_executor_command
+       ENV['SPARK_RUBY_EXECUTOR_COMMAND'] || '%s'
+     end
+
+     # Options for every worker.
+     #
+     # == Examples:
+     #   -J-Xmx512m
+     #
+     def default_executor_options
+       ENV['SPARK_RUBY_EXECUTOR_OPTIONS'] || ''
+     end
+
+     # Type of worker.
+     #
+     # == Options:
+     # process:: (default)
+     # thread:: (experimental)
+     #
+     def default_worker_type
+       ENV['SPARK_RUBY_WORKER_TYPE'] || 'process'
+     end
+
+     # Load environment variables for executor from ENV.
+     #
+     # == Examples:
+     #   SPARK_RUBY_EXECUTOR_ENV_KEY1="1"
+     #   SPARK_RUBY_EXECUTOR_ENV_KEY2="2"
+     #
+     def load_executor_envs
+       prefix = 'SPARK_RUBY_EXECUTOR_ENV_'
+
+       envs = ENV.select{|key, _| key.start_with?(prefix)}
+       envs.each do |key, value|
+         key = key.dup # ENV keys are frozen
+         key.slice!(0, prefix.size)
+
+         set("spark.ruby.executor.env.#{key}", value)
+       end
+     end
+
+     # Aliases
+     alias_method :getAll, :get_all
+     alias_method :setAppName, :set_app_name
+     alias_method :setMaster, :set_master
+
+     private
+
+     def check_read_only
+       if read_only?
+         raise Spark::ConfigurationError, 'Configuration is read only'
+       end
+     end
+
+   end
+ end
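
A short usage sketch. It assumes Spark.config exposes the Spark::Config instance before the context is started (Context#initialize below calls Spark.config.valid!, so such an accessor exists); the property values themselves are illustrative.

    config = Spark.config                # assumed accessor for the Spark::Config instance
    config.set_app_name('WordCount')     # alias: setAppName
    config.set_master('local[2]')        # alias: setMaster

    # []= delegates to set, which stores everything as a string on the Java SparkConf.
    config['spark.ruby.serializer.batch_size'] = 2048
    config['spark.ruby.serializer.compress']   = true

    # get consults TYPES, so the compress flag comes back as a real boolean.
    config.get('spark.ruby.serializer.compress')  # => true

    # Raises Spark::ConfigurationError if a required key is missing or invalid.
    config.valid!

Once Spark.started? is true the configuration becomes read only: any further set raises Spark::ConfigurationError via check_read_only.
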
data/lib/spark/constant.rb
@@ -0,0 +1,14 @@
+ module Spark
+   # Common constants for Ruby and Spark
+   module Constant
+     DATA_EOF = -2
+     WORKER_ERROR = -1
+     WORKER_DONE = 0
+     CREATE_WORKER = 1
+     KILL_WORKER = 2
+     KILL_WORKER_AND_WAIT = 3
+     SUCCESSFULLY_KILLED = 4
+     UNSUCCESSFUL_KILLING = 5
+     ACCUMULATOR_ACK = 6
+   end
+ end
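
These codes appear to be the control protocol shared between the Scala side (RubyWorker.scala) and the Ruby worker master (data/lib/spark/worker/master.rb), neither of which is shown in this hunk. A hedged dispatch sketch, for illustration only:

    include Spark::Constant

    # Hypothetical dispatcher; the real loop lives in the worker master.
    def handle_command(code)
      case code
      when CREATE_WORKER        then :create_worker
      when KILL_WORKER          then :kill_worker
      when KILL_WORKER_AND_WAIT then :kill_worker_and_wait
      else                           :unknown
      end
    end

    handle_command(Spark::Constant::CREATE_WORKER)  # => :create_worker
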
data/lib/spark/context.rb
@@ -0,0 +1,322 @@
+ # Necessary libraries
+ Spark.load_lib
+
+ module Spark
+   ##
+   # Main entry point for Spark functionality. A SparkContext represents the connection to a Spark
+   # cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster.
+   #
+   class Context
+
+     include Spark::Helper::System
+     include Spark::Helper::Parser
+     include Spark::Helper::Logger
+
+     attr_reader :jcontext, :jaccumulator, :temp_dir
+
+     # Constructor for the Ruby context. Configuration is automatically taken
+     # from Spark. Config will be set to defaults if the user starts the
+     # context first.
+     #
+     def initialize
+       Spark.config.valid!
+       @jcontext = JavaSparkContext.new(Spark.config.spark_conf)
+       @jcontext.addJar(Spark.ruby_spark_jar)
+
+       # Does not work on Spark 1.2
+       # ui.attachTab(RubyTab.new(ui, to_java_hash(RbConfig::CONFIG)))
+
+       spark_local_dir = JUtils.getLocalDir(sc.conf)
+       @temp_dir = JUtils.createTempDir(spark_local_dir, 'ruby').getAbsolutePath
+
+       accum_server = Spark::Accumulator::Server
+       accum_server.start
+       @jaccumulator = @jcontext.accumulator(ArrayList.new, RubyAccumulatorParam.new(accum_server.host, accum_server.port))
+
+       log_info("Ruby accumulator server is running on port #{accum_server.port}")
+
+       set_call_site('Ruby') # description of stage
+     end
+
+     def stop
+       Spark::Accumulator::Server.stop
+       log_info('Ruby accumulator server was stopped')
+       @jcontext.stop
+     end
+
+     def sc
+       @jcontext.sc
+     end
+
+     def ui
+       sc.ui
+     end
+
+     # Default level of parallelism to use when not given by user (e.g. parallelize and makeRDD)
+     #
+     def default_parallelism
+       sc.defaultParallelism
+     end
+
+     # Default serializer
+     #
+     # Batch -> Compress -> Basic
+     #
+     def default_serializer
+       # Basic
+       serializer = Spark::Serializer.find!(config('spark.ruby.serializer')).new
+
+       # Compress
+       if config('spark.ruby.serializer.compress')
+         serializer = Spark::Serializer.compressed(serializer)
+       end
+
+       # Batching
+       batch_size = default_batch_size
+       if batch_size == 'auto'
+         serializer = Spark::Serializer.auto_batched(serializer)
+       else
+         serializer = Spark::Serializer.batched(serializer, batch_size)
+       end
+
+       # Finally, the resulting "container" wraps the inner serializers
+       serializer
+     end
+
+     def default_batch_size
+       size = config('spark.ruby.serializer.batch_size').to_i
+       if size >= 1
+         size
+       else
+         'auto'
+       end
+     end
+
+     # Set a local property that affects jobs submitted from this thread, such as the
+     # Spark fair scheduler pool.
+     #
+     def set_local_property(key, value)
+       jcontext.setLocalProperty(key, value)
+     end
+
+     # Get a local property set in this thread, or nil if it is missing
+     #
+     def get_local_property(key)
+       jcontext.getLocalProperty(key)
+     end
+
+     # Support function for API backtraces.
+     #
+     def set_call_site(site)
+       set_local_property('externalCallSite', site)
+     end
+
+     # Capture the current user callsite and return a formatted version for printing. If the user
+     # has overridden the call site, this will return the user's version.
+     #
+     def get_call_site
+       jcontext.getCallSite
+     end
+
+     # Return a copy of this SparkContext's configuration. The configuration *cannot*
+     # be changed at runtime.
+     #
+     def config(key=nil)
+       if key
+         Spark.config.get(key)
+       else
+         Spark.config
+       end
+     end
+
+     # Add a file to be downloaded with this Spark job on every node.
+     # The path of the file passed can be either a local file, a file in HDFS
+     # (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI.
+     #
+     # To access the file in Spark jobs, use `SparkFiles.get(file_name)` with the
+     # filename to find its download location.
+     #
+     # == Example:
+     #   `echo 10 > test.txt`
+     #
+     #   $sc.add_file('test.txt')
+     #   $sc.parallelize(0..5).map(lambda{|x| x * SparkFiles.get_content('test.txt').to_i}).collect
+     #   # => [0, 10, 20, 30, 40, 50]
+     #
+     def add_file(*files)
+       files.each do |file|
+         sc.addFile(file)
+       end
+     end
+
+     # Broadcast a read-only variable to the cluster, returning a Spark::Broadcast
+     # object for reading it in distributed functions. The variable will
+     # be sent to each cluster only once.
+     #
+     # == Example:
+     #   broadcast1 = $sc.broadcast('a')
+     #   broadcast2 = $sc.broadcast('b')
+     #
+     #   rdd = $sc.parallelize(0..5, 4)
+     #   rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2)
+     #   rdd = rdd.map_partitions_with_index(lambda{|part, index| [broadcast1.value * index, broadcast2.value * index] })
+     #   rdd.collect
+     #   # => ["", "", "a", "b", "aa", "bb", "aaa", "bbb"]
+     #
+     def broadcast(value)
+       Spark::Broadcast.new(self, value)
+     end
+
+     # Create an Accumulator with the given initial value, using a given
+     # accum_param helper object to define how to add values of the
+     # data type if provided.
+     #
+     # == Example:
+     #   accum = $sc.accumulator(7)
+     #
+     #   rdd = $sc.parallelize(0..5, 4)
+     #   rdd = rdd.bind(accum: accum)
+     #   rdd = rdd.map_partitions(lambda{|_| accum.add(1) })
+     #   rdd = rdd.collect
+     #
+     #   accum.value
+     #   # => 11
+     #
+     def accumulator(value, accum_param=:+, zero_value=0)
+       Spark::Accumulator.new(value, accum_param, zero_value)
+     end
+
+     # Distribute a local Ruby collection to form an RDD.
+     # The direct method can be slow, so be careful: this method updates the data in place.
+     #
+     # == Parameters:
+     # data:: Range or Array
+     # num_slices:: number of slices
+     # serializer:: custom serializer (default: serializer based on configuration)
+     #
+     # == Examples:
+     #   $sc.parallelize(["1", "2", "3"]).map(lambda{|x| x.to_i}).collect
+     #   #=> [1, 2, 3]
+     #
+     #   $sc.parallelize(1..3).map(:to_s).collect
+     #   #=> ["1", "2", "3"]
+     #
+     def parallelize(data, num_slices=nil, serializer=nil)
+       num_slices ||= default_parallelism
+       serializer ||= default_serializer
+
+       serializer.check_each(data)
+
+       # Through file
+       file = Tempfile.new('to_parallelize', temp_dir)
+       serializer.dump_to_io(data, file)
+       file.close # not unlink
+       jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)
+
+       Spark::RDD.new(jrdd, self, serializer)
+     ensure
+       file && file.unlink
+     end
+
+     # Read a text file from HDFS, a local file system (available on all nodes), or any
+     # Hadoop-supported file system URI, and return it as an RDD of Strings.
+     #
+     # == Example:
+     #   f = Tempfile.new("test")
+     #   f.puts("1")
+     #   f.puts("2")
+     #   f.close
+     #
+     #   $sc.text_file(f.path).map(lambda{|x| x.to_i}).collect
+     #   # => [1, 2]
+     #
+     def text_file(path, min_partitions=nil, encoding=Encoding::UTF_8, serializer=nil)
+       min_partitions ||= default_parallelism
+       serializer ||= default_serializer
+       deserializer = Spark::Serializer.build { __text__(encoding) }
+
+       Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, deserializer)
+     end
+
+     # Read a directory of text files from HDFS, a local file system (available on all nodes), or any
+     # Hadoop-supported file system URI. Each file is read as a single record and returned in a
+     # key-value pair, where the key is the path of each file and the value is the content of each file.
+     #
+     # == Example:
+     #   dir = Dir.mktmpdir
+     #   f1 = Tempfile.new("test1", dir)
+     #   f2 = Tempfile.new("test2", dir)
+     #   f1.puts("1"); f1.puts("2");
+     #   f2.puts("3"); f2.puts("4");
+     #   f1.close
+     #   f2.close
+     #
+     #   $sc.whole_text_files(dir).flat_map(lambda{|key, value| value.split}).collect
+     #   # => ["1", "2", "3", "4"]
+     #
+     def whole_text_files(path, min_partitions=nil, serializer=nil)
+       min_partitions ||= default_parallelism
+       serializer ||= default_serializer
+       deserializer = Spark::Serializer.build{ __pair__(__text__, __text__) }
+
+       Spark::RDD.new(@jcontext.wholeTextFiles(path, min_partitions), self, serializer, deserializer)
+     end
+
+     # Executes the given partition function f on the specified set of partitions,
+     # returning the result as an array of elements.
+     #
+     # If partitions is not specified, this will run over all partitions.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..10, 5)
+     #   $sc.run_job(rdd, lambda{|x| x.to_s}, [0, 2])
+     #   # => ["[0, 1]", "[4, 5]"]
+     #
+     def run_job(rdd, f, partitions=nil, allow_local=false)
+       run_job_with_command(rdd, partitions, allow_local, Spark::Command::MapPartitions, f)
+     end
+
+     # Execute the given command on a specific set of partitions.
+     #
+     def run_job_with_command(rdd, partitions, allow_local, command, *args)
+       if !partitions.nil? && !partitions.is_a?(Array)
+         raise Spark::ContextError, 'Partitions must be nil or Array'
+       end
+
+       partitions_size = rdd.partitions_size
+
+       # Execute all parts
+       if partitions.nil?
+         partitions = (0...partitions_size).to_a
+       end
+
+       # Can happen when you use coalesce
+       partitions.delete_if {|part| part >= partitions_size}
+
+       # Rjb represents Fixnum as Integer but JRuby as Long
+       partitions = to_java_array_list(convert_to_java_int(partitions))
+
+       # File for result
+       file = Tempfile.new('collect', temp_dir)
+
+       mapped = rdd.new_rdd_from_command(command, *args)
+       RubyRDD.runJob(rdd.context.sc, mapped.jrdd, partitions, allow_local, file.path)
+
+       mapped.collect_from_file(file)
+     end
+
+
+     # Aliases
+     alias_method :textFile, :text_file
+     alias_method :wholeTextFiles, :whole_text_files
+     alias_method :defaultParallelism, :default_parallelism
+     alias_method :setLocalProperty, :set_local_property
+     alias_method :getLocalProperty, :get_local_property
+     alias_method :setCallSite, :set_call_site
+     alias_method :getCallSite, :get_call_site
+     alias_method :runJob, :run_job
+     alias_method :runJobWithCommand, :run_job_with_command
+     alias_method :addFile, :add_file
+
+   end
+ end
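
To tie the pieces together, a minimal driver-side sketch. Spark.start is an assumed bootstrap that validates the config and builds this Context; Spark.context is the accessor referenced by Config#spark_conf above (the `$sc` used in the inline examples). Everything after that uses only methods shown in this hunk or in its embedded examples.

    require 'ruby-spark'

    Spark.start            # assumed entry point that constructs Spark::Context
    sc = Spark.context     # the context instance ($sc in the examples above)

    rdd = sc.parallelize(1..5, 2)
    rdd.map(lambda { |x| x * x }).collect
    # => [1, 4, 9, 16, 25]

    sc.run_job(rdd, lambda { |part| part.to_s }, [0])
    # => one stringified partition, as in the run_job example above

    sc.stop                # shuts down the accumulator server and the JavaSparkContext
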