ruby-spark 1.1.0.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +252 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +6 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/comparison/prepare.sh +18 -0
  12. data/benchmark/comparison/python.py +156 -0
  13. data/benchmark/comparison/r.r +69 -0
  14. data/benchmark/comparison/ruby.rb +167 -0
  15. data/benchmark/comparison/run-all.sh +160 -0
  16. data/benchmark/comparison/scala.scala +181 -0
  17. data/benchmark/custom_marshal.rb +94 -0
  18. data/benchmark/digest.rb +150 -0
  19. data/benchmark/enumerator.rb +88 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/example/website_search.rb +83 -0
  27. data/ext/ruby_c/extconf.rb +3 -0
  28. data/ext/ruby_c/murmur.c +158 -0
  29. data/ext/ruby_c/murmur.h +9 -0
  30. data/ext/ruby_c/ruby-spark.c +18 -0
  31. data/ext/ruby_java/Digest.java +36 -0
  32. data/ext/ruby_java/Murmur2.java +98 -0
  33. data/ext/ruby_java/RubySparkExtService.java +28 -0
  34. data/ext/ruby_java/extconf.rb +3 -0
  35. data/ext/spark/build.sbt +73 -0
  36. data/ext/spark/project/plugins.sbt +9 -0
  37. data/ext/spark/sbt/sbt +34 -0
  38. data/ext/spark/src/main/scala/Exec.scala +91 -0
  39. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  40. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  41. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  42. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  43. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  44. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  46. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  47. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  48. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  49. data/ext/spark/src/main/scala/RubyRDD.scala +392 -0
  50. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  51. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  52. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  53. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  54. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  55. data/lib/ruby-spark.rb +1 -0
  56. data/lib/spark.rb +198 -0
  57. data/lib/spark/accumulator.rb +260 -0
  58. data/lib/spark/broadcast.rb +98 -0
  59. data/lib/spark/build.rb +43 -0
  60. data/lib/spark/cli.rb +169 -0
  61. data/lib/spark/command.rb +86 -0
  62. data/lib/spark/command/base.rb +158 -0
  63. data/lib/spark/command/basic.rb +345 -0
  64. data/lib/spark/command/pair.rb +124 -0
  65. data/lib/spark/command/sort.rb +51 -0
  66. data/lib/spark/command/statistic.rb +144 -0
  67. data/lib/spark/command_builder.rb +141 -0
  68. data/lib/spark/command_validator.rb +34 -0
  69. data/lib/spark/config.rb +238 -0
  70. data/lib/spark/constant.rb +14 -0
  71. data/lib/spark/context.rb +322 -0
  72. data/lib/spark/error.rb +50 -0
  73. data/lib/spark/ext/hash.rb +41 -0
  74. data/lib/spark/ext/integer.rb +25 -0
  75. data/lib/spark/ext/io.rb +67 -0
  76. data/lib/spark/ext/ip_socket.rb +29 -0
  77. data/lib/spark/ext/module.rb +58 -0
  78. data/lib/spark/ext/object.rb +24 -0
  79. data/lib/spark/ext/string.rb +24 -0
  80. data/lib/spark/helper.rb +10 -0
  81. data/lib/spark/helper/logger.rb +40 -0
  82. data/lib/spark/helper/parser.rb +85 -0
  83. data/lib/spark/helper/serialize.rb +71 -0
  84. data/lib/spark/helper/statistic.rb +93 -0
  85. data/lib/spark/helper/system.rb +42 -0
  86. data/lib/spark/java_bridge.rb +19 -0
  87. data/lib/spark/java_bridge/base.rb +203 -0
  88. data/lib/spark/java_bridge/jruby.rb +23 -0
  89. data/lib/spark/java_bridge/rjb.rb +41 -0
  90. data/lib/spark/logger.rb +76 -0
  91. data/lib/spark/mllib.rb +100 -0
  92. data/lib/spark/mllib/classification/common.rb +31 -0
  93. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  94. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  95. data/lib/spark/mllib/classification/svm.rb +135 -0
  96. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  97. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  98. data/lib/spark/mllib/matrix.rb +120 -0
  99. data/lib/spark/mllib/regression/common.rb +73 -0
  100. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  101. data/lib/spark/mllib/regression/lasso.rb +100 -0
  102. data/lib/spark/mllib/regression/linear.rb +124 -0
  103. data/lib/spark/mllib/regression/ridge.rb +97 -0
  104. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  105. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  106. data/lib/spark/mllib/stat/distribution.rb +12 -0
  107. data/lib/spark/mllib/vector.rb +185 -0
  108. data/lib/spark/rdd.rb +1377 -0
  109. data/lib/spark/sampler.rb +92 -0
  110. data/lib/spark/serializer.rb +79 -0
  111. data/lib/spark/serializer/auto_batched.rb +59 -0
  112. data/lib/spark/serializer/base.rb +63 -0
  113. data/lib/spark/serializer/batched.rb +84 -0
  114. data/lib/spark/serializer/cartesian.rb +13 -0
  115. data/lib/spark/serializer/compressed.rb +27 -0
  116. data/lib/spark/serializer/marshal.rb +17 -0
  117. data/lib/spark/serializer/message_pack.rb +23 -0
  118. data/lib/spark/serializer/oj.rb +23 -0
  119. data/lib/spark/serializer/pair.rb +41 -0
  120. data/lib/spark/serializer/text.rb +25 -0
  121. data/lib/spark/sort.rb +189 -0
  122. data/lib/spark/stat_counter.rb +125 -0
  123. data/lib/spark/storage_level.rb +39 -0
  124. data/lib/spark/version.rb +3 -0
  125. data/lib/spark/worker/master.rb +144 -0
  126. data/lib/spark/worker/spark_files.rb +15 -0
  127. data/lib/spark/worker/worker.rb +200 -0
  128. data/ruby-spark.gemspec +47 -0
  129. data/spec/generator.rb +37 -0
  130. data/spec/inputs/lorem_300.txt +316 -0
  131. data/spec/inputs/numbers/1.txt +50 -0
  132. data/spec/inputs/numbers/10.txt +50 -0
  133. data/spec/inputs/numbers/11.txt +50 -0
  134. data/spec/inputs/numbers/12.txt +50 -0
  135. data/spec/inputs/numbers/13.txt +50 -0
  136. data/spec/inputs/numbers/14.txt +50 -0
  137. data/spec/inputs/numbers/15.txt +50 -0
  138. data/spec/inputs/numbers/16.txt +50 -0
  139. data/spec/inputs/numbers/17.txt +50 -0
  140. data/spec/inputs/numbers/18.txt +50 -0
  141. data/spec/inputs/numbers/19.txt +50 -0
  142. data/spec/inputs/numbers/2.txt +50 -0
  143. data/spec/inputs/numbers/20.txt +50 -0
  144. data/spec/inputs/numbers/3.txt +50 -0
  145. data/spec/inputs/numbers/4.txt +50 -0
  146. data/spec/inputs/numbers/5.txt +50 -0
  147. data/spec/inputs/numbers/6.txt +50 -0
  148. data/spec/inputs/numbers/7.txt +50 -0
  149. data/spec/inputs/numbers/8.txt +50 -0
  150. data/spec/inputs/numbers/9.txt +50 -0
  151. data/spec/inputs/numbers_0_100.txt +101 -0
  152. data/spec/inputs/numbers_1_100.txt +100 -0
  153. data/spec/lib/collect_spec.rb +42 -0
  154. data/spec/lib/command_spec.rb +68 -0
  155. data/spec/lib/config_spec.rb +64 -0
  156. data/spec/lib/context_spec.rb +165 -0
  157. data/spec/lib/ext_spec.rb +72 -0
  158. data/spec/lib/external_apps_spec.rb +45 -0
  159. data/spec/lib/filter_spec.rb +80 -0
  160. data/spec/lib/flat_map_spec.rb +100 -0
  161. data/spec/lib/group_spec.rb +109 -0
  162. data/spec/lib/helper_spec.rb +19 -0
  163. data/spec/lib/key_spec.rb +41 -0
  164. data/spec/lib/manipulation_spec.rb +122 -0
  165. data/spec/lib/map_partitions_spec.rb +87 -0
  166. data/spec/lib/map_spec.rb +91 -0
  167. data/spec/lib/mllib/classification_spec.rb +54 -0
  168. data/spec/lib/mllib/clustering_spec.rb +35 -0
  169. data/spec/lib/mllib/matrix_spec.rb +32 -0
  170. data/spec/lib/mllib/regression_spec.rb +116 -0
  171. data/spec/lib/mllib/vector_spec.rb +77 -0
  172. data/spec/lib/reduce_by_key_spec.rb +118 -0
  173. data/spec/lib/reduce_spec.rb +131 -0
  174. data/spec/lib/sample_spec.rb +46 -0
  175. data/spec/lib/serializer_spec.rb +88 -0
  176. data/spec/lib/sort_spec.rb +58 -0
  177. data/spec/lib/statistic_spec.rb +170 -0
  178. data/spec/lib/whole_text_files_spec.rb +33 -0
  179. data/spec/spec_helper.rb +38 -0
  180. metadata +389 -0
@@ -0,0 +1,34 @@
+ module Spark
+   module CommandValidator
+
+     def validate(value, options)
+       validate_type(value, options[:type])
+     end
+
+     def valid?(value, options)
+       begin
+         validate(value, options)
+         return true
+       rescue
+         return false
+       end
+     end
+
+     def validate_type(value, types)
+       types = [types] if !types.is_a?(Array)
+
+       types.each do |type|
+         return if value.is_a?(type)
+       end
+
+       error "Value: #{value} should be a #{types.join(' or ')} but is #{value.class}."
+     end
+
+     def validate_size(array1, array2)
+       if array1.size != array2.size
+         error "Wrong number of arguments (#{array1.size} for #{array2.size})"
+       end
+     end
+
+   end
+ end
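
For orientation, a minimal usage sketch (not part of the published diff): any class that defines an `error` helper can mix the validator in and check argument types. The `CommandArgs` class and its `error` method below are hypothetical.

  class CommandArgs
    include Spark::CommandValidator

    # Hypothetical error handler; the validator only expects `error` to exist.
    def error(message)
      raise ArgumentError, message
    end
  end

  args = CommandArgs.new
  args.valid?(10, type: [Integer, Float])    # => true
  args.valid?('10', type: [Integer, Float])  # => false
  args.validate('10', type: Numeric)         # raises ArgumentError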
@@ -0,0 +1,238 @@
+ # Necessary libraries
+ Spark.load_lib
+
+ module Spark
+   # Common configuration for RubySpark and Spark
+   class Config
+
+     include Spark::Helper::System
+
+     TYPES = {
+       'spark.shuffle.spill' => :boolean,
+       'spark.ruby.serializer.compress' => :boolean
+     }
+
+     # Initialize the Java SparkConf and load the default configuration.
+     def initialize
+       @spark_conf = SparkConf.new(true)
+       set_default
+     end
+
+     def from_file(file)
+       check_read_only
+
+       if file && File.exist?(file)
+         file = File.expand_path(file)
+         RubyUtils.loadPropertiesFile(spark_conf, file)
+       end
+     end
+
+     def [](key)
+       get(key)
+     end
+
+     def []=(key, value)
+       set(key, value)
+     end
+
+     def spark_conf
+       if Spark.started?
+         # Get the latest configuration
+         Spark.context.jcontext.conf
+       else
+         @spark_conf
+       end
+     end
+
+     def valid!
+       errors = []
+
+       if !contains?('spark.app.name')
+         errors << 'An application name must be set in your configuration.'
+       end
+
+       if !contains?('spark.master')
+         errors << 'A master URL must be set in your configuration.'
+       end
+
+       if Spark::Serializer.find(get('spark.ruby.serializer')).nil?
+         errors << 'Unknown serializer.'
+       end
+
+       scanned = get('spark.ruby.executor.command').scan('%s')
+
+       if scanned.size == 0
+         errors << "Executor command must contain '%s'."
+       end
+
+       if scanned.size > 1
+         errors << "Executor command can contain only one '%s'."
+       end
+
+       if errors.any?
+         errors.map!{|error| "- #{error}"}
+
+         raise Spark::ConfigurationError, "Configuration is not valid:\r\n#{errors.join("\r\n")}"
+       end
+     end
+
+     def read_only?
+       Spark.started?
+     end
+
+     # Rescue from NoSuchElementException
+     def get(key)
+       value = spark_conf.get(key.to_s)
+
+       case TYPES[key]
+       when :boolean
+         parse_boolean(value)
+       when :integer
+         parse_integer(value)
+       else
+         value
+       end
+     rescue
+       nil
+     end
+
+     def get_all
+       Hash[spark_conf.getAll.map{|tuple| [tuple._1, tuple._2]}]
+     end
+
+     def contains?(key)
+       spark_conf.contains(key.to_s)
+     end
+
+     def set(key, value)
+       check_read_only
+       spark_conf.set(key.to_s, value.to_s)
+     end
+
+     def set_app_name(name)
+       set('spark.app.name', name)
+     end
+
+     def set_master(master)
+       set('spark.master', master)
+     end
+
+     def parse_boolean(value)
+       case value
+       when 'true'
+         true
+       when 'false'
+         false
+       end
+     end
+
+     def parse_integer(value)
+       value.to_i
+     end
+
+     # =============================================================================
+     # Defaults
+
+     def set_default
+       set_app_name('RubySpark')
+       set_master('local[*]')
+       set('spark.ruby.driver_home', Spark.home)
+       set('spark.ruby.serializer', default_serializer)
+       set('spark.ruby.serializer.compress', default_serializer_compress)
+       set('spark.ruby.serializer.batch_size', default_serializer_batch_size)
+       set('spark.ruby.executor.uri', default_executor_uri)
+       set('spark.ruby.executor.command', default_executor_command)
+       set('spark.ruby.executor.options', default_executor_options)
+       set('spark.ruby.worker.type', default_worker_type)
+       load_executor_envs
+     end
+
+     def default_serializer
+       ENV['SPARK_RUBY_SERIALIZER'] || Spark::Serializer::DEFAULT_SERIALIZER_NAME
+     end
+
+     def default_serializer_compress
+       ENV['SPARK_RUBY_SERIALIZER_COMPRESS'] || Spark::Serializer::DEFAULT_COMPRESS
+     end
+
+     def default_serializer_batch_size
+       ENV['SPARK_RUBY_SERIALIZER_BATCH_SIZE'] || Spark::Serializer::DEFAULT_BATCH_SIZE
+     end
+
+     # Ruby executor.
+     #
+     # == Options:
+     # nil::
+     #   The system gem (ruby-spark) is loaded.
+     #
+     # other::
+     #   Path of the library which will be used.
+     #   The current ruby-spark gem is used.
+     #   (default)
+     #
+     def default_executor_uri
+       ENV['SPARK_RUBY_EXECUTOR_URI'] || ''
+     end
+
+     # Command template which is applied when Scala wants to create a Ruby
+     # process (e.g. master, home request). The command is represented by '%s'.
+     #
+     # == Example:
+     #   bash --norc -i -c "export HOME=/home/user; cd; source .bashrc; %s"
+     #
+     def default_executor_command
+       ENV['SPARK_RUBY_EXECUTOR_COMMAND'] || '%s'
+     end
+
+     # Options for every worker.
+     #
+     # == Examples:
+     #   -J-Xmx512m
+     #
+     def default_executor_options
+       ENV['SPARK_RUBY_EXECUTOR_OPTIONS'] || ''
+     end
+
+     # Type of worker.
+     #
+     # == Options:
+     # process:: (default)
+     # thread:: (experimental)
+     #
+     def default_worker_type
+       ENV['SPARK_RUBY_WORKER_TYPE'] || 'process'
+     end
+
+     # Load environment variables for the executor from ENV.
+     #
+     # == Examples:
+     #   SPARK_RUBY_EXECUTOR_ENV_KEY1="1"
+     #   SPARK_RUBY_EXECUTOR_ENV_KEY2="2"
+     #
+     def load_executor_envs
+       prefix = 'SPARK_RUBY_EXECUTOR_ENV_'
+
+       envs = ENV.select{|key, _| key.start_with?(prefix)}
+       envs.each do |key, value|
+         key = key.dup # ENV keys are frozen
+         key.slice!(0, prefix.size)
+
+         set("spark.ruby.executor.env.#{key}", value)
+       end
+     end
+
+     # Aliases
+     alias_method :getAll, :get_all
+     alias_method :setAppName, :set_app_name
+     alias_method :setMaster, :set_master
+
+     private
+
+     def check_read_only
+       if read_only?
+         raise Spark::ConfigurationError, 'Configuration is read only'
+       end
+     end
+
+   end
+ end
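
A minimal sketch of driving this configuration before the context starts, assuming `Spark.config` returns the `Spark::Config` instance (as the `spark_conf` accessor above suggests); the app name, master URL, and batch size below are illustrative only:

  config = Spark.config

  config.set_app_name('MyRubySparkApp')
  config.set_master('local[2]')
  config['spark.ruby.serializer.batch_size'] = 2048  # stored as a string via set()

  config['spark.ruby.serializer.compress']  # read back; parsed as a boolean via TYPES
  config.valid!                             # raises Spark::ConfigurationError if, e.g., the serializer is unknown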
@@ -0,0 +1,14 @@
+ module Spark
+   # Common constants for Ruby and Spark
+   module Constant
+     DATA_EOF             = -2
+     WORKER_ERROR         = -1
+     WORKER_DONE          = 0
+     CREATE_WORKER        = 1
+     KILL_WORKER          = 2
+     KILL_WORKER_AND_WAIT = 3
+     SUCCESSFULLY_KILLED  = 4
+     UNSUCCESSFUL_KILLING = 5
+     ACCUMULATOR_ACK      = 6
+   end
+ end
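
These codes are shared between the Ruby workers and the Scala side. A small, hypothetical helper (not part of the diff) shows how a raw code could be mapped back to its symbolic name for logging:

  # Hypothetical logging helper: map a raw protocol code back to its constant name.
  def constant_name(code)
    Spark::Constant.constants.find do |name|
      Spark::Constant.const_get(name) == code
    end
  end

  constant_name(0)    # => :WORKER_DONE
  constant_name(-2)   # => :DATA_EOF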
@@ -0,0 +1,322 @@
+ # Necessary libraries
+ Spark.load_lib
+
+ module Spark
+   ##
+   # Main entry point for Spark functionality. A SparkContext represents the connection to a Spark
+   # cluster, and can be used to create RDDs, accumulators and broadcast variables on that cluster.
+   #
+   class Context
+
+     include Spark::Helper::System
+     include Spark::Helper::Parser
+     include Spark::Helper::Logger
+
+     attr_reader :jcontext, :jaccumulator, :temp_dir
+
+     # Constructor for the Ruby context. The configuration is taken automatically
+     # from Spark. The config is set to defaults if the user starts the context
+     # first.
+     #
+     def initialize
+       Spark.config.valid!
+       @jcontext = JavaSparkContext.new(Spark.config.spark_conf)
+       @jcontext.addJar(Spark.ruby_spark_jar)
+
+       # Does not work on 1.2
+       # ui.attachTab(RubyTab.new(ui, to_java_hash(RbConfig::CONFIG)))
+
+       spark_local_dir = JUtils.getLocalDir(sc.conf)
+       @temp_dir = JUtils.createTempDir(spark_local_dir, 'ruby').getAbsolutePath
+
+       accum_server = Spark::Accumulator::Server
+       accum_server.start
+       @jaccumulator = @jcontext.accumulator(ArrayList.new, RubyAccumulatorParam.new(accum_server.host, accum_server.port))
+
+       log_info("Ruby accumulator server is running on port #{accum_server.port}")
+
+       set_call_site('Ruby') # description of stage
+     end
+
+     def stop
+       Spark::Accumulator::Server.stop
+       log_info('Ruby accumulator server was stopped')
+       @jcontext.stop
+     end
+
+     def sc
+       @jcontext.sc
+     end
+
+     def ui
+       sc.ui
+     end
+
+     # Default level of parallelism to use when not given by the user (e.g. parallelize and makeRDD)
+     #
+     def default_parallelism
+       sc.defaultParallelism
+     end
+
+     # Default serializer
+     #
+     # Batch -> Compress -> Basic
+     #
+     def default_serializer
+       # Basic
+       serializer = Spark::Serializer.find!(config('spark.ruby.serializer')).new
+
+       # Compress
+       if config('spark.ruby.serializer.compress')
+         serializer = Spark::Serializer.compressed(serializer)
+       end
+
+       # Batching
+       batch_size = default_batch_size
+       if batch_size == 'auto'
+         serializer = Spark::Serializer.auto_batched(serializer)
+       else
+         serializer = Spark::Serializer.batched(serializer, batch_size)
+       end
+
+       # Finally, the "container" wraps the serializers
+       serializer
+     end
+
+     def default_batch_size
+       size = config('spark.ruby.serializer.batch_size').to_i
+       if size >= 1
+         size
+       else
+         'auto'
+       end
+     end
+
+     # Set a local property that affects jobs submitted from this thread, such as the
+     # Spark fair scheduler pool.
+     #
+     def set_local_property(key, value)
+       jcontext.setLocalProperty(key, value)
+     end
+
+     # Get a local property set in this thread, or null if it is missing
+     #
+     def get_local_property(key)
+       jcontext.getLocalProperty(key)
+     end
+
+     # Support function for API backtraces.
+     #
+     def set_call_site(site)
+       set_local_property('externalCallSite', site)
+     end
+
+     # Capture the current user callsite and return a formatted version for printing. If the user
+     # has overridden the call site, this will return the user's version.
+     #
+     def get_call_site
+       jcontext.getCallSite
+     end
+
+     # Return a copy of this SparkContext's configuration. The configuration *cannot*
+     # be changed at runtime.
+     #
+     def config(key=nil)
+       if key
+         Spark.config.get(key)
+       else
+         Spark.config
+       end
+     end
+
+     # Add a file to be downloaded with this Spark job on every node.
+     # The path passed can be either a local file, a file in HDFS
+     # (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI.
+     #
+     # To access the file in Spark jobs, use `SparkFiles.get(file_name)` with the
+     # filename to find its download location.
+     #
+     # == Example:
+     #   `echo 10 > test.txt`
+     #
+     #   $sc.add_file('test.txt')
+     #   $sc.parallelize(0..5).map(lambda{|x| x * SparkFiles.get_content('test.txt').to_i}).collect
+     #   # => [0, 10, 20, 30, 40, 50]
+     #
+     def add_file(*files)
+       files.each do |file|
+         sc.addFile(file)
+       end
+     end
+
+     # Broadcast a read-only variable to the cluster, returning a Spark::Broadcast
+     # object for reading it in distributed functions. The variable will
+     # be sent to each cluster only once.
+     #
+     # == Example:
+     #   broadcast1 = $sc.broadcast('a')
+     #   broadcast2 = $sc.broadcast('b')
+     #
+     #   rdd = $sc.parallelize(0..5, 4)
+     #   rdd = rdd.bind(broadcast1: broadcast1, broadcast2: broadcast2)
+     #   rdd = rdd.map_partitions_with_index(lambda{|part, index| [broadcast1.value * index, broadcast2.value * index] })
+     #   rdd.collect
+     #   # => ["", "", "a", "b", "aa", "bb", "aaa", "bbb"]
+     #
+     def broadcast(value)
+       Spark::Broadcast.new(self, value)
+     end
+
+     # Create an Accumulator with the given initial value, using a given
+     # accum_param helper object to define how to add values of the
+     # data type if provided.
+     #
+     # == Example:
+     #   accum = $sc.accumulator(7)
+     #
+     #   rdd = $sc.parallelize(0..5, 4)
+     #   rdd = rdd.bind(accum: accum)
+     #   rdd = rdd.map_partitions(lambda{|_| accum.add(1) })
+     #   rdd = rdd.collect
+     #
+     #   accum.value
+     #   # => 11
+     #
+     def accumulator(value, accum_param=:+, zero_value=0)
+       Spark::Accumulator.new(value, accum_param, zero_value)
+     end
+
+     # Distribute a local Ruby collection to form an RDD.
+     # The direct method can be slow, so be careful; this method updates the data in place.
+     #
+     # == Parameters:
+     # data:: Range or Array
+     # num_slices:: number of slices
+     # serializer:: custom serializer (default: serializer based on configuration)
+     #
+     # == Examples:
+     #   $sc.parallelize(["1", "2", "3"]).map(lambda{|x| x.to_i}).collect
+     #   #=> [1, 2, 3]
+     #
+     #   $sc.parallelize(1..3).map(:to_s).collect
+     #   #=> ["1", "2", "3"]
+     #
+     def parallelize(data, num_slices=nil, serializer=nil)
+       num_slices ||= default_parallelism
+       serializer ||= default_serializer
+
+       serializer.check_each(data)
+
+       # Through file
+       file = Tempfile.new('to_parallelize', temp_dir)
+       serializer.dump_to_io(data, file)
+       file.close # not unlink
+       jrdd = RubyRDD.readRDDFromFile(jcontext, file.path, num_slices)
+
+       Spark::RDD.new(jrdd, self, serializer)
+     ensure
+       file && file.unlink
+     end
+
+     # Read a text file from HDFS, a local file system (available on all nodes), or any
+     # Hadoop-supported file system URI, and return it as an RDD of Strings.
+     #
+     # == Example:
+     #   f = Tempfile.new("test")
+     #   f.puts("1")
+     #   f.puts("2")
+     #   f.close
+     #
+     #   $sc.text_file(f.path).map(lambda{|x| x.to_i}).collect
+     #   # => [1, 2]
+     #
+     def text_file(path, min_partitions=nil, encoding=Encoding::UTF_8, serializer=nil)
+       min_partitions ||= default_parallelism
+       serializer     ||= default_serializer
+       deserializer = Spark::Serializer.build { __text__(encoding) }
+
+       Spark::RDD.new(@jcontext.textFile(path, min_partitions), self, serializer, deserializer)
+     end
+
+     # Read a directory of text files from HDFS, a local file system (available on all nodes), or any
+     # Hadoop-supported file system URI. Each file is read as a single record and returned as a
+     # key-value pair, where the key is the path of each file and the value is the content of each file.
+     #
+     # == Example:
+     #   dir = Dir.mktmpdir
+     #   f1 = Tempfile.new("test1", dir)
+     #   f2 = Tempfile.new("test2", dir)
+     #   f1.puts("1"); f1.puts("2");
+     #   f2.puts("3"); f2.puts("4");
+     #   f1.close
+     #   f2.close
+     #
+     #   $sc.whole_text_files(dir).flat_map(lambda{|key, value| value.split}).collect
+     #   # => ["1", "2", "3", "4"]
+     #
+     def whole_text_files(path, min_partitions=nil, serializer=nil)
+       min_partitions ||= default_parallelism
+       serializer     ||= default_serializer
+       deserializer = Spark::Serializer.build{ __pair__(__text__, __text__) }
+
+       Spark::RDD.new(@jcontext.wholeTextFiles(path, min_partitions), self, serializer, deserializer)
+     end
+
+     # Executes the given partition function f on the specified set of partitions,
+     # returning the result as an array of elements.
+     #
+     # If partitions is not specified, this will run over all partitions.
+     #
+     # == Example:
+     #   rdd = $sc.parallelize(0..10, 5)
+     #   $sc.run_job(rdd, lambda{|x| x.to_s}, [0,2])
+     #   # => ["[0, 1]", "[4, 5]"]
+     #
+     def run_job(rdd, f, partitions=nil, allow_local=false)
+       run_job_with_command(rdd, partitions, allow_local, Spark::Command::MapPartitions, f)
+     end
+
+     # Execute the given command on a specific set of partitions.
+     #
+     def run_job_with_command(rdd, partitions, allow_local, command, *args)
+       if !partitions.nil? && !partitions.is_a?(Array)
+         raise Spark::ContextError, 'Partitions must be nil or Array'
+       end
+
+       partitions_size = rdd.partitions_size
+
+       # Execute all parts
+       if partitions.nil?
+         partitions = (0...partitions_size).to_a
+       end
+
+       # Can happen when you use coalesce
+       partitions.delete_if {|part| part >= partitions_size}
+
+       # Rjb represents Fixnum as Integer but JRuby as Long
+       partitions = to_java_array_list(convert_to_java_int(partitions))
+
+       # File for the result
+       file = Tempfile.new('collect', temp_dir)
+
+       mapped = rdd.new_rdd_from_command(command, *args)
+       RubyRDD.runJob(rdd.context.sc, mapped.jrdd, partitions, allow_local, file.path)
+
+       mapped.collect_from_file(file)
+     end
+
+
+     # Aliases
+     alias_method :textFile, :text_file
+     alias_method :wholeTextFiles, :whole_text_files
+     alias_method :defaultParallelism, :default_parallelism
+     alias_method :setLocalProperty, :set_local_property
+     alias_method :getLocalProperty, :get_local_property
+     alias_method :setCallSite, :set_call_site
+     alias_method :getCallSite, :get_call_site
+     alias_method :runJob, :run_job
+     alias_method :runJobWithCommand, :run_job_with_command
+     alias_method :addFile, :add_file
+
+   end
+ end
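
Taken together, a minimal end-to-end sketch mirroring the inline examples above; `$sc` stands for a running Spark::Context (e.g. `Spark.context` once the context has started), and the collections below are illustrative:

  $sc = Spark.context

  # Distribute a local collection and transform it.
  rdd = $sc.parallelize(1..10, 2)
  rdd.map(lambda{|x| x * x}).collect
  # => [1, 4, 9, 16, 25, 36, 49, 64, 81, 100]

  # Run a function over selected partitions only; each partition arrives as an array.
  $sc.run_job(rdd, lambda{|part| part.reduce(:+)}, [0])
  # => [15]  (sum of the first partition when 1..10 splits evenly in two)

  # Ship a read-only value to every worker and use it inside a bound lambda.
  lookup = $sc.broadcast({'a' => 1})
  words  = $sc.parallelize(['a', 'a', 'b'], 2).bind(lookup: lookup)
  words.map(lambda{|x| lookup.value.fetch(x, 0)}).collect
  # => [1, 1, 0]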