ruby-spark 1.1.0.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +37 -0
  3. data/Gemfile +47 -0
  4. data/Guardfile +5 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +252 -0
  7. data/Rakefile +35 -0
  8. data/TODO.md +6 -0
  9. data/benchmark/aggregate.rb +33 -0
  10. data/benchmark/bisect.rb +88 -0
  11. data/benchmark/comparison/prepare.sh +18 -0
  12. data/benchmark/comparison/python.py +156 -0
  13. data/benchmark/comparison/r.r +69 -0
  14. data/benchmark/comparison/ruby.rb +167 -0
  15. data/benchmark/comparison/run-all.sh +160 -0
  16. data/benchmark/comparison/scala.scala +181 -0
  17. data/benchmark/custom_marshal.rb +94 -0
  18. data/benchmark/digest.rb +150 -0
  19. data/benchmark/enumerator.rb +88 -0
  20. data/benchmark/serializer.rb +82 -0
  21. data/benchmark/sort.rb +43 -0
  22. data/benchmark/sort2.rb +164 -0
  23. data/benchmark/take.rb +28 -0
  24. data/bin/ruby-spark +8 -0
  25. data/example/pi.rb +28 -0
  26. data/example/website_search.rb +83 -0
  27. data/ext/ruby_c/extconf.rb +3 -0
  28. data/ext/ruby_c/murmur.c +158 -0
  29. data/ext/ruby_c/murmur.h +9 -0
  30. data/ext/ruby_c/ruby-spark.c +18 -0
  31. data/ext/ruby_java/Digest.java +36 -0
  32. data/ext/ruby_java/Murmur2.java +98 -0
  33. data/ext/ruby_java/RubySparkExtService.java +28 -0
  34. data/ext/ruby_java/extconf.rb +3 -0
  35. data/ext/spark/build.sbt +73 -0
  36. data/ext/spark/project/plugins.sbt +9 -0
  37. data/ext/spark/sbt/sbt +34 -0
  38. data/ext/spark/src/main/scala/Exec.scala +91 -0
  39. data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
  40. data/ext/spark/src/main/scala/Marshal.scala +52 -0
  41. data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
  42. data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
  43. data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
  44. data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
  45. data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
  46. data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
  47. data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
  48. data/ext/spark/src/main/scala/RubyPage.scala +34 -0
  49. data/ext/spark/src/main/scala/RubyRDD.scala +392 -0
  50. data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
  51. data/ext/spark/src/main/scala/RubyTab.scala +11 -0
  52. data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
  53. data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
  54. data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
  55. data/lib/ruby-spark.rb +1 -0
  56. data/lib/spark.rb +198 -0
  57. data/lib/spark/accumulator.rb +260 -0
  58. data/lib/spark/broadcast.rb +98 -0
  59. data/lib/spark/build.rb +43 -0
  60. data/lib/spark/cli.rb +169 -0
  61. data/lib/spark/command.rb +86 -0
  62. data/lib/spark/command/base.rb +158 -0
  63. data/lib/spark/command/basic.rb +345 -0
  64. data/lib/spark/command/pair.rb +124 -0
  65. data/lib/spark/command/sort.rb +51 -0
  66. data/lib/spark/command/statistic.rb +144 -0
  67. data/lib/spark/command_builder.rb +141 -0
  68. data/lib/spark/command_validator.rb +34 -0
  69. data/lib/spark/config.rb +238 -0
  70. data/lib/spark/constant.rb +14 -0
  71. data/lib/spark/context.rb +322 -0
  72. data/lib/spark/error.rb +50 -0
  73. data/lib/spark/ext/hash.rb +41 -0
  74. data/lib/spark/ext/integer.rb +25 -0
  75. data/lib/spark/ext/io.rb +67 -0
  76. data/lib/spark/ext/ip_socket.rb +29 -0
  77. data/lib/spark/ext/module.rb +58 -0
  78. data/lib/spark/ext/object.rb +24 -0
  79. data/lib/spark/ext/string.rb +24 -0
  80. data/lib/spark/helper.rb +10 -0
  81. data/lib/spark/helper/logger.rb +40 -0
  82. data/lib/spark/helper/parser.rb +85 -0
  83. data/lib/spark/helper/serialize.rb +71 -0
  84. data/lib/spark/helper/statistic.rb +93 -0
  85. data/lib/spark/helper/system.rb +42 -0
  86. data/lib/spark/java_bridge.rb +19 -0
  87. data/lib/spark/java_bridge/base.rb +203 -0
  88. data/lib/spark/java_bridge/jruby.rb +23 -0
  89. data/lib/spark/java_bridge/rjb.rb +41 -0
  90. data/lib/spark/logger.rb +76 -0
  91. data/lib/spark/mllib.rb +100 -0
  92. data/lib/spark/mllib/classification/common.rb +31 -0
  93. data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
  94. data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
  95. data/lib/spark/mllib/classification/svm.rb +135 -0
  96. data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
  97. data/lib/spark/mllib/clustering/kmeans.rb +118 -0
  98. data/lib/spark/mllib/matrix.rb +120 -0
  99. data/lib/spark/mllib/regression/common.rb +73 -0
  100. data/lib/spark/mllib/regression/labeled_point.rb +41 -0
  101. data/lib/spark/mllib/regression/lasso.rb +100 -0
  102. data/lib/spark/mllib/regression/linear.rb +124 -0
  103. data/lib/spark/mllib/regression/ridge.rb +97 -0
  104. data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
  105. data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
  106. data/lib/spark/mllib/stat/distribution.rb +12 -0
  107. data/lib/spark/mllib/vector.rb +185 -0
  108. data/lib/spark/rdd.rb +1377 -0
  109. data/lib/spark/sampler.rb +92 -0
  110. data/lib/spark/serializer.rb +79 -0
  111. data/lib/spark/serializer/auto_batched.rb +59 -0
  112. data/lib/spark/serializer/base.rb +63 -0
  113. data/lib/spark/serializer/batched.rb +84 -0
  114. data/lib/spark/serializer/cartesian.rb +13 -0
  115. data/lib/spark/serializer/compressed.rb +27 -0
  116. data/lib/spark/serializer/marshal.rb +17 -0
  117. data/lib/spark/serializer/message_pack.rb +23 -0
  118. data/lib/spark/serializer/oj.rb +23 -0
  119. data/lib/spark/serializer/pair.rb +41 -0
  120. data/lib/spark/serializer/text.rb +25 -0
  121. data/lib/spark/sort.rb +189 -0
  122. data/lib/spark/stat_counter.rb +125 -0
  123. data/lib/spark/storage_level.rb +39 -0
  124. data/lib/spark/version.rb +3 -0
  125. data/lib/spark/worker/master.rb +144 -0
  126. data/lib/spark/worker/spark_files.rb +15 -0
  127. data/lib/spark/worker/worker.rb +200 -0
  128. data/ruby-spark.gemspec +47 -0
  129. data/spec/generator.rb +37 -0
  130. data/spec/inputs/lorem_300.txt +316 -0
  131. data/spec/inputs/numbers/1.txt +50 -0
  132. data/spec/inputs/numbers/10.txt +50 -0
  133. data/spec/inputs/numbers/11.txt +50 -0
  134. data/spec/inputs/numbers/12.txt +50 -0
  135. data/spec/inputs/numbers/13.txt +50 -0
  136. data/spec/inputs/numbers/14.txt +50 -0
  137. data/spec/inputs/numbers/15.txt +50 -0
  138. data/spec/inputs/numbers/16.txt +50 -0
  139. data/spec/inputs/numbers/17.txt +50 -0
  140. data/spec/inputs/numbers/18.txt +50 -0
  141. data/spec/inputs/numbers/19.txt +50 -0
  142. data/spec/inputs/numbers/2.txt +50 -0
  143. data/spec/inputs/numbers/20.txt +50 -0
  144. data/spec/inputs/numbers/3.txt +50 -0
  145. data/spec/inputs/numbers/4.txt +50 -0
  146. data/spec/inputs/numbers/5.txt +50 -0
  147. data/spec/inputs/numbers/6.txt +50 -0
  148. data/spec/inputs/numbers/7.txt +50 -0
  149. data/spec/inputs/numbers/8.txt +50 -0
  150. data/spec/inputs/numbers/9.txt +50 -0
  151. data/spec/inputs/numbers_0_100.txt +101 -0
  152. data/spec/inputs/numbers_1_100.txt +100 -0
  153. data/spec/lib/collect_spec.rb +42 -0
  154. data/spec/lib/command_spec.rb +68 -0
  155. data/spec/lib/config_spec.rb +64 -0
  156. data/spec/lib/context_spec.rb +165 -0
  157. data/spec/lib/ext_spec.rb +72 -0
  158. data/spec/lib/external_apps_spec.rb +45 -0
  159. data/spec/lib/filter_spec.rb +80 -0
  160. data/spec/lib/flat_map_spec.rb +100 -0
  161. data/spec/lib/group_spec.rb +109 -0
  162. data/spec/lib/helper_spec.rb +19 -0
  163. data/spec/lib/key_spec.rb +41 -0
  164. data/spec/lib/manipulation_spec.rb +122 -0
  165. data/spec/lib/map_partitions_spec.rb +87 -0
  166. data/spec/lib/map_spec.rb +91 -0
  167. data/spec/lib/mllib/classification_spec.rb +54 -0
  168. data/spec/lib/mllib/clustering_spec.rb +35 -0
  169. data/spec/lib/mllib/matrix_spec.rb +32 -0
  170. data/spec/lib/mllib/regression_spec.rb +116 -0
  171. data/spec/lib/mllib/vector_spec.rb +77 -0
  172. data/spec/lib/reduce_by_key_spec.rb +118 -0
  173. data/spec/lib/reduce_spec.rb +131 -0
  174. data/spec/lib/sample_spec.rb +46 -0
  175. data/spec/lib/serializer_spec.rb +88 -0
  176. data/spec/lib/sort_spec.rb +58 -0
  177. data/spec/lib/statistic_spec.rb +170 -0
  178. data/spec/lib/whole_text_files_spec.rb +33 -0
  179. data/spec/spec_helper.rb +38 -0
  180. metadata +389 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0c7cdadb3ef29b9ff9c4a4d24c545be97937e6d0
4
+ data.tar.gz: fd949d2b46717f81e3fefe6ae00f3cc5ca741c32
5
+ SHA512:
6
+ metadata.gz: 80595e1ff9ae32831f2bf501c78150796e037460d70b7476cc7beab9c73035324fcabc6e4ce143eca919af9f71ec6850f46b399a11c8453fc411b0ca249a7a79
7
+ data.tar.gz: 112f079c1b024df1b2b35e008945109afb35553373998ce49496f7fa9361d0bc89fcd78bed10d10734fb4842dc9f35024a8c86ee4c41c33fcbe626b0c93bb1dc
@@ -0,0 +1,37 @@
1
+ /.gemtags
2
+ /.tags
3
+ /java/spark.jar
4
+ .jbundler
5
+ target/*
6
+ *.class
7
+ *.jar
8
+ pom.xml
9
+ vendor/*
10
+ *.gem
11
+ *.rbc
12
+ .bundle
13
+ .config
14
+ .yardoc
15
+ Gemfile.lock
16
+ InstalledFiles
17
+ _yardoc
18
+ coverage
19
+ doc/
20
+ lib/bundler/man
21
+ pkg
22
+ rdoc
23
+ spec/reports
24
+ test/tmp
25
+ test/version_tmp
26
+ tmp
27
+ *.bundle
28
+ *.so
29
+ *.o
30
+ *.a
31
+ mkmf.log
32
+ ext/spark/target/*
33
+ ext/spark/project/target/*
34
+ ext/spark/project/project/target/*
35
+ wiki
36
+ /benchmark/performance/spark/*
37
+ /benchmark/performance/rspark/*
data/Gemfile ADDED
@@ -0,0 +1,47 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
4
+
5
+ gem 'sourcify', '0.6.0.rc4'
6
+ gem 'method_source'
7
+ gem 'commander'
8
+ gem 'pry'
9
+ gem 'nio4r'
10
+ gem 'distribution'
11
+
12
+ platform :mri do
13
+ gem 'rjb'
14
+ gem 'msgpack'
15
+ gem 'oj'
16
+ gem 'narray'
17
+ end
18
+
19
+ platform :jruby do
20
+ gem 'msgpack-jruby', require: 'msgpack'
21
+
22
+ # NameError: no constructorfor arguments (org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.jruby.RubyFixnum,org.joda.time.chrono.GJChronology) on Java::OrgJodaTime::DateTime
23
+ # gem 'mdarray'
24
+ end
25
+
26
+ group :stats do
27
+ # gem 'nmatrix'
28
+ # gem 'statsample'
29
+ # gem 'statsample-glm'
30
+ # gem 'statsample-timeseries'
31
+ # gem 'statistics2'
32
+ # gem 'statsample-optimization' # libgsl0-dev
33
+ # gem 'narray'
34
+ # gem 'gsl-nmatrix'
35
+ end
36
+
37
+ group :development do
38
+ gem 'benchmark-ips'
39
+ gem 'rspec'
40
+ gem 'rake-compiler'
41
+ gem 'guard'
42
+ gem 'guard-rspec'
43
+ end
44
+
45
+ group :test do
46
+ gem 'simplecov', require: false
47
+ end
@@ -0,0 +1,5 @@
1
+ guard :rspec, cmd: 'rspec' do
2
+ watch(%r{^spec/.+_spec\.rb$})
3
+ watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
4
+ watch('spec/spec_helper.rb') { "spec" }
5
+ end
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Ondřej Moravčík
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,252 @@
1
+ # Ruby-Spark
2
+
3
+ Apache Spark™ is a fast and general engine for large-scale data processing.
4
+
5
+ This gem allows you to use Spark functionality in Ruby.
6
+
7
+ > Word count in Spark's Ruby API
8
+
9
+ ```ruby
10
+ file = spark.text_file("hdfs://...")
11
+
12
+ file.flat_map(:split)
13
+ .map(lambda{|word| [word, 1]})
14
+ .reduce_by_key(lambda{|a, b| a+b})
15
+ ```
16
+
17
+ - [Apache Spark](http://spark.apache.org)
18
+ - [Wiki](https://github.com/ondra-m/ruby-spark/wiki)
19
+ - [Ruby-doc](http://www.rubydoc.info/github/ondra-m/ruby-spark)
20
+
21
+ ## Installation
22
+
23
+ ### Requirements
24
+
25
+ - Java 7+
26
+ - Ruby 2+
27
+ - MRI or JRuby
28
+
29
+ Add this line to your application's Gemfile:
30
+
31
+ ```ruby
32
+ gem 'ruby-spark'
33
+ ```
34
+
35
+ And then execute:
36
+
37
+ ```
38
+ $ bundle
39
+ ```
40
+
41
+ Or install it yourself as:
42
+
43
+ ```
44
+ $ gem install ruby-spark
45
+ ```
46
+
47
+ Run `rake compile` if you are using gem from local filesystem.
48
+
49
+ ### Build Apache Spark
50
+
51
+ This command will download Spark and build extensions for this gem ([SBT](ext/spark/build.sbt) is used for compiling). For more information, check the [wiki](https://github.com/ondra-m/ruby-spark/wiki/Installation). Everything is stored by default at [GEM_ROOT]/target.
52
+
53
+ ```
54
+ $ ruby-spark build
55
+ ```
56
+
57
+ ## Usage
58
+
59
+ You can use Ruby Spark via interactive shell (Pry is used)
60
+
61
+ ```
62
+ $ ruby-spark shell
63
+ ```
64
+
65
+ Or on existing project
66
+
67
+ ```ruby
68
+ require 'ruby-spark'
69
+
70
+ # Create a SparkContext
71
+ Spark.start
72
+
73
+ # Context reference
74
+ Spark.sc
75
+ ```
76
+
77
+ If you want to configure Spark first, see [configuration](https://github.com/ondra-m/ruby-spark/wiki/Configuration) for more details.
78
+
79
+ ```ruby
80
+ require 'ruby-spark'
81
+
82
+ # Use if you have custom SPARK_HOME
83
+ Spark.load_lib(spark_home)
84
+
85
+ # Configuration
86
+ Spark.config do
87
+ set_app_name "RubySpark"
88
+ set 'spark.ruby.serializer', 'oj'
89
+ set 'spark.ruby.serializer.batch_size', 100
90
+ end
91
+
92
+ # Start Apache Spark
93
+ Spark.start
94
+ ```
95
+
96
+ Finally, stop the cluster. In the shell, Spark is stopped automatically when you exit.
97
+
98
+ ```ruby
99
+ Spark.stop
100
+ ```
101
+
102
+
103
+
104
+ ## Creating RDD (upload data)
105
+
106
+ Single text file:
107
+
108
+ ```ruby
109
+ rdd = sc.text_file(FILE, workers_num, serializer=nil)
110
+ ```
111
+
112
+ All files on directory:
113
+
114
+ ```ruby
115
+ rdd = sc.whole_text_files(DIRECTORY, workers_num, serializer=nil)
116
+ ```
117
+
118
+ Directly upload structures from Ruby (the chosen serializer must be able to serialize them):
119
+
120
+ ```ruby
121
+ rdd = sc.parallelize([1,2,3,4,5], workers_num, serializer=nil)
122
+ rdd = sc.parallelize(1..5, workers_num, serializer=nil)
123
+ ```
124
+
125
+ ### Options
126
+
127
+ <dl>
128
+ <dt>workers_num</dt>
129
+ <dd>
130
+ Minimum number of workers computing this task.<br>
131
+ <i>(This value can be overwritten by Spark)</i>
132
+ </dd>
133
+
134
+ <dt>serializer</dt>
135
+ <dd>
136
+ Custom serializer.<br>
137
+ <i>(default: by <b>spark.ruby.serializer</b> options)</i>
138
+ </dd>
139
+ </dl>
140
+
141
+ ## Operations
142
+
143
+ All operations can be divided into 2 groups:
144
+
145
+ - **Transformations:** append new operation to current RDD and return new
146
+ - **Actions:** add operation and start calculations
147
+
148
+ See [wiki page](https://github.com/ondra-m/ruby-spark/wiki/RDD) or [Ruby-doc](http://www.rubydoc.info/github/ondra-m/ruby-spark/master/Spark/RDD) for more details.
149
+
150
+ #### Transformations
151
+
152
+ ```ruby
153
+ rdd.map(lambda{|item| ...})
154
+ rdd.flat_map(lambda{|item| ...})
155
+ rdd.filter(lambda{|item| ...})
156
+ rdd.union(rdd)
157
+ rdd.map_partitions(lambda{|iterator| ...})
158
+ # ...
159
+ ```
160
+
161
+ #### Actions
162
+
163
+ ```ruby
164
+ rdd.count
165
+ rdd.take(n)
166
+ rdd.collect
167
+ # ...
168
+ ```
169
+
170
+
171
+ ## Examples
172
+
173
+ Sum of numbers
174
+
175
+ ```ruby
176
+ sc.parallelize(0..10).sum
177
+ # => 55
178
+ ```
179
+
180
+ Words count using methods
181
+
182
+ ```ruby
183
+ # Content:
184
+ # "first line"
185
+ # "second line"
186
+ rdd = sc.text_file(PATH)
187
+
188
+ # ["first", "line", "second", "line"]
189
+ rdd = rdd.flat_map(lambda{|line| line.split})
190
+
191
+ # [["first", 1], ["line", 1], ["second", 1], ["line", 1]]
192
+ rdd = rdd.map(lambda{|word| [word, 1]})
193
+
194
+ # [["first", 1], ["line", 2], ["second", 1]]
195
+ rdd = rdd.reduce_by_key(lambda{|a, b| a+b})
196
+
197
+ # {"first"=>1, "line"=>2, "second"=>1}
198
+ rdd.collect_as_hash
199
+ ```
200
+
201
+ Estimating PI with a custom serializer
202
+
203
+ ```ruby
204
+ slices = 3
205
+ n = 100000 * slices
206
+
207
+ def map(_)
208
+ x = rand * 2 - 1
209
+ y = rand * 2 - 1
210
+
211
+ if x**2 + y**2 < 1
212
+ return 1
213
+ else
214
+ return 0
215
+ end
216
+ end
217
+
218
+ rdd = Spark.context.parallelize(1..n, slices, serializer: 'oj')
219
+ rdd = rdd.map(method(:map))
220
+
221
+ puts 'Pi is roughly %f' % (4.0 * rdd.sum / n)
222
+ ```
223
+
224
+ Estimating PI
225
+
226
+ ```ruby
227
+ rdd = sc.parallelize([10_000], 1)
228
+ rdd = rdd.add_library('bigdecimal/math')
229
+ rdd = rdd.map(lambda{|x| BigMath.PI(x)})
230
+ rdd.collect # => #<BigDecimal, '0.31415926...'>
231
+ ```
232
+
233
+ Linear regression
234
+
235
+ ```ruby
236
+ # Import Mllib classes into Object
237
+ # Otherwise are accessible via Spark::Mllib::LinearRegressionWithSGD
238
+ Spark::Mllib.import(Object)
239
+
240
+ # Training data
241
+ data = [
242
+ LabeledPoint.new(0.0, [0.0]),
243
+ LabeledPoint.new(1.0, [1.0]),
244
+ LabeledPoint.new(3.0, [2.0]),
245
+ LabeledPoint.new(2.0, [3.0])
246
+ ]
247
+
248
+ # Train a model
249
+ lrm = LinearRegressionWithSGD.train(sc.parallelize(data), initial_weights: [1.0])
250
+
251
+ lrm.predict([0.0])
252
+ ```
@@ -0,0 +1,35 @@
1
+ #-*- mode: ruby -*-
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
+ RSpec::Core::RakeTask.new
7
+
8
+ task default: :spec
9
+ task test: :spec
10
+
11
+ def java?
12
+ RUBY_PLATFORM =~ /java/
13
+ end
14
+
15
+ if java?
16
+ require "rake/javaextensiontask"
17
+ Rake::JavaExtensionTask.new("ruby_java") do |ext|
18
+ ext.name = "ruby_spark_ext"
19
+ end
20
+ else
21
+ require "rake/extensiontask"
22
+ Rake::ExtensionTask.new("ruby_c") do |ext|
23
+ ext.name = "ruby_spark_ext"
24
+ end
25
+ end
26
+
27
+
28
+ task :clean do
29
+ Dir['lib/*.{jar,o,so}'].each do |path|
30
+ puts "Deleting #{path} ..."
31
+ File.delete(path)
32
+ end
33
+ FileUtils.rm_rf('./pkg')
34
+ FileUtils.rm_rf('./tmp')
35
+ end
data/TODO.md ADDED
@@ -0,0 +1,6 @@
1
+ - refactor JavaBridge: to_java, from_java
2
+ - add Streaming
3
+ - add SQL
4
+ - worker informations (time, memory, ...)
5
+ - killing zombie workers
6
+ - global config to ~/.ruby-spark.conf (e.g. target folder for spark)
@@ -0,0 +1,33 @@
1
+ require 'benchmark'
2
+ require 'benchmark/ips'
3
+
4
+ data = 0..1_000_000
5
+ zero_value = rand(100_000)
6
+ function = Proc.new{|sum, n| sum+n}
7
+
8
+ Benchmark.ips do |r|
9
+ r.report('each') do
10
+ sum = zero_value
11
+ data.each do |n|
12
+ sum += n
13
+ end
14
+ end
15
+
16
+ r.report('reduce') do
17
+ data.reduce(zero_value){|sum, n| sum+n}
18
+ end
19
+
20
+ r.report('each with function') do
21
+ sum = zero_value
22
+ data.each do |n|
23
+ sum = function.call(sum, n)
24
+ end
25
+ end
26
+
27
+ r.report('reduce with function') do
28
+ data.reduce(zero_value, &function)
29
+ end
30
+
31
+ r.compare!
32
+ end
33
+