ruby-spark 1.2.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/TODO.md +1 -0
- data/ext/spark/build.sbt +1 -1
- data/ext/spark/sbt/sbt +7 -7
- data/ext/spark/src/main/scala/RubyMLLibAPI.scala +2 -2
- data/lib/spark/build.rb +1 -1
- data/lib/spark/mllib/classification/naive_bayes.rb +1 -1
- data/lib/spark/mllib/clustering/gaussian_mixture.rb +1 -1
- data/lib/spark/rdd.rb +22 -0
- data/lib/spark/version.rb +1 -1
- data/spec/lib/key_spec.rb +17 -0
- data/spec/lib/mllib/regression_spec.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c7435669c50b38e45f97113e7a67aa991edfaa46
+  data.tar.gz: 95bb22186a07f47f40915edcded518d081f5f8f7
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 27fd1ff26ed3478595f6b5ef2d48cb74da73c3a4f68df29ad4745f47621df8b617b21cc006ba8b2d5169d680e10a59498956a3a06be92437949e2faac4ccd1f7
+  data.tar.gz: d0d465c5e8f86ab3c8987ef6732446f10886354b8f3e7db281cb3a18eb64db362de89164680f288a015f28e242553fab3362fa35a1462eed2caa8b0322244158
data/CHANGELOG.md
CHANGED
data/TODO.md
CHANGED
data/ext/spark/build.sbt
CHANGED
data/ext/spark/sbt/sbt
CHANGED
@@ -3,9 +3,9 @@
 # This script launches sbt for this project. If present it uses the system
 # version of sbt. If there is no system version of sbt it attempts to download
 # sbt locally.
-SBT_VERSION=0.13.
-URL1=http://
-URL2=http://
+SBT_VERSION=0.13.9
+URL1=http://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
+URL2=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
 JAR=sbt/sbt-launch-${SBT_VERSION}.jar
 
 # Download sbt launch jar if it hasn't been downloaded yet
@@ -13,10 +13,10 @@ if [ ! -f ${JAR} ]; then
   # Download
   printf "Attempting to fetch sbt\n"
   JAR_DL=${JAR}.part
-  if hash
-    (curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR}
-  elif hash wget 2>/dev/null; then
+  if hash wget 2>/dev/null; then
     (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR}
+  elif hash curl 2>/dev/null; then
+    (curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR}
   else
     printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n"
     exit -1
@@ -31,4 +31,4 @@ printf "Launching sbt from ${JAR}\n"
 java \
   -Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \
   -jar ${JAR} \
-  "$@"
+  "$@"
data/ext/spark/src/main/scala/RubyMLLibAPI.scala
CHANGED
@@ -22,10 +22,10 @@ class RubyMLLibAPI extends MLLibAPI {
   // trainLogisticRegressionModelWithLBFGS
   // trainSVMModelWithSGD
   // trainKMeansModel
-  //
+  // trainGaussianMixtureModel
 
   // Rjb have a problem with theta: Array[Array[Double]]
-  override def
+  override def trainNaiveBayesModel(data: JavaRDD[LabeledPoint], lambda: Double) = {
     val model = NaiveBayes.train(data.rdd, lambda)
 
     List(
data/lib/spark/build.rb
CHANGED
data/lib/spark/mllib/classification/naive_bayes.rb
CHANGED
@@ -86,7 +86,7 @@ module Spark
           raise Spark::MllibError, "RDD should contains LabeledPoint, got #{first.class}"
         end
 
-        labels, pi, theta = Spark.jb.call(RubyMLLibAPI.new, '
+        labels, pi, theta = Spark.jb.call(RubyMLLibAPI.new, 'trainNaiveBayesModel', rdd, lambda)
         theta = Spark::Mllib::Matrices.dense(theta.size, theta.first.size, theta)
 
         NaiveBayesModel.new(labels, pi, theta)
data/lib/spark/mllib/clustering/gaussian_mixture.rb
CHANGED
@@ -64,7 +64,7 @@ module Spark
     class GaussianMixture
 
       def self.train(rdd, k, convergence_tol: 0.001, max_iterations: 100, seed: nil)
-        weights, means, sigmas = Spark.jb.call(RubyMLLibAPI.new, '
+        weights, means, sigmas = Spark.jb.call(RubyMLLibAPI.new, 'trainGaussianMixtureModel', rdd,
                                                k, convergence_tol, max_iterations, Spark.jb.to_long(seed))
 
         means.map! {|mu| Spark.jb.java_to_ruby(mu)}
data/lib/spark/rdd.rb
CHANGED
@@ -1241,6 +1241,28 @@ module Spark
       self.map('lambda{|(_, value)| value}')
     end
 
+    # Return the list of values in the RDD for key `key`.
+    # TODO: add Partitioner for efficiently searching
+    #
+    # == Example:
+    #   rdd = $sc.parallelize(0..10)
+    #   rdd = rdd.group_by(lambda {|x| x%3})
+    #   rdd.lookup(2)
+    #   # => [[2, 5, 8]]
+    #
+    #   rdd = $sc.parallelize(0..10)
+    #   rdd = rdd.key_by(lambda{|x| x.even?})
+    #   rdd.lookup(true)
+    #   # => [0, 2, 4, 6, 8, 10]
+    #
+    def lookup(key)
+      lookup_key = "lookup_key_#{object_id}"
+
+      self.filter("lambda{|(key, _)| key == #{lookup_key}}")
+          .bind(lookup_key => key)
+          .values
+          .collect
+    end
 
     # Aliases
     alias_method :partitionsSize, :partitions_size
data/lib/spark/version.rb
CHANGED
data/spec/lib/key_spec.rb
CHANGED
@@ -39,4 +39,21 @@ RSpec.describe 'Spark::RDD' do
     # it_behaves_like 'a keying by', rand(2..10)
   end
 
+  it 'lookup' do
+    numbers = Generator.numbers
+    rdd_numbers = $sc.parallelize(numbers, 2)
+
+    rdd = rdd_numbers.group_by(lambda {|x| x%3})
+    rdd.lookup(2)
+
+    expect(rdd.lookup(2).first).to eq(
+      numbers.group_by{|x| x%3}[2]
+    )
+
+    rdd = rdd_numbers.key_by(lambda{|x| x.even?})
+    expect(rdd.lookup(true)).to eq(
+      numbers.select(&:even?)
+    )
+  end
+
 end
data/spec/lib/mllib/regression_spec.rb
CHANGED
@@ -99,7 +99,7 @@ RSpec.describe 'Spark::Mllib regression' do
 
     expect(lrm.weights[0]).to be_between(1.9, 2.1)
     expect(lrm.weights[1]).to be_between(-1.60, -1.40)
-    expect(lrm.weights[2]).to be_between(-1.0e-
+    expect(lrm.weights[2]).to be_between(-1.0e-2, 1.0e-2)
   end
 end
 
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ruby-spark
 version: !ruby/object:Gem::Version
-  version: 1.2.0
+  version: 1.2.1
 platform: ruby
 authors:
 - Ondřej Moravčík
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-
+date: 2015-11-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rjb
@@ -346,7 +346,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements:
 - java, scala
 rubyforge_project:
-rubygems_version: 2.4.5
+rubygems_version: 2.4.5.1
 signing_key:
 specification_version: 4
 summary: Ruby wrapper for Apache Spark
@@ -402,3 +402,4 @@ test_files:
 - spec/lib/statistic_spec.rb
 - spec/lib/whole_text_files_spec.rb
 - spec/spec_helper.rb
+has_rdoc: