ruby-spark 1.2.0 → 1.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/TODO.md +1 -0
- data/ext/spark/build.sbt +1 -1
- data/ext/spark/sbt/sbt +7 -7
- data/ext/spark/src/main/scala/RubyMLLibAPI.scala +2 -2
- data/lib/spark/build.rb +1 -1
- data/lib/spark/mllib/classification/naive_bayes.rb +1 -1
- data/lib/spark/mllib/clustering/gaussian_mixture.rb +1 -1
- data/lib/spark/rdd.rb +22 -0
- data/lib/spark/version.rb +1 -1
- data/spec/lib/key_spec.rb +17 -0
- data/spec/lib/mllib/regression_spec.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c7435669c50b38e45f97113e7a67aa991edfaa46
|
4
|
+
data.tar.gz: 95bb22186a07f47f40915edcded518d081f5f8f7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 27fd1ff26ed3478595f6b5ef2d48cb74da73c3a4f68df29ad4745f47621df8b617b21cc006ba8b2d5169d680e10a59498956a3a06be92437949e2faac4ccd1f7
|
7
|
+
data.tar.gz: d0d465c5e8f86ab3c8987ef6732446f10886354b8f3e7db281cb3a18eb64db362de89164680f288a015f28e242553fab3362fa35a1462eed2caa8b0322244158
|
data/CHANGELOG.md
CHANGED
data/TODO.md
CHANGED
data/ext/spark/build.sbt
CHANGED
data/ext/spark/sbt/sbt
CHANGED
@@ -3,9 +3,9 @@
|
|
3
3
|
# This script launches sbt for this project. If present it uses the system
|
4
4
|
# version of sbt. If there is no system version of sbt it attempts to download
|
5
5
|
# sbt locally.
|
6
|
-
SBT_VERSION=0.13.
|
7
|
-
URL1=http://
|
8
|
-
URL2=http://
|
6
|
+
SBT_VERSION=0.13.9
|
7
|
+
URL1=http://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
|
8
|
+
URL2=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
|
9
9
|
JAR=sbt/sbt-launch-${SBT_VERSION}.jar
|
10
10
|
|
11
11
|
# Download sbt launch jar if it hasn't been downloaded yet
|
@@ -13,10 +13,10 @@ if [ ! -f ${JAR} ]; then
|
|
13
13
|
# Download
|
14
14
|
printf "Attempting to fetch sbt\n"
|
15
15
|
JAR_DL=${JAR}.part
|
16
|
-
if hash
|
17
|
-
(curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR}
|
18
|
-
elif hash wget 2>/dev/null; then
|
16
|
+
if hash wget 2>/dev/null; then
|
19
17
|
(wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR}
|
18
|
+
elif hash curl 2>/dev/null; then
|
19
|
+
(curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR}
|
20
20
|
else
|
21
21
|
printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n"
|
22
22
|
exit -1
|
@@ -31,4 +31,4 @@ printf "Launching sbt from ${JAR}\n"
|
|
31
31
|
java \
|
32
32
|
-Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \
|
33
33
|
-jar ${JAR} \
|
34
|
-
"$@"
|
34
|
+
"$@"
|
@@ -22,10 +22,10 @@ class RubyMLLibAPI extends MLLibAPI {
|
|
22
22
|
// trainLogisticRegressionModelWithLBFGS
|
23
23
|
// trainSVMModelWithSGD
|
24
24
|
// trainKMeansModel
|
25
|
-
//
|
25
|
+
// trainGaussianMixtureModel
|
26
26
|
|
27
27
|
// Rjb have a problem with theta: Array[Array[Double]]
|
28
|
-
override def
|
28
|
+
override def trainNaiveBayesModel(data: JavaRDD[LabeledPoint], lambda: Double) = {
|
29
29
|
val model = NaiveBayes.train(data.rdd, lambda)
|
30
30
|
|
31
31
|
List(
|
data/lib/spark/build.rb
CHANGED
@@ -86,7 +86,7 @@ module Spark
|
|
86
86
|
raise Spark::MllibError, "RDD should contains LabeledPoint, got #{first.class}"
|
87
87
|
end
|
88
88
|
|
89
|
-
labels, pi, theta = Spark.jb.call(RubyMLLibAPI.new, '
|
89
|
+
labels, pi, theta = Spark.jb.call(RubyMLLibAPI.new, 'trainNaiveBayesModel', rdd, lambda)
|
90
90
|
theta = Spark::Mllib::Matrices.dense(theta.size, theta.first.size, theta)
|
91
91
|
|
92
92
|
NaiveBayesModel.new(labels, pi, theta)
|
@@ -64,7 +64,7 @@ module Spark
|
|
64
64
|
class GaussianMixture
|
65
65
|
|
66
66
|
def self.train(rdd, k, convergence_tol: 0.001, max_iterations: 100, seed: nil)
|
67
|
-
weights, means, sigmas = Spark.jb.call(RubyMLLibAPI.new, '
|
67
|
+
weights, means, sigmas = Spark.jb.call(RubyMLLibAPI.new, 'trainGaussianMixtureModel', rdd,
|
68
68
|
k, convergence_tol, max_iterations, Spark.jb.to_long(seed))
|
69
69
|
|
70
70
|
means.map! {|mu| Spark.jb.java_to_ruby(mu)}
|
data/lib/spark/rdd.rb
CHANGED
@@ -1241,6 +1241,28 @@ module Spark
|
|
1241
1241
|
self.map('lambda{|(_, value)| value}')
|
1242
1242
|
end
|
1243
1243
|
|
1244
|
+
# Return the list of values in the RDD for key `key`.
|
1245
|
+
# TODO: add Partitioner for efficiently searching
|
1246
|
+
#
|
1247
|
+
# == Example:
|
1248
|
+
# rdd = $sc.parallelize(0..10)
|
1249
|
+
# rdd = rdd.group_by(lambda {|x| x%3})
|
1250
|
+
# rdd.lookup(2)
|
1251
|
+
# # => [[2, 5, 8]]
|
1252
|
+
#
|
1253
|
+
# rdd = $sc.parallelize(0..10)
|
1254
|
+
# rdd = rdd.key_by(lambda{|x| x.even?})
|
1255
|
+
# rdd.lookup(true)
|
1256
|
+
# # => [0, 2, 4, 6, 8, 10]
|
1257
|
+
#
|
1258
|
+
def lookup(key)
|
1259
|
+
lookup_key = "lookup_key_#{object_id}"
|
1260
|
+
|
1261
|
+
self.filter("lambda{|(key, _)| key == #{lookup_key}}")
|
1262
|
+
.bind(lookup_key => key)
|
1263
|
+
.values
|
1264
|
+
.collect
|
1265
|
+
end
|
1244
1266
|
|
1245
1267
|
# Aliases
|
1246
1268
|
alias_method :partitionsSize, :partitions_size
|
data/lib/spark/version.rb
CHANGED
data/spec/lib/key_spec.rb
CHANGED
@@ -39,4 +39,21 @@ RSpec.describe 'Spark::RDD' do
|
|
39
39
|
# it_behaves_like 'a keying by', rand(2..10)
|
40
40
|
end
|
41
41
|
|
42
|
+
it 'lookup' do
|
43
|
+
numbers = Generator.numbers
|
44
|
+
rdd_numbers = $sc.parallelize(numbers, 2)
|
45
|
+
|
46
|
+
rdd = rdd_numbers.group_by(lambda {|x| x%3})
|
47
|
+
rdd.lookup(2)
|
48
|
+
|
49
|
+
expect(rdd.lookup(2).first).to eq(
|
50
|
+
numbers.group_by{|x| x%3}[2]
|
51
|
+
)
|
52
|
+
|
53
|
+
rdd = rdd_numbers.key_by(lambda{|x| x.even?})
|
54
|
+
expect(rdd.lookup(true)).to eq(
|
55
|
+
numbers.select(&:even?)
|
56
|
+
)
|
57
|
+
end
|
58
|
+
|
42
59
|
end
|
@@ -99,7 +99,7 @@ RSpec.describe 'Spark::Mllib regression' do
|
|
99
99
|
|
100
100
|
expect(lrm.weights[0]).to be_between(1.9, 2.1)
|
101
101
|
expect(lrm.weights[1]).to be_between(-1.60, -1.40)
|
102
|
-
expect(lrm.weights[2]).to be_between(-1.0e-
|
102
|
+
expect(lrm.weights[2]).to be_between(-1.0e-2, 1.0e-2)
|
103
103
|
end
|
104
104
|
end
|
105
105
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-spark
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.
|
4
|
+
version: 1.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ondřej Moravčík
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-11-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rjb
|
@@ -346,7 +346,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
346
346
|
requirements:
|
347
347
|
- java, scala
|
348
348
|
rubyforge_project:
|
349
|
-
rubygems_version: 2.4.5
|
349
|
+
rubygems_version: 2.4.5.1
|
350
350
|
signing_key:
|
351
351
|
specification_version: 4
|
352
352
|
summary: Ruby wrapper for Apache Spark
|
@@ -402,3 +402,4 @@ test_files:
|
|
402
402
|
- spec/lib/statistic_spec.rb
|
403
403
|
- spec/lib/whole_text_files_spec.rb
|
404
404
|
- spec/spec_helper.rb
|
405
|
+
has_rdoc:
|