ruby-spark 1.2.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cd863f728212557da03e76f6e98eeed05695ea5d
4
- data.tar.gz: 214b2022187727a50badcd1910313550e59aefdf
3
+ metadata.gz: c7435669c50b38e45f97113e7a67aa991edfaa46
4
+ data.tar.gz: 95bb22186a07f47f40915edcded518d081f5f8f7
5
5
  SHA512:
6
- metadata.gz: 23c0c7b6ab63a2f9c191cddc4836c73cde61722b9e6f3c7e25b090afed7cda2eaff0d8718074ae3337ff5c4bd57e1223dab76f6cf7772b4c7dda3e7ed69d98c6
7
- data.tar.gz: 234897b1851614ae1371b3a33417c8d036b00a4551185829b99ef398a110a614ffe1eaeac556c00859a45598bed4219e59eb98ddbffbe0fe2c25c024408b8628
6
+ metadata.gz: 27fd1ff26ed3478595f6b5ef2d48cb74da73c3a4f68df29ad4745f47621df8b617b21cc006ba8b2d5169d680e10a59498956a3a06be92437949e2faac4ccd1f7
7
+ data.tar.gz: d0d465c5e8f86ab3c8987ef6732446f10886354b8f3e7db281cb3a18eb64db362de89164680f288a015f28e242553fab3362fa35a1462eed2caa8b0322244158
@@ -1,3 +1,11 @@
1
+ ## Unreleased
2
+
3
+ ## 1.3.0
4
+
5
+ - new method on RDD (lookup)
6
+ - fix sbt url
7
+ - Spark 1.5.0
8
+
1
9
  ## 1.2.0 (15.06.2015)
2
10
 
3
11
  - target folder is now located at HOME
data/TODO.md CHANGED
@@ -6,3 +6,4 @@
6
6
  - add_rb, add_inline_rb to Spark::{Context, RDD}
7
7
  - fix broadcast for cluster
8
8
  - dump to disk if there is memory limit
9
+ - Add Partitioner to RDD
@@ -4,7 +4,7 @@ assemblySettings
4
4
 
5
5
  // Default values
6
6
  val defaultScalaVersion = "2.10.4"
7
- val defaultSparkVersion = "1.3.0"
7
+ val defaultSparkVersion = "1.5.0"
8
8
  val defaultSparkCoreVersion = "2.10"
9
9
  val defaultTargetDir = "target"
10
10
  val defaultHadoopVersion = "1.0.4"
@@ -3,9 +3,9 @@
3
3
  # This script launches sbt for this project. If present it uses the system
4
4
  # version of sbt. If there is no system version of sbt it attempts to download
5
5
  # sbt locally.
6
- SBT_VERSION=0.13.7
7
- URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
8
- URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
6
+ SBT_VERSION=0.13.9
7
+ URL1=http://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
8
+ URL2=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
9
9
  JAR=sbt/sbt-launch-${SBT_VERSION}.jar
10
10
 
11
11
  # Download sbt launch jar if it hasn't been downloaded yet
@@ -13,10 +13,10 @@ if [ ! -f ${JAR} ]; then
13
13
  # Download
14
14
  printf "Attempting to fetch sbt\n"
15
15
  JAR_DL=${JAR}.part
16
- if hash curl 2>/dev/null; then
17
- (curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR}
18
- elif hash wget 2>/dev/null; then
16
+ if hash wget 2>/dev/null; then
19
17
  (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR}
18
+ elif hash curl 2>/dev/null; then
19
+ (curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR}
20
20
  else
21
21
  printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n"
22
22
  exit -1
@@ -31,4 +31,4 @@ printf "Launching sbt from ${JAR}\n"
31
31
  java \
32
32
  -Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \
33
33
  -jar ${JAR} \
34
- "$@"
34
+ "$@"
@@ -22,10 +22,10 @@ class RubyMLLibAPI extends MLLibAPI {
22
22
  // trainLogisticRegressionModelWithLBFGS
23
23
  // trainSVMModelWithSGD
24
24
  // trainKMeansModel
25
- // trainGaussianMixture
25
+ // trainGaussianMixtureModel
26
26
 
27
27
  // Rjb have a problem with theta: Array[Array[Double]]
28
- override def trainNaiveBayes(data: JavaRDD[LabeledPoint], lambda: Double) = {
28
+ override def trainNaiveBayesModel(data: JavaRDD[LabeledPoint], lambda: Double) = {
29
29
  val model = NaiveBayes.train(data.rdd, lambda)
30
30
 
31
31
  List(
@@ -3,7 +3,7 @@ module Spark
3
3
 
4
4
  DEFAULT_SCALA_VERSION = '2.10.4'
5
5
  DEFAULT_CORE_VERSION = '2.10'
6
- DEFAULT_SPARK_VERSION = '1.4.0'
6
+ DEFAULT_SPARK_VERSION = '1.5.0'
7
7
  DEFAULT_HADOOP_VERSION = '1.0.4'
8
8
 
9
9
  SBT = 'sbt/sbt'
@@ -86,7 +86,7 @@ module Spark
86
86
  raise Spark::MllibError, "RDD should contains LabeledPoint, got #{first.class}"
87
87
  end
88
88
 
89
- labels, pi, theta = Spark.jb.call(RubyMLLibAPI.new, 'trainNaiveBayes', rdd, lambda)
89
+ labels, pi, theta = Spark.jb.call(RubyMLLibAPI.new, 'trainNaiveBayesModel', rdd, lambda)
90
90
  theta = Spark::Mllib::Matrices.dense(theta.size, theta.first.size, theta)
91
91
 
92
92
  NaiveBayesModel.new(labels, pi, theta)
@@ -64,7 +64,7 @@ module Spark
64
64
  class GaussianMixture
65
65
 
66
66
  def self.train(rdd, k, convergence_tol: 0.001, max_iterations: 100, seed: nil)
67
- weights, means, sigmas = Spark.jb.call(RubyMLLibAPI.new, 'trainGaussianMixture', rdd,
67
+ weights, means, sigmas = Spark.jb.call(RubyMLLibAPI.new, 'trainGaussianMixtureModel', rdd,
68
68
  k, convergence_tol, max_iterations, Spark.jb.to_long(seed))
69
69
 
70
70
  means.map! {|mu| Spark.jb.java_to_ruby(mu)}
@@ -1241,6 +1241,28 @@ module Spark
1241
1241
  self.map('lambda{|(_, value)| value}')
1242
1242
  end
1243
1243
 
1244
+ # Return the list of values in the RDD for key `key`.
1245
+ # TODO: add Partitioner for efficiently searching
1246
+ #
1247
+ # == Example:
1248
+ # rdd = $sc.parallelize(0..10)
1249
+ # rdd = rdd.group_by(lambda {|x| x%3})
1250
+ # rdd.lookup(2)
1251
+ # # => [[2, 5, 8]]
1252
+ #
1253
+ # rdd = $sc.parallelize(0..10)
1254
+ # rdd = rdd.key_by(lambda{|x| x.even?})
1255
+ # rdd.lookup(true)
1256
+ # # => [0, 2, 4, 6, 8, 10]
1257
+ #
1258
+ def lookup(key)
1259
+ lookup_key = "lookup_key_#{object_id}"
1260
+
1261
+ self.filter("lambda{|(key, _)| key == #{lookup_key}}")
1262
+ .bind(lookup_key => key)
1263
+ .values
1264
+ .collect
1265
+ end
1244
1266
 
1245
1267
  # Aliases
1246
1268
  alias_method :partitionsSize, :partitions_size
@@ -1,3 +1,3 @@
1
1
  module Spark
2
- VERSION = '1.2.0'
2
+ VERSION = '1.2.1'
3
3
  end
@@ -39,4 +39,21 @@ RSpec.describe 'Spark::RDD' do
39
39
  # it_behaves_like 'a keying by', rand(2..10)
40
40
  end
41
41
 
42
+ it 'lookup' do
43
+ numbers = Generator.numbers
44
+ rdd_numbers = $sc.parallelize(numbers, 2)
45
+
46
+ rdd = rdd_numbers.group_by(lambda {|x| x%3})
47
+ rdd.lookup(2)
48
+
49
+ expect(rdd.lookup(2).first).to eq(
50
+ numbers.group_by{|x| x%3}[2]
51
+ )
52
+
53
+ rdd = rdd_numbers.key_by(lambda{|x| x.even?})
54
+ expect(rdd.lookup(true)).to eq(
55
+ numbers.select(&:even?)
56
+ )
57
+ end
58
+
42
59
  end
@@ -99,7 +99,7 @@ RSpec.describe 'Spark::Mllib regression' do
99
99
 
100
100
  expect(lrm.weights[0]).to be_between(1.9, 2.1)
101
101
  expect(lrm.weights[1]).to be_between(-1.60, -1.40)
102
- expect(lrm.weights[2]).to be_between(-1.0e-3, 1.0e-3)
102
+ expect(lrm.weights[2]).to be_between(-1.0e-2, 1.0e-2)
103
103
  end
104
104
  end
105
105
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-spark
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ondřej Moravčík
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-06-15 00:00:00.000000000 Z
11
+ date: 2015-11-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rjb
@@ -346,7 +346,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
346
346
  requirements:
347
347
  - java, scala
348
348
  rubyforge_project:
349
- rubygems_version: 2.4.5
349
+ rubygems_version: 2.4.5.1
350
350
  signing_key:
351
351
  specification_version: 4
352
352
  summary: Ruby wrapper for Apache Spark
@@ -402,3 +402,4 @@ test_files:
402
402
  - spec/lib/statistic_spec.rb
403
403
  - spec/lib/whole_text_files_spec.rb
404
404
  - spec/spec_helper.rb
405
+ has_rdoc: