ruby-spark 1.2.0 → 1.2.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cd863f728212557da03e76f6e98eeed05695ea5d
4
- data.tar.gz: 214b2022187727a50badcd1910313550e59aefdf
3
+ metadata.gz: c7435669c50b38e45f97113e7a67aa991edfaa46
4
+ data.tar.gz: 95bb22186a07f47f40915edcded518d081f5f8f7
5
5
  SHA512:
6
- metadata.gz: 23c0c7b6ab63a2f9c191cddc4836c73cde61722b9e6f3c7e25b090afed7cda2eaff0d8718074ae3337ff5c4bd57e1223dab76f6cf7772b4c7dda3e7ed69d98c6
7
- data.tar.gz: 234897b1851614ae1371b3a33417c8d036b00a4551185829b99ef398a110a614ffe1eaeac556c00859a45598bed4219e59eb98ddbffbe0fe2c25c024408b8628
6
+ metadata.gz: 27fd1ff26ed3478595f6b5ef2d48cb74da73c3a4f68df29ad4745f47621df8b617b21cc006ba8b2d5169d680e10a59498956a3a06be92437949e2faac4ccd1f7
7
+ data.tar.gz: d0d465c5e8f86ab3c8987ef6732446f10886354b8f3e7db281cb3a18eb64db362de89164680f288a015f28e242553fab3362fa35a1462eed2caa8b0322244158
@@ -1,3 +1,11 @@
1
+ ## Unreleased
2
+
3
+ ## 1.2.1
4
+
5
+ - new method on RDD (lookup)
6
+ - fix sbt url
7
+ - Spark 1.5.0
8
+
1
9
  ## 1.2.0 (15.06.2015)
2
10
 
3
11
  - target folder is now located at HOME
data/TODO.md CHANGED
@@ -6,3 +6,4 @@
6
6
  - add_rb, add_inline_rb to Spark::{Context, RDD}
7
7
  - fix broadcast for cluster
8
8
  - dump to disk if there is memory limit
9
+ - Add Partitioner to RDD
@@ -4,7 +4,7 @@ assemblySettings
4
4
 
5
5
  // Default values
6
6
  val defaultScalaVersion = "2.10.4"
7
- val defaultSparkVersion = "1.3.0"
7
+ val defaultSparkVersion = "1.5.0"
8
8
  val defaultSparkCoreVersion = "2.10"
9
9
  val defaultTargetDir = "target"
10
10
  val defaultHadoopVersion = "1.0.4"
@@ -3,9 +3,9 @@
3
3
  # This script launches sbt for this project. If present it uses the system
4
4
  # version of sbt. If there is no system version of sbt it attempts to download
5
5
  # sbt locally.
6
- SBT_VERSION=0.13.7
7
- URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
8
- URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
6
+ SBT_VERSION=0.13.9
7
+ URL1=http://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
8
+ URL2=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
9
9
  JAR=sbt/sbt-launch-${SBT_VERSION}.jar
10
10
 
11
11
  # Download sbt launch jar if it hasn't been downloaded yet
@@ -13,10 +13,10 @@ if [ ! -f ${JAR} ]; then
13
13
  # Download
14
14
  printf "Attempting to fetch sbt\n"
15
15
  JAR_DL=${JAR}.part
16
- if hash curl 2>/dev/null; then
17
- (curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR}
18
- elif hash wget 2>/dev/null; then
16
+ if hash wget 2>/dev/null; then
19
17
  (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR}
18
+ elif hash curl 2>/dev/null; then
19
+ (curl --progress-bar ${URL1} > ${JAR_DL} || curl --progress-bar ${URL2} > ${JAR_DL}) && mv ${JAR_DL} ${JAR}
20
20
  else
21
21
  printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n"
22
22
  exit -1
@@ -31,4 +31,4 @@ printf "Launching sbt from ${JAR}\n"
31
31
  java \
32
32
  -Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \
33
33
  -jar ${JAR} \
34
- "$@"
34
+ "$@"
@@ -22,10 +22,10 @@ class RubyMLLibAPI extends MLLibAPI {
22
22
  // trainLogisticRegressionModelWithLBFGS
23
23
  // trainSVMModelWithSGD
24
24
  // trainKMeansModel
25
- // trainGaussianMixture
25
+ // trainGaussianMixtureModel
26
26
 
27
27
  // Rjb have a problem with theta: Array[Array[Double]]
28
- override def trainNaiveBayes(data: JavaRDD[LabeledPoint], lambda: Double) = {
28
+ override def trainNaiveBayesModel(data: JavaRDD[LabeledPoint], lambda: Double) = {
29
29
  val model = NaiveBayes.train(data.rdd, lambda)
30
30
 
31
31
  List(
@@ -3,7 +3,7 @@ module Spark
3
3
 
4
4
  DEFAULT_SCALA_VERSION = '2.10.4'
5
5
  DEFAULT_CORE_VERSION = '2.10'
6
- DEFAULT_SPARK_VERSION = '1.4.0'
6
+ DEFAULT_SPARK_VERSION = '1.5.0'
7
7
  DEFAULT_HADOOP_VERSION = '1.0.4'
8
8
 
9
9
  SBT = 'sbt/sbt'
@@ -86,7 +86,7 @@ module Spark
86
86
  raise Spark::MllibError, "RDD should contains LabeledPoint, got #{first.class}"
87
87
  end
88
88
 
89
- labels, pi, theta = Spark.jb.call(RubyMLLibAPI.new, 'trainNaiveBayes', rdd, lambda)
89
+ labels, pi, theta = Spark.jb.call(RubyMLLibAPI.new, 'trainNaiveBayesModel', rdd, lambda)
90
90
  theta = Spark::Mllib::Matrices.dense(theta.size, theta.first.size, theta)
91
91
 
92
92
  NaiveBayesModel.new(labels, pi, theta)
@@ -64,7 +64,7 @@ module Spark
64
64
  class GaussianMixture
65
65
 
66
66
  def self.train(rdd, k, convergence_tol: 0.001, max_iterations: 100, seed: nil)
67
- weights, means, sigmas = Spark.jb.call(RubyMLLibAPI.new, 'trainGaussianMixture', rdd,
67
+ weights, means, sigmas = Spark.jb.call(RubyMLLibAPI.new, 'trainGaussianMixtureModel', rdd,
68
68
  k, convergence_tol, max_iterations, Spark.jb.to_long(seed))
69
69
 
70
70
  means.map! {|mu| Spark.jb.java_to_ruby(mu)}
@@ -1241,6 +1241,28 @@ module Spark
1241
1241
  self.map('lambda{|(_, value)| value}')
1242
1242
  end
1243
1243
 
1244
+ # Return the list of values in the RDD for key `key`.
1245
+ # TODO: add Partitioner for efficiently searching
1246
+ #
1247
+ # == Example:
1248
+ # rdd = $sc.parallelize(0..10)
1249
+ # rdd = rdd.group_by(lambda {|x| x%3})
1250
+ # rdd.lookup(2)
1251
+ # # => [[2, 5, 8]]
1252
+ #
1253
+ # rdd = $sc.parallelize(0..10)
1254
+ # rdd = rdd.key_by(lambda{|x| x.even?})
1255
+ # rdd.lookup(true)
1256
+ # # => [0, 2, 4, 6, 8, 10]
1257
+ #
1258
+ def lookup(key)
1259
+ lookup_key = "lookup_key_#{object_id}"
1260
+
1261
+ self.filter("lambda{|(key, _)| key == #{lookup_key}}")
1262
+ .bind(lookup_key => key)
1263
+ .values
1264
+ .collect
1265
+ end
1244
1266
 
1245
1267
  # Aliases
1246
1268
  alias_method :partitionsSize, :partitions_size
@@ -1,3 +1,3 @@
1
1
  module Spark
2
- VERSION = '1.2.0'
2
+ VERSION = '1.2.1'
3
3
  end
@@ -39,4 +39,21 @@ RSpec.describe 'Spark::RDD' do
39
39
  # it_behaves_like 'a keying by', rand(2..10)
40
40
  end
41
41
 
42
+ it 'lookup' do
43
+ numbers = Generator.numbers
44
+ rdd_numbers = $sc.parallelize(numbers, 2)
45
+
46
+ rdd = rdd_numbers.group_by(lambda {|x| x%3})
47
+ rdd.lookup(2)
48
+
49
+ expect(rdd.lookup(2).first).to eq(
50
+ numbers.group_by{|x| x%3}[2]
51
+ )
52
+
53
+ rdd = rdd_numbers.key_by(lambda{|x| x.even?})
54
+ expect(rdd.lookup(true)).to eq(
55
+ numbers.select(&:even?)
56
+ )
57
+ end
58
+
42
59
  end
@@ -99,7 +99,7 @@ RSpec.describe 'Spark::Mllib regression' do
99
99
 
100
100
  expect(lrm.weights[0]).to be_between(1.9, 2.1)
101
101
  expect(lrm.weights[1]).to be_between(-1.60, -1.40)
102
- expect(lrm.weights[2]).to be_between(-1.0e-3, 1.0e-3)
102
+ expect(lrm.weights[2]).to be_between(-1.0e-2, 1.0e-2)
103
103
  end
104
104
  end
105
105
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-spark
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ondřej Moravčík
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-06-15 00:00:00.000000000 Z
11
+ date: 2015-11-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rjb
@@ -346,7 +346,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
346
346
  requirements:
347
347
  - java, scala
348
348
  rubyforge_project:
349
- rubygems_version: 2.4.5
349
+ rubygems_version: 2.4.5.1
350
350
  signing_key:
351
351
  specification_version: 4
352
352
  summary: Ruby wrapper for Apache Spark
@@ -402,3 +402,4 @@ test_files:
402
402
  - spec/lib/statistic_spec.rb
403
403
  - spec/lib/whole_text_files_spec.rb
404
404
  - spec/spec_helper.rb
405
+ has_rdoc: