ruby-spark 1.0.0 → 1.1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/README.md +99 -32
- data/TODO.md +2 -3
- data/benchmark/{performance → comparison}/prepare.sh +0 -0
- data/benchmark/{performance → comparison}/python.py +0 -0
- data/benchmark/{performance → comparison}/r.r +0 -0
- data/benchmark/{performance → comparison}/ruby.rb +0 -0
- data/benchmark/{performance → comparison}/run-all.sh +0 -0
- data/benchmark/{performance → comparison}/scala.scala +0 -0
- data/example/pi.rb +1 -1
- data/example/website_search.rb +83 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
- data/lib/spark.rb +2 -2
- data/lib/spark/build.rb +1 -1
- data/lib/spark/cli.rb +1 -1
- data/lib/spark/command/base.rb +4 -0
- data/lib/spark/command_builder.rb +2 -2
- data/lib/spark/config.rb +11 -17
- data/lib/spark/context.rb +63 -45
- data/lib/spark/ext/io.rb +11 -1
- data/lib/spark/java_bridge/base.rb +2 -2
- data/lib/spark/rdd.rb +67 -18
- data/lib/spark/serializer.rb +68 -13
- data/lib/spark/serializer/auto_batched.rb +59 -0
- data/lib/spark/serializer/base.rb +30 -137
- data/lib/spark/serializer/batched.rb +84 -0
- data/lib/spark/serializer/cartesian.rb +5 -29
- data/lib/spark/serializer/compressed.rb +27 -0
- data/lib/spark/serializer/marshal.rb +6 -8
- data/lib/spark/serializer/message_pack.rb +8 -10
- data/lib/spark/serializer/oj.rb +8 -10
- data/lib/spark/serializer/pair.rb +27 -13
- data/lib/spark/serializer/text.rb +25 -0
- data/lib/spark/version.rb +1 -1
- data/lib/spark/worker/worker.rb +5 -2
- data/ruby-spark.gemspec +13 -1
- data/spec/lib/context_spec.rb +3 -1
- data/spec/lib/manipulation_spec.rb +18 -10
- data/spec/lib/map_partitions_spec.rb +16 -16
- data/spec/lib/serializer_spec.rb +84 -9
- data/spec/lib/statistic_spec.rb +26 -24
- data/spec/spec_helper.rb +1 -2
- metadata +112 -10
- data/lib/spark/serializer/utf8.rb +0 -25
data/spec/lib/statistic_spec.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
|
-
RSpec::shared_examples 'a stats' do |workers|
|
3
|
+
RSpec.shared_examples 'a stats' do |workers|
|
4
4
|
let(:numbers) { [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] }
|
5
5
|
|
6
6
|
context "with #{workers || 'default'} worker" do
|
@@ -29,23 +29,23 @@ RSpec::shared_examples 'a stats' do |workers|
|
|
29
29
|
end
|
30
30
|
end
|
31
31
|
|
32
|
-
RSpec::shared_examples 'a histogram' do |workers|
|
32
|
+
RSpec.shared_examples 'a histogram' do |workers|
|
33
33
|
|
34
34
|
context "with #{workers || 'default'} worker" do
|
35
35
|
it 'empty' do
|
36
|
-
rdd = $sc.parallelize([], workers,
|
36
|
+
rdd = $sc.parallelize([], workers, ser)
|
37
37
|
|
38
38
|
expect( rdd.histogram([0, 10])[1] ).to eq([0])
|
39
39
|
expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])
|
40
40
|
end
|
41
41
|
|
42
42
|
it 'validation' do
|
43
|
-
rdd = $sc.parallelize([], workers,
|
43
|
+
rdd = $sc.parallelize([], workers, ser)
|
44
44
|
expect { rdd.histogram(0) }.to raise_error(ArgumentError)
|
45
45
|
end
|
46
46
|
|
47
47
|
it 'double' do
|
48
|
-
rdd = $sc.parallelize([1.0, 2.0, 3.0, 4.0], workers,
|
48
|
+
rdd = $sc.parallelize([1.0, 2.0, 3.0, 4.0], workers, ser)
|
49
49
|
buckets, counts = rdd.histogram(2)
|
50
50
|
|
51
51
|
expect(buckets).to eq([1.0, 2.5, 4.0])
|
@@ -53,91 +53,91 @@ RSpec::shared_examples 'a histogram' do |workers|
|
|
53
53
|
end
|
54
54
|
|
55
55
|
it 'out of range' do
|
56
|
-
rdd = $sc.parallelize([10.01, -0.01], workers,
|
56
|
+
rdd = $sc.parallelize([10.01, -0.01], workers, ser)
|
57
57
|
|
58
58
|
expect( rdd.histogram([0, 10])[1] ).to eq([0])
|
59
59
|
expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])
|
60
60
|
end
|
61
61
|
|
62
62
|
it 'in range with one bucket' do
|
63
|
-
rdd = $sc.parallelize([1, 2, 3, 4], workers,
|
63
|
+
rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
|
64
64
|
|
65
65
|
expect( rdd.histogram([0, 10])[1] ).to eq([4])
|
66
66
|
expect( rdd.histogram([0, 4, 10])[1] ).to eq([3, 1])
|
67
67
|
end
|
68
68
|
|
69
69
|
it 'in range with one bucket exact match' do
|
70
|
-
rdd = $sc.parallelize([1, 2, 3, 4], workers,
|
70
|
+
rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
|
71
71
|
expect( rdd.histogram([1, 4])[1] ).to eq([4])
|
72
72
|
end
|
73
73
|
|
74
74
|
it 'out of range with two buckets' do
|
75
|
-
rdd = $sc.parallelize([10.01, -0.01], workers,
|
75
|
+
rdd = $sc.parallelize([10.01, -0.01], workers, ser)
|
76
76
|
expect( rdd.histogram([0, 5, 10])[1] ).to eq([0, 0])
|
77
77
|
end
|
78
78
|
|
79
79
|
it 'out of range with two uneven buckets' do
|
80
|
-
rdd = $sc.parallelize([10.01, -0.01], workers,
|
80
|
+
rdd = $sc.parallelize([10.01, -0.01], workers, ser)
|
81
81
|
expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])
|
82
82
|
end
|
83
83
|
|
84
84
|
it 'in range with two buckets' do
|
85
|
-
rdd = $sc.parallelize([1, 2, 3, 5, 6], workers,
|
85
|
+
rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, ser)
|
86
86
|
expect( rdd.histogram([0, 5, 10])[1] ).to eq([3, 2])
|
87
87
|
end
|
88
88
|
|
89
89
|
it 'in range with two bucket and nil' do
|
90
|
-
rdd = $sc.parallelize([1, 2, 3, 5, 6, nil, Float::NAN], workers,
|
90
|
+
rdd = $sc.parallelize([1, 2, 3, 5, 6, nil, Float::NAN], workers, ser)
|
91
91
|
expect( rdd.histogram([0, 5, 10])[1] ).to eq([3, 2])
|
92
92
|
end
|
93
93
|
|
94
94
|
it 'in range with two uneven buckets' do
|
95
|
-
rdd = $sc.parallelize([1, 2, 3, 5, 6], workers,
|
95
|
+
rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, ser)
|
96
96
|
expect( rdd.histogram([0, 5, 11])[1] ).to eq([3, 2])
|
97
97
|
end
|
98
98
|
|
99
99
|
it 'mixed range with two uneven buckets' do
|
100
|
-
rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01], workers,
|
100
|
+
rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01], workers, ser)
|
101
101
|
expect( rdd.histogram([0, 5, 11])[1] ).to eq([4, 3])
|
102
102
|
end
|
103
103
|
|
104
104
|
it 'mixed range with four uneven buckets' do
|
105
|
-
rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1], workers,
|
105
|
+
rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1], workers, ser)
|
106
106
|
expect( rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1] ).to eq([4, 2, 1, 3])
|
107
107
|
end
|
108
108
|
|
109
109
|
it 'mixed range with uneven buckets and NaN' do
|
110
|
-
rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1, nil, Float::NAN], workers,
|
110
|
+
rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1, nil, Float::NAN], workers, ser)
|
111
111
|
expect( rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1] ).to eq([4, 2, 1, 3])
|
112
112
|
end
|
113
113
|
|
114
114
|
it 'out of range with infinite buckets' do
|
115
|
-
rdd = $sc.parallelize([10.01, -0.01, Float::NAN, Float::INFINITY], workers,
|
115
|
+
rdd = $sc.parallelize([10.01, -0.01, Float::NAN, Float::INFINITY], workers, ser)
|
116
116
|
expect( rdd.histogram([-Float::INFINITY, 0, Float::INFINITY])[1] ).to eq([1, 1])
|
117
117
|
end
|
118
118
|
|
119
119
|
it 'without buckets' do
|
120
|
-
rdd = $sc.parallelize([1, 2, 3, 4], workers,
|
120
|
+
rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
|
121
121
|
expect( rdd.histogram(1) ).to eq([[1, 4], [4]])
|
122
122
|
end
|
123
123
|
|
124
124
|
it 'without buckets single element' do
|
125
|
-
rdd = $sc.parallelize([1], workers,
|
125
|
+
rdd = $sc.parallelize([1], workers, ser)
|
126
126
|
expect( rdd.histogram(1) ).to eq([[1, 1], [1]])
|
127
127
|
end
|
128
128
|
|
129
129
|
it 'without bucket no range' do
|
130
|
-
rdd = $sc.parallelize([1, 1, 1, 1], workers,
|
130
|
+
rdd = $sc.parallelize([1, 1, 1, 1], workers, ser)
|
131
131
|
expect( rdd.histogram(1) ).to eq([[1, 1], [4]])
|
132
132
|
end
|
133
133
|
|
134
134
|
it 'without buckets basic two' do
|
135
|
-
rdd = $sc.parallelize([1, 2, 3, 4], workers,
|
135
|
+
rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
|
136
136
|
expect( rdd.histogram(2) ).to eq([[1, 2.5, 4], [2, 2]])
|
137
137
|
end
|
138
138
|
|
139
139
|
it 'without buckets with more requested than elements' do
|
140
|
-
rdd = $sc.parallelize([1, 2], workers,
|
140
|
+
rdd = $sc.parallelize([1, 2], workers, ser)
|
141
141
|
buckets = [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]
|
142
142
|
hist = [1, 0, 0, 0, 0, 0, 0, 0, 0, 1]
|
143
143
|
|
@@ -145,7 +145,7 @@ RSpec::shared_examples 'a histogram' do |workers|
|
|
145
145
|
end
|
146
146
|
|
147
147
|
it 'string' do
|
148
|
-
rdd = $sc.parallelize(['ab', 'ac', 'b', 'bd', 'ef'], workers,
|
148
|
+
rdd = $sc.parallelize(['ab', 'ac', 'b', 'bd', 'ef'], workers, ser)
|
149
149
|
|
150
150
|
expect( rdd.histogram(['a', 'b', 'c'])[1] ).to eq([2, 2])
|
151
151
|
expect( rdd.histogram(1) ).to eq([['ab', 'ef'], [5]])
|
@@ -155,7 +155,9 @@ RSpec::shared_examples 'a histogram' do |workers|
|
|
155
155
|
end
|
156
156
|
end
|
157
157
|
|
158
|
-
RSpec::describe Spark::RDD do
|
158
|
+
RSpec.describe Spark::RDD do
|
159
|
+
let(:ser) { Spark::Serializer.build { __batched__(__marshal__, 1) } }
|
160
|
+
|
159
161
|
context '.stats' do
|
160
162
|
it_behaves_like 'a stats', 1
|
161
163
|
it_behaves_like 'a stats', rand(2..5)
|
data/spec/spec_helper.rb
CHANGED
@@ -14,8 +14,7 @@ Spark::Mllib.import
|
|
14
14
|
def spark_start
|
15
15
|
Spark.logger.disable
|
16
16
|
Spark.config do
|
17
|
-
set 'spark.ruby.
|
18
|
-
set 'spark.ruby.batch_size', 100
|
17
|
+
set 'spark.ruby.serializer.batch_size', 100
|
19
18
|
end
|
20
19
|
Spark.start
|
21
20
|
$sc = Spark.context
|
metadata
CHANGED
@@ -1,15 +1,113 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-spark
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ondřej Moravčík
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-05-
|
11
|
+
date: 2015-05-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: sourcify
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.6.0.rc4
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.6.0.rc4
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: method_source
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: commander
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: pry
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: nio4r
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: distribution
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rjb
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
13
111
|
- !ruby/object:Gem::Dependency
|
14
112
|
name: bundler
|
15
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -56,21 +154,22 @@ files:
|
|
56
154
|
- TODO.md
|
57
155
|
- benchmark/aggregate.rb
|
58
156
|
- benchmark/bisect.rb
|
157
|
+
- benchmark/comparison/prepare.sh
|
158
|
+
- benchmark/comparison/python.py
|
159
|
+
- benchmark/comparison/r.r
|
160
|
+
- benchmark/comparison/ruby.rb
|
161
|
+
- benchmark/comparison/run-all.sh
|
162
|
+
- benchmark/comparison/scala.scala
|
59
163
|
- benchmark/custom_marshal.rb
|
60
164
|
- benchmark/digest.rb
|
61
165
|
- benchmark/enumerator.rb
|
62
|
-
- benchmark/performance/prepare.sh
|
63
|
-
- benchmark/performance/python.py
|
64
|
-
- benchmark/performance/r.r
|
65
|
-
- benchmark/performance/ruby.rb
|
66
|
-
- benchmark/performance/run-all.sh
|
67
|
-
- benchmark/performance/scala.scala
|
68
166
|
- benchmark/serializer.rb
|
69
167
|
- benchmark/sort.rb
|
70
168
|
- benchmark/sort2.rb
|
71
169
|
- benchmark/take.rb
|
72
170
|
- bin/ruby-spark
|
73
171
|
- example/pi.rb
|
172
|
+
- example/website_search.rb
|
74
173
|
- ext/ruby_c/extconf.rb
|
75
174
|
- ext/ruby_c/murmur.c
|
76
175
|
- ext/ruby_c/murmur.h
|
@@ -155,13 +254,16 @@ files:
|
|
155
254
|
- lib/spark/rdd.rb
|
156
255
|
- lib/spark/sampler.rb
|
157
256
|
- lib/spark/serializer.rb
|
257
|
+
- lib/spark/serializer/auto_batched.rb
|
158
258
|
- lib/spark/serializer/base.rb
|
259
|
+
- lib/spark/serializer/batched.rb
|
159
260
|
- lib/spark/serializer/cartesian.rb
|
261
|
+
- lib/spark/serializer/compressed.rb
|
160
262
|
- lib/spark/serializer/marshal.rb
|
161
263
|
- lib/spark/serializer/message_pack.rb
|
162
264
|
- lib/spark/serializer/oj.rb
|
163
265
|
- lib/spark/serializer/pair.rb
|
164
|
-
- lib/spark/serializer/utf8.rb
|
266
|
+
- lib/spark/serializer/text.rb
|
165
267
|
- lib/spark/sort.rb
|
166
268
|
- lib/spark/stat_counter.rb
|
167
269
|
- lib/spark/storage_level.rb
|
@@ -245,7 +347,7 @@ rubyforge_project:
|
|
245
347
|
rubygems_version: 2.2.2
|
246
348
|
signing_key:
|
247
349
|
specification_version: 4
|
248
|
-
summary: Ruby wrapper for Spark
|
350
|
+
summary: Ruby wrapper for Apache Spark
|
249
351
|
test_files:
|
250
352
|
- spec/generator.rb
|
251
353
|
- spec/inputs/lorem_300.txt
|
@@ -1,25 +0,0 @@
|
|
1
|
-
module Spark
|
2
|
-
module Serializer
|
3
|
-
##
|
4
|
-
# Used for file
|
5
|
-
#
|
6
|
-
# File is sended as String but worker use serialization
|
7
|
-
#
|
8
|
-
class UTF8 < Base
|
9
|
-
|
10
|
-
def set(*)
|
11
|
-
unbatch!
|
12
|
-
self
|
13
|
-
end
|
14
|
-
|
15
|
-
def batched?
|
16
|
-
false
|
17
|
-
end
|
18
|
-
|
19
|
-
def load_next_from_io(io, lenght)
|
20
|
-
io.read(lenght).force_encoding(Encoding::UTF_8)
|
21
|
-
end
|
22
|
-
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|