ruby-spark 1.0.0 → 1.1.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -1
  3. data/README.md +99 -32
  4. data/TODO.md +2 -3
  5. data/benchmark/{performance → comparison}/prepare.sh +0 -0
  6. data/benchmark/{performance → comparison}/python.py +0 -0
  7. data/benchmark/{performance → comparison}/r.r +0 -0
  8. data/benchmark/{performance → comparison}/ruby.rb +0 -0
  9. data/benchmark/{performance → comparison}/run-all.sh +0 -0
  10. data/benchmark/{performance → comparison}/scala.scala +0 -0
  11. data/example/pi.rb +1 -1
  12. data/example/website_search.rb +83 -0
  13. data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
  14. data/lib/spark.rb +2 -2
  15. data/lib/spark/build.rb +1 -1
  16. data/lib/spark/cli.rb +1 -1
  17. data/lib/spark/command/base.rb +4 -0
  18. data/lib/spark/command_builder.rb +2 -2
  19. data/lib/spark/config.rb +11 -17
  20. data/lib/spark/context.rb +63 -45
  21. data/lib/spark/ext/io.rb +11 -1
  22. data/lib/spark/java_bridge/base.rb +2 -2
  23. data/lib/spark/rdd.rb +67 -18
  24. data/lib/spark/serializer.rb +68 -13
  25. data/lib/spark/serializer/auto_batched.rb +59 -0
  26. data/lib/spark/serializer/base.rb +30 -137
  27. data/lib/spark/serializer/batched.rb +84 -0
  28. data/lib/spark/serializer/cartesian.rb +5 -29
  29. data/lib/spark/serializer/compressed.rb +27 -0
  30. data/lib/spark/serializer/marshal.rb +6 -8
  31. data/lib/spark/serializer/message_pack.rb +8 -10
  32. data/lib/spark/serializer/oj.rb +8 -10
  33. data/lib/spark/serializer/pair.rb +27 -13
  34. data/lib/spark/serializer/text.rb +25 -0
  35. data/lib/spark/version.rb +1 -1
  36. data/lib/spark/worker/worker.rb +5 -2
  37. data/ruby-spark.gemspec +13 -1
  38. data/spec/lib/context_spec.rb +3 -1
  39. data/spec/lib/manipulation_spec.rb +18 -10
  40. data/spec/lib/map_partitions_spec.rb +16 -16
  41. data/spec/lib/serializer_spec.rb +84 -9
  42. data/spec/lib/statistic_spec.rb +26 -24
  43. data/spec/spec_helper.rb +1 -2
  44. metadata +112 -10
  45. data/lib/spark/serializer/utf8.rb +0 -25
@@ -1,6 +1,6 @@
1
1
  require 'spec_helper'
2
2
 
3
- RSpec::shared_examples 'a stats' do |workers|
3
+ RSpec.shared_examples 'a stats' do |workers|
4
4
  let(:numbers) { [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] }
5
5
 
6
6
  context "with #{workers || 'default'} worker" do
@@ -29,23 +29,23 @@ RSpec::shared_examples 'a stats' do |workers|
29
29
  end
30
30
  end
31
31
 
32
- RSpec::shared_examples 'a histogram' do |workers|
32
+ RSpec.shared_examples 'a histogram' do |workers|
33
33
 
34
34
  context "with #{workers || 'default'} worker" do
35
35
  it 'empty' do
36
- rdd = $sc.parallelize([], workers, batch_size: 1)
36
+ rdd = $sc.parallelize([], workers, ser)
37
37
 
38
38
  expect( rdd.histogram([0, 10])[1] ).to eq([0])
39
39
  expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])
40
40
  end
41
41
 
42
42
  it 'validation' do
43
- rdd = $sc.parallelize([], workers, batch_size: 1)
43
+ rdd = $sc.parallelize([], workers, ser)
44
44
  expect { rdd.histogram(0) }.to raise_error(ArgumentError)
45
45
  end
46
46
 
47
47
  it 'double' do
48
- rdd = $sc.parallelize([1.0, 2.0, 3.0, 4.0], workers, batch_size: 1)
48
+ rdd = $sc.parallelize([1.0, 2.0, 3.0, 4.0], workers, ser)
49
49
  buckets, counts = rdd.histogram(2)
50
50
 
51
51
  expect(buckets).to eq([1.0, 2.5, 4.0])
@@ -53,91 +53,91 @@ RSpec::shared_examples 'a histogram' do |workers|
53
53
  end
54
54
 
55
55
  it 'out of range' do
56
- rdd = $sc.parallelize([10.01, -0.01], workers, batch_size: 1)
56
+ rdd = $sc.parallelize([10.01, -0.01], workers, ser)
57
57
 
58
58
  expect( rdd.histogram([0, 10])[1] ).to eq([0])
59
59
  expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])
60
60
  end
61
61
 
62
62
  it 'in range with one bucket' do
63
- rdd = $sc.parallelize([1, 2, 3, 4], workers, batch_size: 1)
63
+ rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
64
64
 
65
65
  expect( rdd.histogram([0, 10])[1] ).to eq([4])
66
66
  expect( rdd.histogram([0, 4, 10])[1] ).to eq([3, 1])
67
67
  end
68
68
 
69
69
  it 'in range with one bucket exact match' do
70
- rdd = $sc.parallelize([1, 2, 3, 4], workers, batch_size: 1)
70
+ rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
71
71
  expect( rdd.histogram([1, 4])[1] ).to eq([4])
72
72
  end
73
73
 
74
74
  it 'out of range with two buckets' do
75
- rdd = $sc.parallelize([10.01, -0.01], workers, batch_size: 1)
75
+ rdd = $sc.parallelize([10.01, -0.01], workers, ser)
76
76
  expect( rdd.histogram([0, 5, 10])[1] ).to eq([0, 0])
77
77
  end
78
78
 
79
79
  it 'out of range with two uneven buckets' do
80
- rdd = $sc.parallelize([10.01, -0.01], workers, batch_size: 1)
80
+ rdd = $sc.parallelize([10.01, -0.01], workers, ser)
81
81
  expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])
82
82
  end
83
83
 
84
84
  it 'in range with two buckets' do
85
- rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, batch_size: 1)
85
+ rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, ser)
86
86
  expect( rdd.histogram([0, 5, 10])[1] ).to eq([3, 2])
87
87
  end
88
88
 
89
89
  it 'in range with two bucket and nil' do
90
- rdd = $sc.parallelize([1, 2, 3, 5, 6, nil, Float::NAN], workers, batch_size: 1)
90
+ rdd = $sc.parallelize([1, 2, 3, 5, 6, nil, Float::NAN], workers, ser)
91
91
  expect( rdd.histogram([0, 5, 10])[1] ).to eq([3, 2])
92
92
  end
93
93
 
94
94
  it 'in range with two uneven buckets' do
95
- rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, batch_size: 1)
95
+ rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, ser)
96
96
  expect( rdd.histogram([0, 5, 11])[1] ).to eq([3, 2])
97
97
  end
98
98
 
99
99
  it 'mixed range with two uneven buckets' do
100
- rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01], workers, batch_size: 1)
100
+ rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01], workers, ser)
101
101
  expect( rdd.histogram([0, 5, 11])[1] ).to eq([4, 3])
102
102
  end
103
103
 
104
104
  it 'mixed range with four uneven buckets' do
105
- rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1], workers, batch_size: 1)
105
+ rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1], workers, ser)
106
106
  expect( rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1] ).to eq([4, 2, 1, 3])
107
107
  end
108
108
 
109
109
  it 'mixed range with uneven buckets and NaN' do
110
- rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1, nil, Float::NAN], workers, batch_size: 1)
110
+ rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1, nil, Float::NAN], workers, ser)
111
111
  expect( rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1] ).to eq([4, 2, 1, 3])
112
112
  end
113
113
 
114
114
  it 'out of range with infinite buckets' do
115
- rdd = $sc.parallelize([10.01, -0.01, Float::NAN, Float::INFINITY], workers, batch_size: 1)
115
+ rdd = $sc.parallelize([10.01, -0.01, Float::NAN, Float::INFINITY], workers, ser)
116
116
  expect( rdd.histogram([-Float::INFINITY, 0, Float::INFINITY])[1] ).to eq([1, 1])
117
117
  end
118
118
 
119
119
  it 'without buckets' do
120
- rdd = $sc.parallelize([1, 2, 3, 4], workers, batch_size: 1)
120
+ rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
121
121
  expect( rdd.histogram(1) ).to eq([[1, 4], [4]])
122
122
  end
123
123
 
124
124
  it 'without buckets single element' do
125
- rdd = $sc.parallelize([1], workers, batch_size: 1)
125
+ rdd = $sc.parallelize([1], workers, ser)
126
126
  expect( rdd.histogram(1) ).to eq([[1, 1], [1]])
127
127
  end
128
128
 
129
129
  it 'without bucket no range' do
130
- rdd = $sc.parallelize([1, 1, 1, 1], workers, batch_size: 1)
130
+ rdd = $sc.parallelize([1, 1, 1, 1], workers, ser)
131
131
  expect( rdd.histogram(1) ).to eq([[1, 1], [4]])
132
132
  end
133
133
 
134
134
  it 'without buckets basic two' do
135
- rdd = $sc.parallelize([1, 2, 3, 4], workers, batch_size: 1)
135
+ rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
136
136
  expect( rdd.histogram(2) ).to eq([[1, 2.5, 4], [2, 2]])
137
137
  end
138
138
 
139
139
  it 'without buckets with more requested than elements' do
140
- rdd = $sc.parallelize([1, 2], workers, batch_size: 1)
140
+ rdd = $sc.parallelize([1, 2], workers, ser)
141
141
  buckets = [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]
142
142
  hist = [1, 0, 0, 0, 0, 0, 0, 0, 0, 1]
143
143
 
@@ -145,7 +145,7 @@ RSpec::shared_examples 'a histogram' do |workers|
145
145
  end
146
146
 
147
147
  it 'string' do
148
- rdd = $sc.parallelize(['ab', 'ac', 'b', 'bd', 'ef'], workers, batch_size: 1)
148
+ rdd = $sc.parallelize(['ab', 'ac', 'b', 'bd', 'ef'], workers, ser)
149
149
 
150
150
  expect( rdd.histogram(['a', 'b', 'c'])[1] ).to eq([2, 2])
151
151
  expect( rdd.histogram(1) ).to eq([['ab', 'ef'], [5]])
@@ -155,7 +155,9 @@ RSpec::shared_examples 'a histogram' do |workers|
155
155
  end
156
156
  end
157
157
 
158
- RSpec::describe Spark::RDD do
158
+ RSpec.describe Spark::RDD do
159
+ let(:ser) { Spark::Serializer.build { __batched__(__marshal__, 1) } }
160
+
159
161
  context '.stats' do
160
162
  it_behaves_like 'a stats', 1
161
163
  it_behaves_like 'a stats', rand(2..5)
data/spec/spec_helper.rb CHANGED
@@ -14,8 +14,7 @@ Spark::Mllib.import
14
14
  def spark_start
15
15
  Spark.logger.disable
16
16
  Spark.config do
17
- set 'spark.ruby.parallelize_strategy', 'deep_copy'
18
- set 'spark.ruby.batch_size', 100
17
+ set 'spark.ruby.serializer.batch_size', 100
19
18
  end
20
19
  Spark.start
21
20
  $sc = Spark.context
metadata CHANGED
@@ -1,15 +1,113 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-spark
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ondřej Moravčík
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-04 00:00:00.000000000 Z
11
+ date: 2015-05-16 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: sourcify
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '='
18
+ - !ruby/object:Gem::Version
19
+ version: 0.6.0.rc4
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '='
25
+ - !ruby/object:Gem::Version
26
+ version: 0.6.0.rc4
27
+ - !ruby/object:Gem::Dependency
28
+ name: method_source
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: commander
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pry
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: nio4r
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: distribution
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rjb
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
13
111
  - !ruby/object:Gem::Dependency
14
112
  name: bundler
15
113
  requirement: !ruby/object:Gem::Requirement
@@ -56,21 +154,22 @@ files:
56
154
  - TODO.md
57
155
  - benchmark/aggregate.rb
58
156
  - benchmark/bisect.rb
157
+ - benchmark/comparison/prepare.sh
158
+ - benchmark/comparison/python.py
159
+ - benchmark/comparison/r.r
160
+ - benchmark/comparison/ruby.rb
161
+ - benchmark/comparison/run-all.sh
162
+ - benchmark/comparison/scala.scala
59
163
  - benchmark/custom_marshal.rb
60
164
  - benchmark/digest.rb
61
165
  - benchmark/enumerator.rb
62
- - benchmark/performance/prepare.sh
63
- - benchmark/performance/python.py
64
- - benchmark/performance/r.r
65
- - benchmark/performance/ruby.rb
66
- - benchmark/performance/run-all.sh
67
- - benchmark/performance/scala.scala
68
166
  - benchmark/serializer.rb
69
167
  - benchmark/sort.rb
70
168
  - benchmark/sort2.rb
71
169
  - benchmark/take.rb
72
170
  - bin/ruby-spark
73
171
  - example/pi.rb
172
+ - example/website_search.rb
74
173
  - ext/ruby_c/extconf.rb
75
174
  - ext/ruby_c/murmur.c
76
175
  - ext/ruby_c/murmur.h
@@ -155,13 +254,16 @@ files:
155
254
  - lib/spark/rdd.rb
156
255
  - lib/spark/sampler.rb
157
256
  - lib/spark/serializer.rb
257
+ - lib/spark/serializer/auto_batched.rb
158
258
  - lib/spark/serializer/base.rb
259
+ - lib/spark/serializer/batched.rb
159
260
  - lib/spark/serializer/cartesian.rb
261
+ - lib/spark/serializer/compressed.rb
160
262
  - lib/spark/serializer/marshal.rb
161
263
  - lib/spark/serializer/message_pack.rb
162
264
  - lib/spark/serializer/oj.rb
163
265
  - lib/spark/serializer/pair.rb
164
- - lib/spark/serializer/utf8.rb
266
+ - lib/spark/serializer/text.rb
165
267
  - lib/spark/sort.rb
166
268
  - lib/spark/stat_counter.rb
167
269
  - lib/spark/storage_level.rb
@@ -245,7 +347,7 @@ rubyforge_project:
245
347
  rubygems_version: 2.2.2
246
348
  signing_key:
247
349
  specification_version: 4
248
- summary: Ruby wrapper for Spark
350
+ summary: Ruby wrapper for Apache Spark
249
351
  test_files:
250
352
  - spec/generator.rb
251
353
  - spec/inputs/lorem_300.txt
@@ -1,25 +0,0 @@
1
- module Spark
2
- module Serializer
3
- ##
4
- # Used for file
5
- #
6
- # File is sent as a String, but the worker uses serialization
7
- #
8
- class UTF8 < Base
9
-
10
- def set(*)
11
- unbatch!
12
- self
13
- end
14
-
15
- def batched?
16
- false
17
- end
18
-
19
- def load_next_from_io(io, lenght)
20
- io.read(lenght).force_encoding(Encoding::UTF_8)
21
- end
22
-
23
- end
24
- end
25
- end