ruby-spark 1.0.0 → 1.1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -1
  3. data/README.md +99 -32
  4. data/TODO.md +2 -3
  5. data/benchmark/{performance → comparison}/prepare.sh +0 -0
  6. data/benchmark/{performance → comparison}/python.py +0 -0
  7. data/benchmark/{performance → comparison}/r.r +0 -0
  8. data/benchmark/{performance → comparison}/ruby.rb +0 -0
  9. data/benchmark/{performance → comparison}/run-all.sh +0 -0
  10. data/benchmark/{performance → comparison}/scala.scala +0 -0
  11. data/example/pi.rb +1 -1
  12. data/example/website_search.rb +83 -0
  13. data/ext/spark/src/main/scala/RubyRDD.scala +30 -2
  14. data/lib/spark.rb +2 -2
  15. data/lib/spark/build.rb +1 -1
  16. data/lib/spark/cli.rb +1 -1
  17. data/lib/spark/command/base.rb +4 -0
  18. data/lib/spark/command_builder.rb +2 -2
  19. data/lib/spark/config.rb +11 -17
  20. data/lib/spark/context.rb +63 -45
  21. data/lib/spark/ext/io.rb +11 -1
  22. data/lib/spark/java_bridge/base.rb +2 -2
  23. data/lib/spark/rdd.rb +67 -18
  24. data/lib/spark/serializer.rb +68 -13
  25. data/lib/spark/serializer/auto_batched.rb +59 -0
  26. data/lib/spark/serializer/base.rb +30 -137
  27. data/lib/spark/serializer/batched.rb +84 -0
  28. data/lib/spark/serializer/cartesian.rb +5 -29
  29. data/lib/spark/serializer/compressed.rb +27 -0
  30. data/lib/spark/serializer/marshal.rb +6 -8
  31. data/lib/spark/serializer/message_pack.rb +8 -10
  32. data/lib/spark/serializer/oj.rb +8 -10
  33. data/lib/spark/serializer/pair.rb +27 -13
  34. data/lib/spark/serializer/text.rb +25 -0
  35. data/lib/spark/version.rb +1 -1
  36. data/lib/spark/worker/worker.rb +5 -2
  37. data/ruby-spark.gemspec +13 -1
  38. data/spec/lib/context_spec.rb +3 -1
  39. data/spec/lib/manipulation_spec.rb +18 -10
  40. data/spec/lib/map_partitions_spec.rb +16 -16
  41. data/spec/lib/serializer_spec.rb +84 -9
  42. data/spec/lib/statistic_spec.rb +26 -24
  43. data/spec/spec_helper.rb +1 -2
  44. metadata +112 -10
  45. data/lib/spark/serializer/utf8.rb +0 -25
@@ -1,6 +1,6 @@
1
1
  require 'spec_helper'
2
2
 
3
- RSpec::shared_examples 'a stats' do |workers|
3
+ RSpec.shared_examples 'a stats' do |workers|
4
4
  let(:numbers) { [1.0, 1.0, 2.0, 3.0, 5.0, 8.0] }
5
5
 
6
6
  context "with #{workers || 'default'} worker" do
@@ -29,23 +29,23 @@ RSpec::shared_examples 'a stats' do |workers|
29
29
  end
30
30
  end
31
31
 
32
- RSpec::shared_examples 'a histogram' do |workers|
32
+ RSpec.shared_examples 'a histogram' do |workers|
33
33
 
34
34
  context "with #{workers || 'default'} worker" do
35
35
  it 'empty' do
36
- rdd = $sc.parallelize([], workers, batch_size: 1)
36
+ rdd = $sc.parallelize([], workers, ser)
37
37
 
38
38
  expect( rdd.histogram([0, 10])[1] ).to eq([0])
39
39
  expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])
40
40
  end
41
41
 
42
42
  it 'validation' do
43
- rdd = $sc.parallelize([], workers, batch_size: 1)
43
+ rdd = $sc.parallelize([], workers, ser)
44
44
  expect { rdd.histogram(0) }.to raise_error(ArgumentError)
45
45
  end
46
46
 
47
47
  it 'double' do
48
- rdd = $sc.parallelize([1.0, 2.0, 3.0, 4.0], workers, batch_size: 1)
48
+ rdd = $sc.parallelize([1.0, 2.0, 3.0, 4.0], workers, ser)
49
49
  buckets, counts = rdd.histogram(2)
50
50
 
51
51
  expect(buckets).to eq([1.0, 2.5, 4.0])
@@ -53,91 +53,91 @@ RSpec::shared_examples 'a histogram' do |workers|
53
53
  end
54
54
 
55
55
  it 'out of range' do
56
- rdd = $sc.parallelize([10.01, -0.01], workers, batch_size: 1)
56
+ rdd = $sc.parallelize([10.01, -0.01], workers, ser)
57
57
 
58
58
  expect( rdd.histogram([0, 10])[1] ).to eq([0])
59
59
  expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])
60
60
  end
61
61
 
62
62
  it 'in range with one bucket' do
63
- rdd = $sc.parallelize([1, 2, 3, 4], workers, batch_size: 1)
63
+ rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
64
64
 
65
65
  expect( rdd.histogram([0, 10])[1] ).to eq([4])
66
66
  expect( rdd.histogram([0, 4, 10])[1] ).to eq([3, 1])
67
67
  end
68
68
 
69
69
  it 'in range with one bucket exact match' do
70
- rdd = $sc.parallelize([1, 2, 3, 4], workers, batch_size: 1)
70
+ rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
71
71
  expect( rdd.histogram([1, 4])[1] ).to eq([4])
72
72
  end
73
73
 
74
74
  it 'out of range with two buckets' do
75
- rdd = $sc.parallelize([10.01, -0.01], workers, batch_size: 1)
75
+ rdd = $sc.parallelize([10.01, -0.01], workers, ser)
76
76
  expect( rdd.histogram([0, 5, 10])[1] ).to eq([0, 0])
77
77
  end
78
78
 
79
79
  it 'out of range with two uneven buckets' do
80
- rdd = $sc.parallelize([10.01, -0.01], workers, batch_size: 1)
80
+ rdd = $sc.parallelize([10.01, -0.01], workers, ser)
81
81
  expect( rdd.histogram([0, 4, 10])[1] ).to eq([0, 0])
82
82
  end
83
83
 
84
84
  it 'in range with two buckets' do
85
- rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, batch_size: 1)
85
+ rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, ser)
86
86
  expect( rdd.histogram([0, 5, 10])[1] ).to eq([3, 2])
87
87
  end
88
88
 
89
89
  it 'in range with two bucket and nil' do
90
- rdd = $sc.parallelize([1, 2, 3, 5, 6, nil, Float::NAN], workers, batch_size: 1)
90
+ rdd = $sc.parallelize([1, 2, 3, 5, 6, nil, Float::NAN], workers, ser)
91
91
  expect( rdd.histogram([0, 5, 10])[1] ).to eq([3, 2])
92
92
  end
93
93
 
94
94
  it 'in range with two uneven buckets' do
95
- rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, batch_size: 1)
95
+ rdd = $sc.parallelize([1, 2, 3, 5, 6], workers, ser)
96
96
  expect( rdd.histogram([0, 5, 11])[1] ).to eq([3, 2])
97
97
  end
98
98
 
99
99
  it 'mixed range with two uneven buckets' do
100
- rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01], workers, batch_size: 1)
100
+ rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01], workers, ser)
101
101
  expect( rdd.histogram([0, 5, 11])[1] ).to eq([4, 3])
102
102
  end
103
103
 
104
104
  it 'mixed range with four uneven buckets' do
105
- rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1], workers, batch_size: 1)
105
+ rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1], workers, ser)
106
106
  expect( rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1] ).to eq([4, 2, 1, 3])
107
107
  end
108
108
 
109
109
  it 'mixed range with uneven buckets and NaN' do
110
- rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1, nil, Float::NAN], workers, batch_size: 1)
110
+ rdd = $sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1, nil, Float::NAN], workers, ser)
111
111
  expect( rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1] ).to eq([4, 2, 1, 3])
112
112
  end
113
113
 
114
114
  it 'out of range with infinite buckets' do
115
- rdd = $sc.parallelize([10.01, -0.01, Float::NAN, Float::INFINITY], workers, batch_size: 1)
115
+ rdd = $sc.parallelize([10.01, -0.01, Float::NAN, Float::INFINITY], workers, ser)
116
116
  expect( rdd.histogram([-Float::INFINITY, 0, Float::INFINITY])[1] ).to eq([1, 1])
117
117
  end
118
118
 
119
119
  it 'without buckets' do
120
- rdd = $sc.parallelize([1, 2, 3, 4], workers, batch_size: 1)
120
+ rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
121
121
  expect( rdd.histogram(1) ).to eq([[1, 4], [4]])
122
122
  end
123
123
 
124
124
  it 'without buckets single element' do
125
- rdd = $sc.parallelize([1], workers, batch_size: 1)
125
+ rdd = $sc.parallelize([1], workers, ser)
126
126
  expect( rdd.histogram(1) ).to eq([[1, 1], [1]])
127
127
  end
128
128
 
129
129
  it 'without bucket no range' do
130
- rdd = $sc.parallelize([1, 1, 1, 1], workers, batch_size: 1)
130
+ rdd = $sc.parallelize([1, 1, 1, 1], workers, ser)
131
131
  expect( rdd.histogram(1) ).to eq([[1, 1], [4]])
132
132
  end
133
133
 
134
134
  it 'without buckets basic two' do
135
- rdd = $sc.parallelize([1, 2, 3, 4], workers, batch_size: 1)
135
+ rdd = $sc.parallelize([1, 2, 3, 4], workers, ser)
136
136
  expect( rdd.histogram(2) ).to eq([[1, 2.5, 4], [2, 2]])
137
137
  end
138
138
 
139
139
  it 'without buckets with more requested than elements' do
140
- rdd = $sc.parallelize([1, 2], workers, batch_size: 1)
140
+ rdd = $sc.parallelize([1, 2], workers, ser)
141
141
  buckets = [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0]
142
142
  hist = [1, 0, 0, 0, 0, 0, 0, 0, 0, 1]
143
143
 
@@ -145,7 +145,7 @@ RSpec::shared_examples 'a histogram' do |workers|
145
145
  end
146
146
 
147
147
  it 'string' do
148
- rdd = $sc.parallelize(['ab', 'ac', 'b', 'bd', 'ef'], workers, batch_size: 1)
148
+ rdd = $sc.parallelize(['ab', 'ac', 'b', 'bd', 'ef'], workers, ser)
149
149
 
150
150
  expect( rdd.histogram(['a', 'b', 'c'])[1] ).to eq([2, 2])
151
151
  expect( rdd.histogram(1) ).to eq([['ab', 'ef'], [5]])
@@ -155,7 +155,9 @@ RSpec::shared_examples 'a histogram' do |workers|
155
155
  end
156
156
  end
157
157
 
158
- RSpec::describe Spark::RDD do
158
+ RSpec.describe Spark::RDD do
159
+ let(:ser) { Spark::Serializer.build { __batched__(__marshal__, 1) } }
160
+
159
161
  context '.stats' do
160
162
  it_behaves_like 'a stats', 1
161
163
  it_behaves_like 'a stats', rand(2..5)
data/spec/spec_helper.rb CHANGED
@@ -14,8 +14,7 @@ Spark::Mllib.import
14
14
  def spark_start
15
15
  Spark.logger.disable
16
16
  Spark.config do
17
- set 'spark.ruby.parallelize_strategy', 'deep_copy'
18
- set 'spark.ruby.batch_size', 100
17
+ set 'spark.ruby.serializer.batch_size', 100
19
18
  end
20
19
  Spark.start
21
20
  $sc = Spark.context
metadata CHANGED
@@ -1,15 +1,113 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-spark
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ondřej Moravčík
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-04 00:00:00.000000000 Z
11
+ date: 2015-05-16 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: sourcify
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '='
18
+ - !ruby/object:Gem::Version
19
+ version: 0.6.0.rc4
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '='
25
+ - !ruby/object:Gem::Version
26
+ version: 0.6.0.rc4
27
+ - !ruby/object:Gem::Dependency
28
+ name: method_source
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: commander
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pry
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: nio4r
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: distribution
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rjb
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
13
111
  - !ruby/object:Gem::Dependency
14
112
  name: bundler
15
113
  requirement: !ruby/object:Gem::Requirement
@@ -56,21 +154,22 @@ files:
56
154
  - TODO.md
57
155
  - benchmark/aggregate.rb
58
156
  - benchmark/bisect.rb
157
+ - benchmark/comparison/prepare.sh
158
+ - benchmark/comparison/python.py
159
+ - benchmark/comparison/r.r
160
+ - benchmark/comparison/ruby.rb
161
+ - benchmark/comparison/run-all.sh
162
+ - benchmark/comparison/scala.scala
59
163
  - benchmark/custom_marshal.rb
60
164
  - benchmark/digest.rb
61
165
  - benchmark/enumerator.rb
62
- - benchmark/performance/prepare.sh
63
- - benchmark/performance/python.py
64
- - benchmark/performance/r.r
65
- - benchmark/performance/ruby.rb
66
- - benchmark/performance/run-all.sh
67
- - benchmark/performance/scala.scala
68
166
  - benchmark/serializer.rb
69
167
  - benchmark/sort.rb
70
168
  - benchmark/sort2.rb
71
169
  - benchmark/take.rb
72
170
  - bin/ruby-spark
73
171
  - example/pi.rb
172
+ - example/website_search.rb
74
173
  - ext/ruby_c/extconf.rb
75
174
  - ext/ruby_c/murmur.c
76
175
  - ext/ruby_c/murmur.h
@@ -155,13 +254,16 @@ files:
155
254
  - lib/spark/rdd.rb
156
255
  - lib/spark/sampler.rb
157
256
  - lib/spark/serializer.rb
257
+ - lib/spark/serializer/auto_batched.rb
158
258
  - lib/spark/serializer/base.rb
259
+ - lib/spark/serializer/batched.rb
159
260
  - lib/spark/serializer/cartesian.rb
261
+ - lib/spark/serializer/compressed.rb
160
262
  - lib/spark/serializer/marshal.rb
161
263
  - lib/spark/serializer/message_pack.rb
162
264
  - lib/spark/serializer/oj.rb
163
265
  - lib/spark/serializer/pair.rb
164
- - lib/spark/serializer/utf8.rb
266
+ - lib/spark/serializer/text.rb
165
267
  - lib/spark/sort.rb
166
268
  - lib/spark/stat_counter.rb
167
269
  - lib/spark/storage_level.rb
@@ -245,7 +347,7 @@ rubyforge_project:
245
347
  rubygems_version: 2.2.2
246
348
  signing_key:
247
349
  specification_version: 4
248
- summary: Ruby wrapper for Spark
350
+ summary: Ruby wrapper for Apache Spark
249
351
  test_files:
250
352
  - spec/generator.rb
251
353
  - spec/inputs/lorem_300.txt
@@ -1,25 +0,0 @@
1
- module Spark
2
- module Serializer
3
- ##
4
- # Used for file
5
- #
6
- # File is sended as String but worker use serialization
7
- #
8
- class UTF8 < Base
9
-
10
- def set(*)
11
- unbatch!
12
- self
13
- end
14
-
15
- def batched?
16
- false
17
- end
18
-
19
- def load_next_from_io(io, lenght)
20
- io.read(lenght).force_encoding(Encoding::UTF_8)
21
- end
22
-
23
- end
24
- end
25
- end