ruby-spark 1.1.0.1-java
- checksums.yaml +7 -0
- data/.gitignore +37 -0
- data/Gemfile +47 -0
- data/Guardfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +252 -0
- data/Rakefile +35 -0
- data/TODO.md +6 -0
- data/benchmark/aggregate.rb +33 -0
- data/benchmark/bisect.rb +88 -0
- data/benchmark/comparison/prepare.sh +18 -0
- data/benchmark/comparison/python.py +156 -0
- data/benchmark/comparison/r.r +69 -0
- data/benchmark/comparison/ruby.rb +167 -0
- data/benchmark/comparison/run-all.sh +160 -0
- data/benchmark/comparison/scala.scala +181 -0
- data/benchmark/custom_marshal.rb +94 -0
- data/benchmark/digest.rb +150 -0
- data/benchmark/enumerator.rb +88 -0
- data/benchmark/serializer.rb +82 -0
- data/benchmark/sort.rb +43 -0
- data/benchmark/sort2.rb +164 -0
- data/benchmark/take.rb +28 -0
- data/bin/ruby-spark +8 -0
- data/example/pi.rb +28 -0
- data/example/website_search.rb +83 -0
- data/ext/ruby_c/extconf.rb +3 -0
- data/ext/ruby_c/murmur.c +158 -0
- data/ext/ruby_c/murmur.h +9 -0
- data/ext/ruby_c/ruby-spark.c +18 -0
- data/ext/ruby_java/Digest.java +36 -0
- data/ext/ruby_java/Murmur2.java +98 -0
- data/ext/ruby_java/RubySparkExtService.java +28 -0
- data/ext/ruby_java/extconf.rb +3 -0
- data/ext/spark/build.sbt +73 -0
- data/ext/spark/project/plugins.sbt +9 -0
- data/ext/spark/sbt/sbt +34 -0
- data/ext/spark/src/main/scala/Exec.scala +91 -0
- data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
- data/ext/spark/src/main/scala/Marshal.scala +52 -0
- data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
- data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
- data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
- data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
- data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
- data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
- data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
- data/ext/spark/src/main/scala/RubyPage.scala +34 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +392 -0
- data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
- data/ext/spark/src/main/scala/RubyTab.scala +11 -0
- data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
- data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
- data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
- data/lib/ruby-spark.rb +1 -0
- data/lib/spark.rb +198 -0
- data/lib/spark/accumulator.rb +260 -0
- data/lib/spark/broadcast.rb +98 -0
- data/lib/spark/build.rb +43 -0
- data/lib/spark/cli.rb +169 -0
- data/lib/spark/command.rb +86 -0
- data/lib/spark/command/base.rb +158 -0
- data/lib/spark/command/basic.rb +345 -0
- data/lib/spark/command/pair.rb +124 -0
- data/lib/spark/command/sort.rb +51 -0
- data/lib/spark/command/statistic.rb +144 -0
- data/lib/spark/command_builder.rb +141 -0
- data/lib/spark/command_validator.rb +34 -0
- data/lib/spark/config.rb +238 -0
- data/lib/spark/constant.rb +14 -0
- data/lib/spark/context.rb +322 -0
- data/lib/spark/error.rb +50 -0
- data/lib/spark/ext/hash.rb +41 -0
- data/lib/spark/ext/integer.rb +25 -0
- data/lib/spark/ext/io.rb +67 -0
- data/lib/spark/ext/ip_socket.rb +29 -0
- data/lib/spark/ext/module.rb +58 -0
- data/lib/spark/ext/object.rb +24 -0
- data/lib/spark/ext/string.rb +24 -0
- data/lib/spark/helper.rb +10 -0
- data/lib/spark/helper/logger.rb +40 -0
- data/lib/spark/helper/parser.rb +85 -0
- data/lib/spark/helper/serialize.rb +71 -0
- data/lib/spark/helper/statistic.rb +93 -0
- data/lib/spark/helper/system.rb +42 -0
- data/lib/spark/java_bridge.rb +19 -0
- data/lib/spark/java_bridge/base.rb +203 -0
- data/lib/spark/java_bridge/jruby.rb +23 -0
- data/lib/spark/java_bridge/rjb.rb +41 -0
- data/lib/spark/logger.rb +76 -0
- data/lib/spark/mllib.rb +100 -0
- data/lib/spark/mllib/classification/common.rb +31 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
- data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
- data/lib/spark/mllib/classification/svm.rb +135 -0
- data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
- data/lib/spark/mllib/clustering/kmeans.rb +118 -0
- data/lib/spark/mllib/matrix.rb +120 -0
- data/lib/spark/mllib/regression/common.rb +73 -0
- data/lib/spark/mllib/regression/labeled_point.rb +41 -0
- data/lib/spark/mllib/regression/lasso.rb +100 -0
- data/lib/spark/mllib/regression/linear.rb +124 -0
- data/lib/spark/mllib/regression/ridge.rb +97 -0
- data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
- data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
- data/lib/spark/mllib/stat/distribution.rb +12 -0
- data/lib/spark/mllib/vector.rb +185 -0
- data/lib/spark/rdd.rb +1377 -0
- data/lib/spark/sampler.rb +92 -0
- data/lib/spark/serializer.rb +79 -0
- data/lib/spark/serializer/auto_batched.rb +59 -0
- data/lib/spark/serializer/base.rb +63 -0
- data/lib/spark/serializer/batched.rb +84 -0
- data/lib/spark/serializer/cartesian.rb +13 -0
- data/lib/spark/serializer/compressed.rb +27 -0
- data/lib/spark/serializer/marshal.rb +17 -0
- data/lib/spark/serializer/message_pack.rb +23 -0
- data/lib/spark/serializer/oj.rb +23 -0
- data/lib/spark/serializer/pair.rb +41 -0
- data/lib/spark/serializer/text.rb +25 -0
- data/lib/spark/sort.rb +189 -0
- data/lib/spark/stat_counter.rb +125 -0
- data/lib/spark/storage_level.rb +39 -0
- data/lib/spark/version.rb +3 -0
- data/lib/spark/worker/master.rb +144 -0
- data/lib/spark/worker/spark_files.rb +15 -0
- data/lib/spark/worker/worker.rb +200 -0
- data/ruby-spark.gemspec +47 -0
- data/spec/generator.rb +37 -0
- data/spec/inputs/lorem_300.txt +316 -0
- data/spec/inputs/numbers/1.txt +50 -0
- data/spec/inputs/numbers/10.txt +50 -0
- data/spec/inputs/numbers/11.txt +50 -0
- data/spec/inputs/numbers/12.txt +50 -0
- data/spec/inputs/numbers/13.txt +50 -0
- data/spec/inputs/numbers/14.txt +50 -0
- data/spec/inputs/numbers/15.txt +50 -0
- data/spec/inputs/numbers/16.txt +50 -0
- data/spec/inputs/numbers/17.txt +50 -0
- data/spec/inputs/numbers/18.txt +50 -0
- data/spec/inputs/numbers/19.txt +50 -0
- data/spec/inputs/numbers/2.txt +50 -0
- data/spec/inputs/numbers/20.txt +50 -0
- data/spec/inputs/numbers/3.txt +50 -0
- data/spec/inputs/numbers/4.txt +50 -0
- data/spec/inputs/numbers/5.txt +50 -0
- data/spec/inputs/numbers/6.txt +50 -0
- data/spec/inputs/numbers/7.txt +50 -0
- data/spec/inputs/numbers/8.txt +50 -0
- data/spec/inputs/numbers/9.txt +50 -0
- data/spec/inputs/numbers_0_100.txt +101 -0
- data/spec/inputs/numbers_1_100.txt +100 -0
- data/spec/lib/collect_spec.rb +42 -0
- data/spec/lib/command_spec.rb +68 -0
- data/spec/lib/config_spec.rb +64 -0
- data/spec/lib/context_spec.rb +165 -0
- data/spec/lib/ext_spec.rb +72 -0
- data/spec/lib/external_apps_spec.rb +45 -0
- data/spec/lib/filter_spec.rb +80 -0
- data/spec/lib/flat_map_spec.rb +100 -0
- data/spec/lib/group_spec.rb +109 -0
- data/spec/lib/helper_spec.rb +19 -0
- data/spec/lib/key_spec.rb +41 -0
- data/spec/lib/manipulation_spec.rb +122 -0
- data/spec/lib/map_partitions_spec.rb +87 -0
- data/spec/lib/map_spec.rb +91 -0
- data/spec/lib/mllib/classification_spec.rb +54 -0
- data/spec/lib/mllib/clustering_spec.rb +35 -0
- data/spec/lib/mllib/matrix_spec.rb +32 -0
- data/spec/lib/mllib/regression_spec.rb +116 -0
- data/spec/lib/mllib/vector_spec.rb +77 -0
- data/spec/lib/reduce_by_key_spec.rb +118 -0
- data/spec/lib/reduce_spec.rb +131 -0
- data/spec/lib/sample_spec.rb +46 -0
- data/spec/lib/serializer_spec.rb +88 -0
- data/spec/lib/sort_spec.rb +58 -0
- data/spec/lib/statistic_spec.rb +170 -0
- data/spec/lib/whole_text_files_spec.rb +33 -0
- data/spec/spec_helper.rb +38 -0
- metadata +389 -0
data/lib/spark/serializer/oj.rb
ADDED
@@ -0,0 +1,23 @@
module Spark
  module Serializer
    class Oj < Base

      def dump(data)
        ::Oj.dump(data)
      end

      def load(data)
        ::Oj.load(data)
      end

    end
  end
end

begin
  # TODO: require only if it is necessary
  require 'oj'

  Spark::Serializer.register('oj', Spark::Serializer::Oj)
rescue LoadError
end
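
The serializer is a thin adapter over the Oj gem, so dump/load behaves like a JSON-style round trip. A minimal sketch of the underlying calls, assuming the oj gem is installed:

require 'oj'

data = { 'counts' => [1, 2, 3], 'label' => 'words' }

json = Oj.dump(data)   # serialize to a JSON string
Oj.load(json)          # => {"counts"=>[1, 2, 3], "label"=>"words"}
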
data/lib/spark/serializer/pair.rb
ADDED
@@ -0,0 +1,41 @@
module Spark
  module Serializer
    class Pair < Base

      def initialize(serializer1, serializer2)
        @serializer1 = serializer1
        @serializer2 = serializer2
      end

      def to_s
        "#{name}(#{@serializer1}, #{@serializer2})"
      end

      def aggregate(item1, item2)
        item1.zip(item2)
      end

      def load_from_io(io)
        return to_enum(__callee__, io) unless block_given?

        loop do
          size = io.read_int_or_eof
          break if size == Spark::Constant::DATA_EOF

          item1 = @serializer1.load(io.read(size))
          item2 = @serializer2.load(io.read_string)

          item1 = [item1] unless @serializer1.batched?
          item2 = [item2] unless @serializer2.batched?

          aggregate(item1, item2).each do |item|
            yield item
          end
        end
      end

    end
  end
end

Spark::Serializer.register('pair', Spark::Serializer::Pair)
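
load_from_io reads one frame per serializer from the stream and zips the two batches into key-value tuples; the "item = [item]" guards make sure zip always sees arrays, even when a serializer is not batched. The aggregation step in plain Ruby:

keys   = [1, 2, 3]
values = [:a, :b, :c]

keys.zip(values)     # => [[1, :a], [2, :b], [3, :c]]

# Unbatched items are wrapped first, so a single pair still zips correctly:
[42].zip([:answer])  # => [[42, :answer]]
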
data/lib/spark/serializer/text.rb
ADDED
@@ -0,0 +1,25 @@
module Spark
  module Serializer
    class Text < Base

      attr_reader :encoding

      def initialize(encoding=Encoding::UTF_8)
        error('Encoding must be an instance of Encoding') unless encoding.is_a?(Encoding)

        @encoding = encoding
      end

      def load(data)
        data.to_s.force_encoding(@encoding)
      end

      def to_s
        "Text(#{@encoding})"
      end

    end
  end
end

Spark::Serializer.register('string', 'text', Spark::Serializer::Text)
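
Note that #load only relabels the incoming bytes with the target encoding via force_encoding; it does not transcode them (that would be String#encode). The difference in isolation:

raw = "caf\xC3\xA9".b   # binary-tagged bytes, e.g. as read from a socket
raw.encoding            # => #<Encoding:ASCII-8BIT>

text = raw.force_encoding(Encoding::UTF_8)
text                    # => "café"
text.encoding           # => #<Encoding:UTF-8>
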
data/lib/spark/sort.rb
ADDED
@@ -0,0 +1,189 @@
module Spark
  module InternalSorter
    class Base
      def initialize(key_function)
        @key_function = key_function
      end
    end

    class Ascending < Base
      def sort(data)
        data.sort_by!(&@key_function)
      end
    end

    class Descending < Ascending
      def sort(data)
        super
        data.reverse!
      end
    end

    def self.get(ascending, key_function)
      if ascending
        type = Ascending
      else
        type = Descending
      end

      type.new(key_function)
    end
  end
end


module Spark
  class ExternalSorter

    include Spark::Helper::System

    # Memory held by objects awaiting GC cannot be reclaimed on demand,
    # so #make_parts needs some reserve
    MEMORY_RESERVE = 50 # %

    # Size of the chunk by which the memory limit grows, because the GC
    # does not clean up unreferenced variables immediately
    MEMORY_FREE_CHUNK = 10 # %

    # How many items are evaluated from the iterator at the start
    START_SLICE_SIZE = 10

    # Maximum slice size. A large value effectively disables memory control.
    MAX_SLICE_SIZE = 10_000

    # How many values are taken from each enumerator per round
    EVAL_N_VALUES = 10

    # Default key function
    KEY_FUNCTION = lambda{|item| item}

    attr_reader :total_memory, :memory_limit, :memory_chunk, :serializer

    def initialize(total_memory, serializer)
      @total_memory = total_memory
      @memory_limit = total_memory * (100-MEMORY_RESERVE) / 100
      @memory_chunk = total_memory * (100-MEMORY_FREE_CHUNK) / 100
      @serializer = serializer
    end

    def add_memory!
      @memory_limit += memory_chunk
    end

    def sort_by(iterator, ascending=true, key_function=KEY_FUNCTION)
      # Pass all arguments through, otherwise the enumerator would call back
      # with key_function in the ascending slot
      return to_enum(__callee__, iterator, ascending, key_function) unless block_given?

      create_temp_folder
      internal_sorter = Spark::InternalSorter.get(ascending, key_function)

      # Make N sorted enumerators
      parts = make_parts(iterator, internal_sorter)

      return [] if parts.empty?

      # A new key function is needed because the items now have a new structure:
      # from [1, 2, 3] to [[1, Enumerator], [2, Enumerator], [3, Enumerator]]
      key_function_with_enum = lambda{|(key, _)| key_function[key]}
      internal_sorter = Spark::InternalSorter.get(ascending, key_function_with_enum)

      heap  = []
      enums = []

      # Load first items to heap
      parts.each do |part|
        EVAL_N_VALUES.times {
          begin
            heap << [part.next, part]
          rescue StopIteration
            break
          end
        }
      end

      # Parts can be empty while the heap is not
      while parts.any? || heap.any?
        internal_sorter.sort(heap)

        # Every part is sorted and the heap holds EVAL_N_VALUES items from each,
        # so the smallest EVAL_N_VALUES items can go straight to the result
        EVAL_N_VALUES.times {
          break if heap.empty?

          item, enum = heap.shift
          enums << enum

          yield item
        }

        # Refill the heap from the parts whose items were just yielded
        while (enum = enums.shift)
          begin
            heap << [enum.next, enum]
          rescue StopIteration
            parts.delete(enum)
            enums.delete(enum)
          end
        end
      end

    ensure
      destroy_temp_folder
    end

    private

    def create_temp_folder
      @dir = Dir.mktmpdir
    end

    def destroy_temp_folder
      FileUtils.remove_entry_secure(@dir) if @dir
    end

    # A new part is created when the current part exceeds the memory limit
    # (which is variable). Every new part gets more memory because of the Ruby GC.
    def make_parts(iterator, internal_sorter)
      slice = START_SLICE_SIZE

      parts = []
      part  = []

      loop do
        begin
          # Enumerator does not have a slice method
          slice.times { part << iterator.next }
        rescue StopIteration
          break
        end

        # Careful: memory_limit is variable
        if memory_usage > memory_limit
          # Sort the current part with the original key_function
          internal_sorter.sort(part)
          # Tempfile for the current part;
          # it will be destroyed by #destroy_temp_folder
          file = Tempfile.new("part", @dir)
          serializer.dump(part, file)
          # The file position is now at the end of the file
          file.seek(0)
          parts << serializer.load(file)

          # Some memory will be released, but not immediately;
          # a fresh reserve is needed to continue
          part.clear
          add_memory!
        else
          slice = [slice*2, MAX_SLICE_SIZE].min
        end
      end

      # The last part, which never went to a file
      if part.any?
        internal_sorter.sort(part)
        parts << part.each
      end

      parts
    end

  end # ExternalSorter
end # Spark
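
ExternalSorter is an external merge sort: the input is cut into memory-bounded runs, each run is sorted and spilled to a tempfile, and the sorted runs are then merged through a small in-memory buffer. The merge phase can be sketched on its own; this self-contained version (the merge_sorted helper is hypothetical and skips the disk spill) mirrors the [item, enum] tuples used above:

# Merge any number of sorted enumerators into one sorted sequence.
def merge_sorted(*enums)
  return to_enum(__method__, *enums) unless block_given?

  # Seed the buffer with the head of every run
  heap = enums.filter_map do |e|
    begin
      [e.next, e]
    rescue StopIteration
      nil
    end
  end

  until heap.empty?
    heap.sort_by!(&:first)        # a real heap would make this O(log n)
    value, enum = heap.shift
    yield value

    begin
      heap << [enum.next, enum]   # refill from the run just consumed
    rescue StopIteration
    end
  end
end

runs = [[1, 4, 9], [2, 3, 8], [5, 6, 7]].map(&:each)
merge_sorted(*runs).to_a  # => [1, 2, 3, 4, 5, 6, 7, 8, 9]
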
data/lib/spark/stat_counter.rb
ADDED
@@ -0,0 +1,125 @@
module Spark
  class StatCounter

    attr_reader :n   # count of our values
    attr_reader :mu  # mean of our values
    attr_reader :m2  # variance numerator (sum of (x - mean)^2)
    attr_reader :max # max of our values
    attr_reader :min # min of our values

    def initialize(iterator)
      @n = 0
      @mu = 0.0
      @m2 = 0.0
      @max = -Float::INFINITY
      @min = Float::INFINITY

      merge(iterator)
    end

    def merge(other)
      if other.is_a?(Spark::StatCounter)
        merge_stat_counter(other)
      elsif other.respond_to?(:each)
        merge_array(other)
      else
        merge_value(other)
      end

      self
    end

    def sum
      @n * @mu
    end

    # Return the variance of the values.
    def variance
      if @n == 0
        Float::NAN
      else
        @m2 / @n
      end
    end

    # Return the sample variance, which corrects for bias in estimating the variance
    # by dividing by N-1 instead of N.
    def sample_variance
      if @n <= 1
        Float::NAN
      else
        @m2 / (@n - 1)
      end
    end

    # Return the standard deviation of the values.
    def stdev
      Math.sqrt(variance)
    end

    # Return the sample standard deviation of the values, which corrects for bias in
    # estimating the variance by dividing by N-1 instead of N.
    def sample_stdev
      Math.sqrt(sample_variance)
    end

    def to_s
      "(count: #{count}, mean: #{mean}, stdev: #{stdev}, max: #{max}, min: #{min})"
    end

    alias_method :count, :n
    alias_method :mean, :mu
    alias_method :max_value, :max
    alias_method :min_value, :min
    alias_method :sampleStdev, :sample_stdev
    alias_method :sampleVariance, :sample_variance

    private

    def merge_stat_counter(other)
      if other == self
        other = self.deep_copy
      end

      if @n == 0
        @n = other.n
        @mu = other.mu
        @m2 = other.m2
        @max = other.max
        @min = other.min
      elsif other.n != 0
        delta = other.mu - @mu

        if other.n * 10 < @n
          @mu = @mu + (delta * other.n) / (@n + other.n)
        elsif @n * 10 < other.n
          @mu = other.mu - (delta * @n) / (@n + other.n)
        else
          @mu = (@mu * @n + other.mu * other.n) / (@n + other.n)
        end

        @max = [@max, other.max].max
        @min = [@min, other.min].min

        @m2 += other.m2 + (delta * delta * @n * other.n) / (@n + other.n)
        @n += other.n
      end
    end

    def merge_array(array)
      array.each do |item|
        merge_value(item)
      end
    end

    def merge_value(value)
      delta = value - @mu
      @n += 1
      @mu += delta / @n
      @m2 += delta * (value - @mu)
      @max = [@max, value].max
      @min = [@min, value].min
    end

  end
end
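
merge_value above is Welford's online algorithm: the mean and the running sum of squared deviations (m2) are updated in a single pass. merge_stat_counter combines two counters with the parallel formula of Chan et al., where m2 merges as m2a + m2b + delta^2 * na * nb / (na + nb). Welford's update, checked against the direct computation:

values = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]

n  = 0
mu = 0.0
m2 = 0.0

values.each do |x|
  delta = x - mu
  n  += 1
  mu += delta / n
  m2 += delta * (x - mu)   # note: uses the already-updated mean
end

mu       # => 5.0, same as values.sum / values.size
m2 / n   # => 4.0, the population variance (#variance above)
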
data/lib/spark/storage_level.rb
ADDED
@@ -0,0 +1,39 @@
# Necessary libraries
Spark.load_lib

module Spark
  class StorageLevel

    def self.reload
      return if @reloaded
      reload!
      @reloaded = true
    end

    def self.reload!
      self.const_set(:NONE, JStorageLevel.NONE)
      self.const_set(:DISK_ONLY, JStorageLevel.DISK_ONLY)
      self.const_set(:DISK_ONLY_2, JStorageLevel.DISK_ONLY_2)
      self.const_set(:MEMORY_ONLY, JStorageLevel.MEMORY_ONLY)
      self.const_set(:MEMORY_ONLY_SER, JStorageLevel.MEMORY_ONLY_SER)
      self.const_set(:MEMORY_ONLY_2, JStorageLevel.MEMORY_ONLY_2)
      self.const_set(:MEMORY_ONLY_SER_2, JStorageLevel.MEMORY_ONLY_SER_2)
      self.const_set(:MEMORY_AND_DISK, JStorageLevel.MEMORY_AND_DISK)
      self.const_set(:MEMORY_AND_DISK_2, JStorageLevel.MEMORY_AND_DISK_2)
      self.const_set(:MEMORY_AND_DISK_SER, JStorageLevel.MEMORY_AND_DISK_SER)
      self.const_set(:MEMORY_AND_DISK_SER_2, JStorageLevel.MEMORY_AND_DISK_SER_2)
      self.const_set(:OFF_HEAP, JStorageLevel.OFF_HEAP)
    end

    def self.java_get(arg)
      reload

      if arg.is_a?(String)
        const_get(arg.upcase)
      else
        arg
      end
    end

  end
end