ruby-spark 1.1.0.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +37 -0
- data/Gemfile +47 -0
- data/Guardfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +252 -0
- data/Rakefile +35 -0
- data/TODO.md +6 -0
- data/benchmark/aggregate.rb +33 -0
- data/benchmark/bisect.rb +88 -0
- data/benchmark/comparison/prepare.sh +18 -0
- data/benchmark/comparison/python.py +156 -0
- data/benchmark/comparison/r.r +69 -0
- data/benchmark/comparison/ruby.rb +167 -0
- data/benchmark/comparison/run-all.sh +160 -0
- data/benchmark/comparison/scala.scala +181 -0
- data/benchmark/custom_marshal.rb +94 -0
- data/benchmark/digest.rb +150 -0
- data/benchmark/enumerator.rb +88 -0
- data/benchmark/serializer.rb +82 -0
- data/benchmark/sort.rb +43 -0
- data/benchmark/sort2.rb +164 -0
- data/benchmark/take.rb +28 -0
- data/bin/ruby-spark +8 -0
- data/example/pi.rb +28 -0
- data/example/website_search.rb +83 -0
- data/ext/ruby_c/extconf.rb +3 -0
- data/ext/ruby_c/murmur.c +158 -0
- data/ext/ruby_c/murmur.h +9 -0
- data/ext/ruby_c/ruby-spark.c +18 -0
- data/ext/ruby_java/Digest.java +36 -0
- data/ext/ruby_java/Murmur2.java +98 -0
- data/ext/ruby_java/RubySparkExtService.java +28 -0
- data/ext/ruby_java/extconf.rb +3 -0
- data/ext/spark/build.sbt +73 -0
- data/ext/spark/project/plugins.sbt +9 -0
- data/ext/spark/sbt/sbt +34 -0
- data/ext/spark/src/main/scala/Exec.scala +91 -0
- data/ext/spark/src/main/scala/MLLibAPI.scala +4 -0
- data/ext/spark/src/main/scala/Marshal.scala +52 -0
- data/ext/spark/src/main/scala/MarshalDump.scala +113 -0
- data/ext/spark/src/main/scala/MarshalLoad.scala +220 -0
- data/ext/spark/src/main/scala/RubyAccumulatorParam.scala +69 -0
- data/ext/spark/src/main/scala/RubyBroadcast.scala +13 -0
- data/ext/spark/src/main/scala/RubyConstant.scala +13 -0
- data/ext/spark/src/main/scala/RubyMLLibAPI.scala +55 -0
- data/ext/spark/src/main/scala/RubyMLLibUtilAPI.scala +21 -0
- data/ext/spark/src/main/scala/RubyPage.scala +34 -0
- data/ext/spark/src/main/scala/RubyRDD.scala +392 -0
- data/ext/spark/src/main/scala/RubySerializer.scala +14 -0
- data/ext/spark/src/main/scala/RubyTab.scala +11 -0
- data/ext/spark/src/main/scala/RubyUtils.scala +15 -0
- data/ext/spark/src/main/scala/RubyWorker.scala +257 -0
- data/ext/spark/src/test/scala/MarshalSpec.scala +84 -0
- data/lib/ruby-spark.rb +1 -0
- data/lib/spark.rb +198 -0
- data/lib/spark/accumulator.rb +260 -0
- data/lib/spark/broadcast.rb +98 -0
- data/lib/spark/build.rb +43 -0
- data/lib/spark/cli.rb +169 -0
- data/lib/spark/command.rb +86 -0
- data/lib/spark/command/base.rb +158 -0
- data/lib/spark/command/basic.rb +345 -0
- data/lib/spark/command/pair.rb +124 -0
- data/lib/spark/command/sort.rb +51 -0
- data/lib/spark/command/statistic.rb +144 -0
- data/lib/spark/command_builder.rb +141 -0
- data/lib/spark/command_validator.rb +34 -0
- data/lib/spark/config.rb +238 -0
- data/lib/spark/constant.rb +14 -0
- data/lib/spark/context.rb +322 -0
- data/lib/spark/error.rb +50 -0
- data/lib/spark/ext/hash.rb +41 -0
- data/lib/spark/ext/integer.rb +25 -0
- data/lib/spark/ext/io.rb +67 -0
- data/lib/spark/ext/ip_socket.rb +29 -0
- data/lib/spark/ext/module.rb +58 -0
- data/lib/spark/ext/object.rb +24 -0
- data/lib/spark/ext/string.rb +24 -0
- data/lib/spark/helper.rb +10 -0
- data/lib/spark/helper/logger.rb +40 -0
- data/lib/spark/helper/parser.rb +85 -0
- data/lib/spark/helper/serialize.rb +71 -0
- data/lib/spark/helper/statistic.rb +93 -0
- data/lib/spark/helper/system.rb +42 -0
- data/lib/spark/java_bridge.rb +19 -0
- data/lib/spark/java_bridge/base.rb +203 -0
- data/lib/spark/java_bridge/jruby.rb +23 -0
- data/lib/spark/java_bridge/rjb.rb +41 -0
- data/lib/spark/logger.rb +76 -0
- data/lib/spark/mllib.rb +100 -0
- data/lib/spark/mllib/classification/common.rb +31 -0
- data/lib/spark/mllib/classification/logistic_regression.rb +223 -0
- data/lib/spark/mllib/classification/naive_bayes.rb +97 -0
- data/lib/spark/mllib/classification/svm.rb +135 -0
- data/lib/spark/mllib/clustering/gaussian_mixture.rb +82 -0
- data/lib/spark/mllib/clustering/kmeans.rb +118 -0
- data/lib/spark/mllib/matrix.rb +120 -0
- data/lib/spark/mllib/regression/common.rb +73 -0
- data/lib/spark/mllib/regression/labeled_point.rb +41 -0
- data/lib/spark/mllib/regression/lasso.rb +100 -0
- data/lib/spark/mllib/regression/linear.rb +124 -0
- data/lib/spark/mllib/regression/ridge.rb +97 -0
- data/lib/spark/mllib/ruby_matrix/matrix_adapter.rb +53 -0
- data/lib/spark/mllib/ruby_matrix/vector_adapter.rb +57 -0
- data/lib/spark/mllib/stat/distribution.rb +12 -0
- data/lib/spark/mllib/vector.rb +185 -0
- data/lib/spark/rdd.rb +1377 -0
- data/lib/spark/sampler.rb +92 -0
- data/lib/spark/serializer.rb +79 -0
- data/lib/spark/serializer/auto_batched.rb +59 -0
- data/lib/spark/serializer/base.rb +63 -0
- data/lib/spark/serializer/batched.rb +84 -0
- data/lib/spark/serializer/cartesian.rb +13 -0
- data/lib/spark/serializer/compressed.rb +27 -0
- data/lib/spark/serializer/marshal.rb +17 -0
- data/lib/spark/serializer/message_pack.rb +23 -0
- data/lib/spark/serializer/oj.rb +23 -0
- data/lib/spark/serializer/pair.rb +41 -0
- data/lib/spark/serializer/text.rb +25 -0
- data/lib/spark/sort.rb +189 -0
- data/lib/spark/stat_counter.rb +125 -0
- data/lib/spark/storage_level.rb +39 -0
- data/lib/spark/version.rb +3 -0
- data/lib/spark/worker/master.rb +144 -0
- data/lib/spark/worker/spark_files.rb +15 -0
- data/lib/spark/worker/worker.rb +200 -0
- data/ruby-spark.gemspec +47 -0
- data/spec/generator.rb +37 -0
- data/spec/inputs/lorem_300.txt +316 -0
- data/spec/inputs/numbers/1.txt +50 -0
- data/spec/inputs/numbers/10.txt +50 -0
- data/spec/inputs/numbers/11.txt +50 -0
- data/spec/inputs/numbers/12.txt +50 -0
- data/spec/inputs/numbers/13.txt +50 -0
- data/spec/inputs/numbers/14.txt +50 -0
- data/spec/inputs/numbers/15.txt +50 -0
- data/spec/inputs/numbers/16.txt +50 -0
- data/spec/inputs/numbers/17.txt +50 -0
- data/spec/inputs/numbers/18.txt +50 -0
- data/spec/inputs/numbers/19.txt +50 -0
- data/spec/inputs/numbers/2.txt +50 -0
- data/spec/inputs/numbers/20.txt +50 -0
- data/spec/inputs/numbers/3.txt +50 -0
- data/spec/inputs/numbers/4.txt +50 -0
- data/spec/inputs/numbers/5.txt +50 -0
- data/spec/inputs/numbers/6.txt +50 -0
- data/spec/inputs/numbers/7.txt +50 -0
- data/spec/inputs/numbers/8.txt +50 -0
- data/spec/inputs/numbers/9.txt +50 -0
- data/spec/inputs/numbers_0_100.txt +101 -0
- data/spec/inputs/numbers_1_100.txt +100 -0
- data/spec/lib/collect_spec.rb +42 -0
- data/spec/lib/command_spec.rb +68 -0
- data/spec/lib/config_spec.rb +64 -0
- data/spec/lib/context_spec.rb +165 -0
- data/spec/lib/ext_spec.rb +72 -0
- data/spec/lib/external_apps_spec.rb +45 -0
- data/spec/lib/filter_spec.rb +80 -0
- data/spec/lib/flat_map_spec.rb +100 -0
- data/spec/lib/group_spec.rb +109 -0
- data/spec/lib/helper_spec.rb +19 -0
- data/spec/lib/key_spec.rb +41 -0
- data/spec/lib/manipulation_spec.rb +122 -0
- data/spec/lib/map_partitions_spec.rb +87 -0
- data/spec/lib/map_spec.rb +91 -0
- data/spec/lib/mllib/classification_spec.rb +54 -0
- data/spec/lib/mllib/clustering_spec.rb +35 -0
- data/spec/lib/mllib/matrix_spec.rb +32 -0
- data/spec/lib/mllib/regression_spec.rb +116 -0
- data/spec/lib/mllib/vector_spec.rb +77 -0
- data/spec/lib/reduce_by_key_spec.rb +118 -0
- data/spec/lib/reduce_spec.rb +131 -0
- data/spec/lib/sample_spec.rb +46 -0
- data/spec/lib/serializer_spec.rb +88 -0
- data/spec/lib/sort_spec.rb +58 -0
- data/spec/lib/statistic_spec.rb +170 -0
- data/spec/lib/whole_text_files_spec.rb +33 -0
- data/spec/spec_helper.rb +38 -0
- metadata +389 -0
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
# Parse sitemap and search word on every page
|
|
4
|
+
|
|
5
|
+
require 'optparse'
|
|
6
|
+
require 'open-uri'
|
|
7
|
+
require 'nokogiri'
|
|
8
|
+
require 'ruby-spark'
|
|
9
|
+
|
|
10
|
+
# Default settings; every value can be overridden from the command line.
options = {
  sitemap: 'http://fit.cvut.cz/sitemap.xml',
  query: 'cvut',
  workers: 2
}

opt_parser = OptionParser.new do |opts|
  opts.banner = 'Usage: website_search.rb [options]'

  opts.separator ''
  opts.separator 'Specific options:'

  opts.on('-s', '--sitemap SITEMAP', 'Sitemap URL') do |sitemap|
    options[:sitemap] = sitemap
  end

  opts.on('-q', '--query QUERY', 'Query to search') do |query|
    options[:query] = query
  end

  # Integer makes OptionParser convert (and validate) the argument.
  opts.on('-w', '--workers WORKERS_NUM', Integer, 'Number of workers') do |workers|
    options[:workers] = workers
  end

  # `--quiet` is the intended spelling; the historical misspelling `--quite`
  # is kept as an alias so existing command lines keep working.
  opts.on('--quiet', '--quite', 'Run quietly') do
    # NOTE(review): confirm that `disabled` is the logger call that actually
    # switches logging off (the logger class is not visible from this script).
    Spark.logger.disabled
  end

  opts.on_tail('-h', '--help', 'Show this message') do
    puts opts
    exit
  end
end

# Consumes the recognised switches from ARGV.
opt_parser.parse!

# Page URLs collected by parse_sitemap.
@links = []
|
|
47
|
+
|
|
48
|
+
# Walks a sitemap document and appends every page URL it lists to @links.
#
# doc - a parsed document responding to #xpath (a Nokogiri document in this
#       script).
#
# Every <sitemapindex>/<sitemap>/<loc> entry is downloaded, parsed and walked
# recursively; every <url>/<loc> entry is stored as a link.
def parse_sitemap(doc)
  doc.xpath('//sitemapindex/sitemap/loc').each do |loc|
    # The location comes from a remote document, so it must not go through
    # Kernel#open (which would also accept local paths and "|command" pipes).
    # URI#open (open-uri) only fetches real URLs, and the block form closes
    # the handle as soon as the document has been parsed.
    next_doc = URI.parse(loc.text).open { |io| Nokogiri::HTML(io) }
    parse_sitemap(next_doc)
  end

  doc.xpath('//url/loc').each do |loc|
    @links << loc.text
  end
end
|
|
58
|
+
|
|
59
|
+
# Download and parse the root sitemap. URI#open (open-uri) is used instead of
# Kernel#open: it works on every Ruby version that ships open-uri, only
# accepts real URLs, and the block form closes the handle after parsing.
doc = URI.parse(options[:sitemap]).open { |io| Nokogiri::HTML(io) }
parse_sitemap(doc)
|
|
61
|
+
|
|
62
|
+
# Map function
#
# Downloads +url+ and returns the pair [url, number of matches of `query`
# in the page body]. `query` is not defined in this script; presumably it is
# made available by the `.bind(query: ...)` call on the RDD below.
# Any error while fetching or scanning is deliberately treated as
# "zero matches" so that one broken page does not abort the whole job.
func = Proc.new do |url|
  begin
    # URI#open instead of Kernel#open: the URLs come from a remote sitemap
    # and must never be interpreted as a local path or a "|command" pipe.
    URI.parse(url).open {|f|
      [url, f.read.scan(query).count]
    }
  rescue
    [url, 0]
  end
end
|
|
72
|
+
|
|
73
|
+
# Start Spark before the context (Spark.sc) is used.
Spark.start

# Build the job: distribute the collected links (second argument is the
# --workers option), make open-uri and the `query` value available to the
# map function, count the matches per page and sort the [url, count] pairs
# by their count.
# NOTE(review): the `false` passed to sort_by presumably requests descending
# order - confirm against the RDD#sort_by documentation.
rdd = Spark.sc.parallelize(@links, options[:workers])
              .add_library('open-uri')
              .bind(query: options[:query])
              .map(func)
              .sort_by(lambda{|(_, value)| value}, false)

# Run the job and print one "url => count" line per page.
rdd.collect.each do |(url, count)|
  puts "#{url} => #{count}"
end
|
data/ext/ruby_c/murmur.c
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
#include "murmur.h"
|
|
2
|
+
|
|
3
|
+
#if defined(_MSC_VER)
|
|
4
|
+
#define BIG_CONSTANT(x) (x)
|
|
5
|
+
#else
|
|
6
|
+
#define BIG_CONSTANT(x) (x##LLU)
|
|
7
|
+
#endif
|
|
8
|
+
|
|
9
|
+
/*-----------------------------------------------------------------------------
// MurmurHash2, 64-bit versions, by Austin Appleby
//
// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
// and endian-ness issues if used across multiple platforms.
//
// 64-bit hash for 64-bit platforms
//
// key  - start of the bytes to hash; read as 64-bit words first, the
//        remaining (len & 7) bytes are read one at a time
// len  - number of bytes to hash
// seed - initial value mixed into the hash state
//
// Returns the 64-bit hash value.
*/

uint64_t MurmurHash64A(const void * key, int len, uint64_t seed)
{
  const uint64_t mul = BIG_CONSTANT(0xc6a4a7935bd1e995);
  const int shift = 47;

  const uint64_t * block = (const uint64_t *)key;
  const uint64_t * const last = block + (len/8);
  const unsigned char * tail;
  const int rest = len & 7;
  int i;

  uint64_t hash = seed ^ (len * mul);

  /* Body: mix every complete 8-byte word into the state. */
  for(; block != last; ++block)
  {
    uint64_t k = *block;

    k *= mul;
    k ^= k >> shift;
    k *= mul;

    hash ^= k;
    hash *= mul;
  }

  /* Tail: fold in the 1..7 leftover bytes, highest index first, then mix once. */
  tail = (const unsigned char *)block;

  if(rest != 0)
  {
    for(i = rest - 1; i >= 0; --i)
    {
      hash ^= ((uint64_t) tail[i]) << (8 * i);
    }

    hash *= mul;
  }

  /* Final avalanche. */
  hash ^= hash >> shift;
  hash *= mul;
  hash ^= hash >> shift;

  return hash;
}
|
|
60
|
+
|
|
61
|
+
/* 64-bit hash for 32-bit platforms
 *
 * Uses only 32-bit arithmetic: two 32-bit halves are mixed separately and
 * joined into the 64-bit result at the end.
 *
 * key  - start of the bytes to hash (read as 32-bit words, then byte-wise)
 * len  - number of bytes to hash
 * seed - low 32 bits seed the first half, high 32 bits the second half
 */

uint64_t MurmurHash64B(const void * key, int len, uint64_t seed)
{
  const uint32_t mul = 0x5bd1e995;
  const int shift = 24;

  uint32_t half1 = ((uint32_t) seed) ^ len;
  uint32_t half2 = (uint32_t) (seed >> 32);

  const uint32_t * word = (const uint32_t *)key;
  const unsigned char * tail;
  uint32_t k;
  int i;

  /* Body: each round feeds one word into each half (8 bytes per round). */
  for(; len >= 8; len -= 8)
  {
    k = *word++;
    k *= mul; k ^= k >> shift; k *= mul;
    half1 *= mul; half1 ^= k;

    k = *word++;
    k *= mul; k ^= k >> shift; k *= mul;
    half2 *= mul; half2 ^= k;
  }

  /* One remaining complete word goes into the first half. */
  if(len >= 4)
  {
    k = *word++;
    k *= mul; k ^= k >> shift; k *= mul;
    half1 *= mul; half1 ^= k;
    len -= 4;
  }

  /* Tail: 1..3 leftover bytes go into the second half, highest index first. */
  tail = (const unsigned char *)word;

  if(len > 0)
  {
    for(i = len - 1; i >= 0; --i)
    {
      half2 ^= ((uint32_t) tail[i]) << (8 * i);
    }

    half2 *= mul;
  }

  /* Cross-mix the two halves. */
  half1 ^= half2 >> 18; half1 *= mul;
  half2 ^= half1 >> 22; half2 *= mul;
  half1 ^= half2 >> 17; half1 *= mul;
  half2 ^= half1 >> 19; half2 *= mul;

  return (((uint64_t) half1) << 32) | half2;
}
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
// ================================================================================================
|
|
117
|
+
// Ruby methods
|
|
118
|
+
|
|
119
|
+
#define PORTABLE_HASH_SEED 16154832
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
VALUE murmur2_digest(VALUE rb_str, uint64_t seed)
|
|
123
|
+
{
|
|
124
|
+
StringValue(rb_str);
|
|
125
|
+
|
|
126
|
+
void * key = RSTRING_PTR(rb_str);
|
|
127
|
+
long len = RSTRING_LEN(rb_str);
|
|
128
|
+
|
|
129
|
+
uint64_t result = MurmurHash64A(key, len, seed);
|
|
130
|
+
|
|
131
|
+
return LONG2FIX(result);
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
// ------------------------------------------------------------------------------------------------
|
|
135
|
+
// Spark::Digest::Murmur2.digest
|
|
136
|
+
|
|
137
|
+
VALUE method_murmur2_digest(int argc, VALUE *argv, VALUE klass)
|
|
138
|
+
{
|
|
139
|
+
if(argc == 0 || argc > 2){
|
|
140
|
+
rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
uint64_t seed = (argc == 1 ? 0 : NUM2UINT(argv[1]));
|
|
144
|
+
|
|
145
|
+
return murmur2_digest(argv[0], seed);
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
// ------------------------------------------------------------------------------------------------
|
|
149
|
+
// Spark::Digest.portable_hash
|
|
150
|
+
|
|
151
|
+
VALUE method_portable_hash(int argc, VALUE *argv, VALUE klass)
|
|
152
|
+
{
|
|
153
|
+
if(argc != 1){
|
|
154
|
+
rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
return murmur2_digest(argv[0], PORTABLE_HASH_SEED);
|
|
158
|
+
}
|
data/ext/ruby_c/murmur.h
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
#include "ruby.h"
|
|
2
|
+
#include "murmur.h"
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
VALUE SparkModule;
|
|
6
|
+
VALUE SparkDigestModule;
|
|
7
|
+
VALUE SparkDigestMurmur2Class;
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
void Init_ruby_spark_ext()
|
|
11
|
+
{
|
|
12
|
+
SparkModule = rb_define_module("Spark");
|
|
13
|
+
SparkDigestModule = rb_define_module_under(SparkModule, "Digest");
|
|
14
|
+
SparkDigestMurmur2Class = rb_define_class_under(SparkDigestModule, "Murmur2", rb_cObject);
|
|
15
|
+
|
|
16
|
+
rb_define_singleton_method(SparkDigestModule, "portable_hash", method_portable_hash, -1);
|
|
17
|
+
rb_define_singleton_method(SparkDigestMurmur2Class, "digest", method_murmur2_digest, -1);
|
|
18
|
+
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import org.jruby.Ruby;
|
|
2
|
+
import org.jruby.RubyModule;
|
|
3
|
+
import org.jruby.RubyObject;
|
|
4
|
+
import org.jruby.RubyClass;
|
|
5
|
+
import org.jruby.RubyString;
|
|
6
|
+
import org.jruby.RubyFixnum;
|
|
7
|
+
import org.jruby.anno.JRubyModule;
|
|
8
|
+
import org.jruby.anno.JRubyMethod;
|
|
9
|
+
import org.jruby.runtime.ThreadContext;
|
|
10
|
+
import org.jruby.runtime.builtin.IRubyObject;
|
|
11
|
+
|
|
12
|
+
@JRubyModule(name="Spark::Digest")
|
|
13
|
+
public class Digest extends RubyObject{
|
|
14
|
+
|
|
15
|
+
// Have to be the same as in C extension
|
|
16
|
+
final static long PORTABLE_HASH_SEED = 16154832;
|
|
17
|
+
|
|
18
|
+
public Digest(final Ruby ruby, RubyClass rubyClass) {
|
|
19
|
+
super(ruby, rubyClass);
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
@JRubyMethod(module=true)
|
|
23
|
+
public static IRubyObject portable_hash(ThreadContext context, IRubyObject self, IRubyObject arg) {
|
|
24
|
+
Ruby ruby = self.getRuntime();
|
|
25
|
+
|
|
26
|
+
RubyString keyString = (RubyString)arg;
|
|
27
|
+
|
|
28
|
+
long hash = Murmur2.hash64(keyString.getBytes(), (int)keyString.length().getLongValue(), PORTABLE_HASH_SEED);
|
|
29
|
+
|
|
30
|
+
RubyFixnum result = new RubyFixnum(ruby, hash);
|
|
31
|
+
|
|
32
|
+
return result;
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
}
|
|
36
|
+
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import org.jruby.Ruby;
|
|
2
|
+
import org.jruby.RubyClass;
|
|
3
|
+
import org.jruby.RubyObject;
|
|
4
|
+
import org.jruby.RubyString;
|
|
5
|
+
import org.jruby.RubyFixnum;
|
|
6
|
+
import org.jruby.anno.JRubyClass;
|
|
7
|
+
import org.jruby.anno.JRubyMethod;
|
|
8
|
+
import org.jruby.runtime.ThreadContext;
|
|
9
|
+
import org.jruby.runtime.builtin.IRubyObject;
|
|
10
|
+
|
|
11
|
+
/** Murmur hash 2.0.
|
|
12
|
+
*
|
|
13
|
+
* The murmur hash is a relative fast hash function from
|
|
14
|
+
* http://murmurhash.googlepages.com/ for platforms with efficient
|
|
15
|
+
* multiplication.
|
|
16
|
+
*
|
|
17
|
+
* http://d3s.mff.cuni.cz/~holub/sw/javamurmurhash/
|
|
18
|
+
*
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
@JRubyClass(name="Spark::Digest::Murmur2")
|
|
22
|
+
public class Murmur2 extends RubyObject {
|
|
23
|
+
|
|
24
|
+
public Murmur2(final Ruby ruby, RubyClass rubyClass) {
|
|
25
|
+
super(ruby, rubyClass);
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
@JRubyMethod(required=1, optional=1, module=true)
|
|
29
|
+
public static IRubyObject digest(ThreadContext context, IRubyObject self, IRubyObject[] args) {
|
|
30
|
+
Ruby ruby = context.getRuntime();
|
|
31
|
+
|
|
32
|
+
RubyString keyString = (RubyString)args[0];
|
|
33
|
+
long seed;
|
|
34
|
+
|
|
35
|
+
if(args.length > 1){
|
|
36
|
+
RubyFixnum rb_seed = (RubyFixnum)args[1];
|
|
37
|
+
seed = rb_seed.getLongValue();
|
|
38
|
+
}
|
|
39
|
+
else{
|
|
40
|
+
seed = 0;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
long hash = hash64(keyString.getBytes(), (int)keyString.length().getLongValue(), seed);
|
|
44
|
+
|
|
45
|
+
RubyFixnum result = new RubyFixnum(ruby, hash);
|
|
46
|
+
return result;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
/** Generates 64 bit hash from byte array of the given length and seed.
|
|
51
|
+
*
|
|
52
|
+
* @param data byte array to hash
|
|
53
|
+
* @param length length of the array to hash
|
|
54
|
+
* @param seed initial seed value
|
|
55
|
+
* @return 64 bit hash of the given array
|
|
56
|
+
*/
|
|
57
|
+
public static long hash64(final byte[] data, int length, long seed) {
|
|
58
|
+
final long m = 0xc6a4a7935bd1e995L;
|
|
59
|
+
final int r = 47;
|
|
60
|
+
|
|
61
|
+
long h = (seed&0xffffffffl)^(length*m);
|
|
62
|
+
|
|
63
|
+
int length8 = length/8;
|
|
64
|
+
|
|
65
|
+
for (int i=0; i<length8; i++) {
|
|
66
|
+
final int i8 = i*8;
|
|
67
|
+
long k = ((long)data[i8+0]&0xff) +(((long)data[i8+1]&0xff)<<8)
|
|
68
|
+
+(((long)data[i8+2]&0xff)<<16) +(((long)data[i8+3]&0xff)<<24)
|
|
69
|
+
+(((long)data[i8+4]&0xff)<<32) +(((long)data[i8+5]&0xff)<<40)
|
|
70
|
+
+(((long)data[i8+6]&0xff)<<48) +(((long)data[i8+7]&0xff)<<56);
|
|
71
|
+
|
|
72
|
+
k *= m;
|
|
73
|
+
k ^= k >>> r;
|
|
74
|
+
k *= m;
|
|
75
|
+
|
|
76
|
+
h ^= k;
|
|
77
|
+
h *= m;
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
switch (length%8) {
|
|
81
|
+
case 7: h ^= (long)(data[(length&~7)+6]&0xff) << 48;
|
|
82
|
+
case 6: h ^= (long)(data[(length&~7)+5]&0xff) << 40;
|
|
83
|
+
case 5: h ^= (long)(data[(length&~7)+4]&0xff) << 32;
|
|
84
|
+
case 4: h ^= (long)(data[(length&~7)+3]&0xff) << 24;
|
|
85
|
+
case 3: h ^= (long)(data[(length&~7)+2]&0xff) << 16;
|
|
86
|
+
case 2: h ^= (long)(data[(length&~7)+1]&0xff) << 8;
|
|
87
|
+
case 1: h ^= (long)(data[length&~7]&0xff);
|
|
88
|
+
h *= m;
|
|
89
|
+
};
|
|
90
|
+
|
|
91
|
+
h ^= h >>> r;
|
|
92
|
+
h *= m;
|
|
93
|
+
h ^= h >>> r;
|
|
94
|
+
|
|
95
|
+
return h;
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import org.jruby.Ruby;
|
|
2
|
+
import org.jruby.RubyClass;
|
|
3
|
+
import org.jruby.RubyModule;
|
|
4
|
+
import org.jruby.runtime.ObjectAllocator;
|
|
5
|
+
import org.jruby.runtime.builtin.IRubyObject;
|
|
6
|
+
import org.jruby.runtime.load.BasicLibraryService;
|
|
7
|
+
|
|
8
|
+
public class RubySparkExtService implements BasicLibraryService
|
|
9
|
+
{
|
|
10
|
+
public boolean basicLoad(final Ruby ruby) throws java.io.IOException {
|
|
11
|
+
|
|
12
|
+
RubyModule sparkModule = ruby.defineModule("Spark");
|
|
13
|
+
RubyModule sparkDigestModule = sparkModule.defineModuleUnder("Digest");
|
|
14
|
+
RubyClass sparkDigestMurmur2Class = sparkDigestModule.defineClassUnder("Murmur2", ruby.getObject(), sparkDigestMurmur2Allocator);
|
|
15
|
+
|
|
16
|
+
sparkDigestModule.defineAnnotatedMethods(Digest.class);
|
|
17
|
+
sparkDigestMurmur2Class.defineAnnotatedMethods(Murmur2.class);
|
|
18
|
+
|
|
19
|
+
return true;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
public static ObjectAllocator sparkDigestMurmur2Allocator = new ObjectAllocator() {
|
|
23
|
+
public IRubyObject allocate(Ruby ruby, RubyClass rubyClass) {
|
|
24
|
+
return new Murmur2(ruby, rubyClass);
|
|
25
|
+
}
|
|
26
|
+
};
|
|
27
|
+
|
|
28
|
+
}
|