szaru 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,157 @@
1
+
2
+ SHELL = /bin/sh
3
+
4
+ #### Start of system configuration section. ####
5
+
6
+ srcdir = .
7
+ topdir = /usr/local/lib/ruby/1.8/i686-linux
8
+ hdrdir = $(topdir)
9
+ VPATH = $(srcdir):$(topdir):$(hdrdir)
10
+ exec_prefix = $(prefix)
11
+ prefix = $(DESTDIR)/usr/local
12
+ sharedstatedir = $(prefix)/com
13
+ mandir = $(datarootdir)/man
14
+ psdir = $(docdir)
15
+ oldincludedir = $(DESTDIR)/usr/include
16
+ localedir = $(datarootdir)/locale
17
+ bindir = $(exec_prefix)/bin
18
+ libexecdir = $(exec_prefix)/libexec
19
+ sitedir = $(libdir)/ruby/site_ruby
20
+ htmldir = $(docdir)
21
+ vendorarchdir = $(vendorlibdir)/$(sitearch)
22
+ includedir = $(prefix)/include
23
+ infodir = $(datarootdir)/info
24
+ vendorlibdir = $(vendordir)/$(ruby_version)
25
+ sysconfdir = $(prefix)/etc
26
+ libdir = $(exec_prefix)/lib
27
+ sbindir = $(exec_prefix)/sbin
28
+ rubylibdir = $(libdir)/ruby/$(ruby_version)
29
+ docdir = $(datarootdir)/doc/$(PACKAGE)
30
+ dvidir = $(docdir)
31
+ vendordir = $(libdir)/ruby/vendor_ruby
32
+ datarootdir = $(prefix)/share
33
+ pdfdir = $(docdir)
34
+ archdir = $(rubylibdir)/$(arch)
35
+ sitearchdir = $(sitelibdir)/$(sitearch)
36
+ datadir = $(datarootdir)
37
+ localstatedir = $(prefix)/var
38
+ sitelibdir = $(sitedir)/$(ruby_version)
39
+
40
+ CC = gcc
41
+ LIBRUBY = $(LIBRUBY_A)
42
+ LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
43
+ LIBRUBYARG_SHARED = -Wl,-R -Wl,$(libdir) -L$(libdir)
44
+ LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static
45
+
46
+ RUBY_EXTCONF_H =
47
+ CFLAGS = -fPIC -g -O2 $(cflags)
48
+ INCFLAGS = -I. -I. -I/usr/local/lib/ruby/1.8/i686-linux -I.
49
+ DEFS = -D_FILE_OFFSET_BITS=64
50
+ CPPFLAGS = -D_FILE_OFFSET_BITS=64
51
+ CXXFLAGS = $(CFLAGS)
52
+ ldflags = -L. -rdynamic -Wl,-export-dynamic
53
+ dldflags =
54
+ archflag =
55
+ DLDFLAGS = $(ldflags) $(dldflags) $(archflag)
56
+ LDSHARED = $(CC) -shared
57
+ AR = ar
58
+ EXEEXT =
59
+
60
+ RUBY_INSTALL_NAME = ruby
61
+ RUBY_SO_NAME = ruby
62
+ arch = i686-linux
63
+ sitearch = i686-linux
64
+ ruby_version = 1.8
65
+ ruby = /usr/local/bin/ruby
66
+ RUBY = $(ruby)
67
+ RM = rm -f
68
+ MAKEDIRS = mkdir -p
69
+ INSTALL = /usr/bin/install -c
70
+ INSTALL_PROG = $(INSTALL) -m 0755
71
+ INSTALL_DATA = $(INSTALL) -m 644
72
+ COPY = cp
73
+
74
+ #### End of system configuration section. ####
75
+
76
+ preload =
77
+
78
+ libpath = . $(libdir)
79
+ LIBPATH = -L. -L$(libdir) -Wl,-R$(libdir)
80
+ DEFFILE =
81
+
82
+ CLEANFILES = mkmf.log
83
+ DISTCLEANFILES =
84
+
85
+ extout =
86
+ extout_prefix =
87
+ target_prefix =
88
+ LOCAL_LIBS =
89
+ LIBS = -lszaru -lrt -ldl -lcrypt -lm -lc
90
+ SRCS = rb_szaru.cc
91
+ OBJS = rb_szaru.o
92
+ TARGET = szaru
93
+ DLLIB = $(TARGET).so
94
+ EXTSTATIC =
95
+ STATIC_LIB =
96
+
97
+ BINDIR = $(bindir)
98
+ RUBYCOMMONDIR = $(sitedir)$(target_prefix)
99
+ RUBYLIBDIR = $(sitelibdir)$(target_prefix)
100
+ RUBYARCHDIR = $(sitearchdir)$(target_prefix)
101
+
102
+ TARGET_SO = $(DLLIB)
103
+ CLEANLIBS = $(TARGET).so $(TARGET).il? $(TARGET).tds $(TARGET).map
104
+ CLEANOBJS = *.o *.a *.s[ol] *.pdb *.exp *.bak
105
+
106
# The targets listed here name actions rather than files.  Declaring them
# phony keeps a stray file called e.g. "clean" or "install" in the build
# directory from making the target look up to date.
.PHONY: all static clean distclean realclean \
        install install-so install-rb install-rb-default \
        pre-install-rb pre-install-rb-default \
        site-install site-install-so site-install-rb

all:		$(DLLIB)
static:		$(STATIC_LIB)

# Errors are ignored on purpose: the files may simply not exist yet.
clean:
	@-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES)

distclean:	clean
	@-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
	@-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)

realclean:	distclean
install: install-so install-rb

install-so: $(RUBYARCHDIR)
install-so: $(RUBYARCHDIR)/$(DLLIB)
# The destination directory is created here as well, so the copy does not
# depend on the sibling $(RUBYARCHDIR) prerequisite having been built first
# (the two are unordered under a parallel make).
$(RUBYARCHDIR)/$(DLLIB): $(DLLIB)
	$(MAKEDIRS) $(RUBYARCHDIR)
	$(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
install-rb: pre-install-rb install-rb-default
install-rb-default: pre-install-rb-default
pre-install-rb: Makefile
pre-install-rb-default: Makefile
$(RUBYARCHDIR):
	$(MAKEDIRS) $@

site-install: site-install-so site-install-rb
site-install-so: install-so
site-install-rb: install-rb
133
+
134
+ .SUFFIXES: .c .m .cc .cxx .cpp .C .o
135
+
136
+ .cc.o:
137
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<
138
+
139
+ .cxx.o:
140
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<
141
+
142
+ .cpp.o:
143
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<
144
+
145
+ .C.o:
146
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<
147
+
148
+ .c.o:
149
+ $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) -c $<
150
+
151
+ $(DLLIB): $(OBJS) Makefile
152
+ @-$(RM) $@
153
+ $(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
154
+
155
+
156
+
157
+ $(OBJS): ruby.h defines.h
@@ -0,0 +1,69 @@
1
+ // Copyright 2010 Yuji Kaneda
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ // ------------------------------------------------------------------------
15
+
16
+ // Converter is specialized only for int32_t, int64_t and double.
17
+ template< typename Value >
18
+ class Converter {
19
+ public:
20
+ static inline VALUE ToRuby(Value value);
21
+ static inline Value FromRuby(VALUE value);
22
+ static inline void CheckType(VALUE value);
23
+ };
24
+
25
+ template<> inline
26
+ VALUE Converter<int32_t>::ToRuby(int32_t value){
27
+ return INT2FIX(value);
28
+ }
29
+
30
+ template<> inline
31
+ VALUE Converter<int64_t>::ToRuby(int64_t value){
32
+ return LONG2FIX(value);
33
+ }
34
+
35
+ template<> inline
36
+ VALUE Converter<double>::ToRuby(double value){
37
+ return rb_float_new(value);
38
+ }
39
+
40
+ template<> inline
41
+ int32_t Converter<int32_t>::FromRuby(VALUE value){
42
+ return FIX2INT(value);
43
+ }
44
+
45
+ template<> inline
46
+ int64_t Converter<int64_t>::FromRuby(VALUE value){
47
+ return FIX2LONG(value);
48
+ }
49
+
50
+ template<> inline
51
+ double Converter<double>::FromRuby(VALUE value){
52
+ return RFLOAT(value)->value;
53
+ }
54
+
55
+ template<> inline
56
+ void Converter<int32_t>::CheckType(VALUE value){
57
+ Check_Type(value, T_FIXNUM);
58
+ }
59
+
60
+ template<> inline
61
+ void Converter<int64_t>::CheckType(VALUE value){
62
+ Check_Type(value, T_FIXNUM);
63
+ }
64
+
65
+ template<> inline
66
+ void Converter<double>::CheckType(VALUE value){
67
+ Check_Type(value, T_FLOAT);
68
+ }
69
+
@@ -0,0 +1,5 @@
1
# Build configuration for the szaru extension.
# Accepts --with-szaru-dir / --with-szaru-include / --with-szaru-lib.
require "mkmf"

dir_config('szaru')

# Fail loudly when libszaru cannot be linked.  Silently skipping
# create_makefile leaves the build without a Makefile and without any hint
# about what is missing.
unless have_library('szaru')
  abort "libszaru was not found; install SZaru or pass --with-szaru-dir=PATH"
end

create_makefile('szaru')
@@ -0,0 +1,268 @@
1
+ // Copyright 2010 Yuji Kaneda
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ // ------------------------------------------------------------------------
15
+
16
+ // Ruby Binding of SZaru
17
+
18
+ #include <ruby.h>
19
+ #include <szaru.h>
20
+ // local include file
21
+ #include "converter.h"
22
+
23
+ extern "C" {
24
+ void Init_szaru(void);
25
+ }
26
+
27
+ namespace {
28
+
29
+ template<typename Value>
30
+ class RubyQuantileEstimator {
31
+ private:
32
+ static void
33
+ Free(SZaru::QuantileEstimator<Value> **ptr)
34
+ {
35
+ if (*ptr) {
36
+ delete *ptr;
37
+ }
38
+ }
39
+
40
+ static VALUE
41
+ Alloc(VALUE klass)
42
+ {
43
+ SZaru::QuantileEstimator<Value> **ptr = ALLOC(SZaru::QuantileEstimator<Value>*);
44
+ *ptr = NULL;
45
+ return Data_Wrap_Struct(klass, 0, Free, ptr);
46
+ }
47
+
48
+ static VALUE
49
+ Initialize(VALUE self, VALUE maxElems)
50
+ {
51
+ SZaru::QuantileEstimator<Value> **ptr;
52
+ Data_Get_Struct(self, SZaru::QuantileEstimator<Value>*, ptr);
53
+ *ptr = SZaru::QuantileEstimator<Value>::Create(NUM2LONG(maxElems));
54
+ return Qnil;
55
+ }
56
+
57
+ static VALUE
58
+ AddElem(VALUE self, VALUE elem)
59
+ {
60
+ SZaru::QuantileEstimator<Value> **qe;
61
+ Converter<Value>::CheckType(elem);
62
+ Data_Get_Struct(self, SZaru::QuantileEstimator<Value>*, qe);
63
+ (*qe)->AddElem(Converter<Value>::FromRuby(elem));
64
+ return Qnil;
65
+ }
66
+
67
+ static VALUE
68
+ Estimate(VALUE self)
69
+ {
70
+ SZaru::QuantileEstimator<Value> **qe;
71
+ Data_Get_Struct(self, SZaru::QuantileEstimator<Value>*, qe);
72
+ std::vector<Value> quantiles;
73
+ (*qe)->Estimate(quantiles);
74
+ VALUE ary = rb_ary_new2(quantiles.size());
75
+ for (int i = 0; i < quantiles.size(); i++) {
76
+ rb_ary_push(ary, Converter<Value>::ToRuby(quantiles[i]));
77
+ }
78
+ return ary;
79
+ }
80
+
81
+ public:
82
+
83
+ static VALUE
84
+ Define(VALUE superModule, const char *name)
85
+ {
86
+ VALUE cQuantileEstimator = rb_define_class_under(superModule, name, rb_cObject);
87
+ rb_define_alloc_func(cQuantileEstimator, Alloc);
88
+ rb_define_private_method(cQuantileEstimator, "initialize",
89
+ RUBY_METHOD_FUNC(Initialize), 1);
90
+ rb_define_method(cQuantileEstimator, "add_elem",
91
+ RUBY_METHOD_FUNC(AddElem), 1);
92
+ rb_define_method(cQuantileEstimator, "estimate",
93
+ RUBY_METHOD_FUNC(Estimate), 0);
94
+ return cQuantileEstimator;
95
+ }
96
+ };
97
+
98
+
99
+ template< typename Value >
100
+ class RubyTopEstimator {
101
+ private:
102
+
103
+ static void
104
+ Free(SZaru::TopEstimator<Value> **ptr)
105
+ {
106
+ if (*ptr) {
107
+ delete *ptr;
108
+ }
109
+ }
110
+
111
+ static VALUE
112
+ Alloc(VALUE klass)
113
+ {
114
+ SZaru::TopEstimator<Value> **ptr = ALLOC(SZaru::TopEstimator<Value>*);
115
+ *ptr = NULL;
116
+ return Data_Wrap_Struct(klass, 0, Free, ptr);
117
+ }
118
+
119
+ static VALUE
120
+ Initialize(VALUE self, VALUE maxElems)
121
+ {
122
+ SZaru::TopEstimator<Value> **ptr;
123
+ Check_Type(maxElems, T_FIXNUM);
124
+ Data_Get_Struct(self, SZaru::TopEstimator<Value>*, ptr);
125
+ *ptr = SZaru::TopEstimator<Value>::Create(NUM2LONG(maxElems));
126
+ return Qnil;
127
+ }
128
+
129
+ static VALUE
130
+ AddElem(VALUE self, VALUE elem)
131
+ {
132
+ SZaru::TopEstimator<Value> **te;
133
+ Check_Type(elem, T_STRING);
134
+ Data_Get_Struct(self, SZaru::TopEstimator<Value>*, te);
135
+ (*te)->AddElem(std::string(RSTRING_PTR(elem), RSTRING_LEN(elem)));
136
+ return Qnil;
137
+ }
138
+
139
+ static VALUE
140
+ AddWeightedElem(VALUE self, VALUE elem, VALUE weight)
141
+ {
142
+ SZaru::TopEstimator<Value> **te;
143
+ Check_Type(elem, T_STRING);
144
+ Converter<Value>::CheckType(weight);
145
+ Data_Get_Struct(self, SZaru::TopEstimator<Value>*, te);
146
+ (*te)->AddWeightedElem(std::string(RSTRING_PTR(elem), RSTRING_LEN(elem)),
147
+ Converter<Value>::FromRuby(weight));
148
+ return Qnil;
149
+ }
150
+
151
+ static VALUE
152
+ Estimate(VALUE self)
153
+ {
154
+ SZaru::TopEstimator<Value> **te;
155
+ Data_Get_Struct(self, SZaru::TopEstimator<Value>*, te);
156
+ std::vector<typename SZaru::TopEstimator<Value>::Elem> topElems;
157
+ (*te)->Estimate(topElems);
158
+ VALUE ary = rb_ary_new2(topElems.size());
159
+ for (int i = 0; i < topElems.size(); i++) {
160
+ rb_ary_push(ary, rb_ary_new3(2,
161
+ rb_str_new(topElems[i].value.c_str(), topElems[i].value.size()),
162
+ Converter<Value>::ToRuby(topElems[i].weight)));
163
+ }
164
+ return ary;
165
+ }
166
+
167
+ public:
168
+ static VALUE
169
+ Define(VALUE superModule, const char *name)
170
+ {
171
+ VALUE cTopEstimator = rb_define_class_under(superModule, name, rb_cObject);
172
+ rb_define_alloc_func(cTopEstimator, Alloc);
173
+ rb_define_private_method(cTopEstimator, "initialize",
174
+ RUBY_METHOD_FUNC(Initialize), 1);
175
+ rb_define_method(cTopEstimator, "add_elem",
176
+ RUBY_METHOD_FUNC(AddElem), 1);
177
+ rb_define_method(cTopEstimator, "add_weighted_elem",
178
+ RUBY_METHOD_FUNC(AddWeightedElem), 2);
179
+ rb_define_method(cTopEstimator, "estimate",
180
+ RUBY_METHOD_FUNC(Estimate), 0);
181
+ return cTopEstimator;
182
+ }
183
+
184
+ };
185
+
186
+
187
+ class RubyUniqueEstimator {
188
+ private:
189
+ static void
190
+ Free(SZaru::UniqueEstimator **ptr)
191
+ {
192
+ if (*ptr) {
193
+ delete *ptr;
194
+ }
195
+ }
196
+
197
+ static VALUE
198
+ Alloc(VALUE klass)
199
+ {
200
+ SZaru::UniqueEstimator **ptr = ALLOC(SZaru::UniqueEstimator*);
201
+ *ptr = NULL;
202
+ return Data_Wrap_Struct(klass, 0, Free, ptr);
203
+ }
204
+
205
+ static VALUE
206
+ Initialize(VALUE self, VALUE maxElems)
207
+ {
208
+ SZaru::UniqueEstimator **ptr;
209
+ Data_Get_Struct(self, SZaru::UniqueEstimator*, ptr);
210
+ *ptr = SZaru::UniqueEstimator::Create(NUM2LONG(maxElems));
211
+ return Qnil;
212
+ }
213
+
214
+ static VALUE
215
+ AddElem(VALUE self, VALUE elem)
216
+ {
217
+ SZaru::UniqueEstimator **ue;
218
+ Check_Type(elem, T_STRING);
219
+ Data_Get_Struct(self, SZaru::UniqueEstimator*, ue);
220
+ (*ue)->AddElemInCIF(RSTRING_PTR(elem), RSTRING_LEN(elem));
221
+ return Qnil;
222
+ }
223
+
224
+ static VALUE
225
+ Estimate(VALUE self)
226
+ {
227
+ SZaru::UniqueEstimator **ue;
228
+ Data_Get_Struct(self, SZaru::UniqueEstimator*, ue);
229
+ uint64_t unique = (*ue)->Estimate();
230
+ return LONG2NUM(unique);
231
+ }
232
+
233
+ public:
234
+ static VALUE
235
+ Define(VALUE superModule, const char *name) {
236
+ VALUE cUniqueEstimator = rb_define_class_under(superModule, name, rb_cObject);
237
+ rb_define_alloc_func(cUniqueEstimator, Alloc);
238
+ rb_define_private_method(cUniqueEstimator, "initialize",
239
+ RUBY_METHOD_FUNC(Initialize), 1);
240
+ rb_define_method(cUniqueEstimator, "add_elem",
241
+ RUBY_METHOD_FUNC(AddElem), 1);
242
+ rb_define_method(cUniqueEstimator, "estimate",
243
+ RUBY_METHOD_FUNC(Estimate), 0);
244
+ return cUniqueEstimator;
245
+ }
246
+ };
247
+
248
+ }
249
+
250
+
251
+ void
252
+ Init_szaru(void){
253
+ VALUE mSZaru = rb_define_module("SZaru");
254
+ RubyUniqueEstimator::Define(mSZaru, "UniqueEstimator");
255
+
256
+ // TopEstimator
257
+ VALUE mTopEstimator = rb_define_module_under(mSZaru, "TopEstimator");
258
+ RubyTopEstimator<double>::Define(mTopEstimator, "Double");
259
+ RubyTopEstimator<int32_t>::Define(mTopEstimator, "Int32");
260
+ RubyTopEstimator<int64_t>::Define(mTopEstimator, "Int64");
261
+
262
+ // QuantileEstimator
263
+ VALUE mQuantileEstimator = rb_define_module_under(mSZaru, "QuantileEstimator");
264
+ RubyQuantileEstimator<double>::Define(mQuantileEstimator, "Double");
265
+ RubyQuantileEstimator<int32_t>::Define(mQuantileEstimator, "Int32");
266
+ RubyQuantileEstimator<int64_t>::Define(mQuantileEstimator, "Int64");
267
+
268
+ }
Binary file
Binary file
@@ -0,0 +1,33 @@
1
+ = Ruby Binding of SZaru
2
+
3
+ == Introduction
4
+ {SZaru}[http://llamerada.github.com/SZaru/] is a library to use {Sawzall}[http://code.google.com/p/szl/] aggregators in pure C++, Ruby and Python.
5
+ Currently, I have implemented the following 3 aggregators:
6
+ [Top] Statistical samplings that record the 'top N' data items based on CountSketch algorithm from "Finding Frequent Items in Data Streams", Moses Charikar, Kevin Chen and Martin Farach-Colton, 2002.
7
+ [Unique] Statistical estimators for the total number of unique data items.
8
+ [Quantile] Approximate N-tiles for data items from an ordered domain based on the following paper: Munro & Paterson, "Selection and Sorting with Limited Storage", Theoretical Computer Science, Vol 12, p 315-323, 1980.
9
+
10
+ == Example
11
+ require "szaru"
12
+ unq_est = SZaru::UniqueEstimator.new(10)
13
+ 1000.times do |i|
14
+ unq_est.add_elem(i.to_s + "test")
15
+ end
16
+ puts unq_est.estimate # => 913
17
+
18
+ == License
19
+
20
+ Copyright 2010 Yuji Kaneda
21
+
22
+ Licensed under the Apache License, Version 2.0 (the "License");
23
+ you may not use this file except in compliance with the License.
24
+ You may obtain a copy of the License at
25
+
26
+ http://www.apache.org/licenses/LICENSE-2.0
27
+
28
+ Unless required by applicable law or agreed to in writing, software
29
+ distributed under the License is distributed on an "AS IS" BASIS,
30
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
31
+ See the License for the specific language governing permissions and
32
+ limitations under the License.
33
+
@@ -0,0 +1,68 @@
1
# Prefer the installed gem and fall back to the extension built in ../ext.
# A failed require raises LoadError, which is not a StandardError, so a bare
# "rescue" never reaches the fallback; it has to be named explicitly.
begin
  require "rubygems"
  require "szaru"
rescue LoadError
  puts "load local library"
  require File.join(File.dirname(__FILE__), "../ext/szaru")
end
8
+
9
+ def test_unique
10
+ ue = SZaru::UniqueEstimator.new(10)
11
+ 1000.times do |i|
12
+ ue.add_elem(i.to_s + "test")
13
+ end
14
+ puts ue.estimate
15
+ end
16
+
17
+ def test_top
18
+ te = SZaru::TopEstimator::Int32.new(10)
19
+ ary = []
20
+ 100.times do |i|
21
+ i.times do |j|
22
+ ary << "test#{i}"
23
+ end
24
+ end
25
+ ary.sort_by{rand}.each do |e|
26
+ te.add_elem(e)
27
+ end
28
+ p te.estimate
29
+ end
30
+
31
+ def test_top2
32
+ te = SZaru::TopEstimator::Double.new(10)
33
+ ary = []
34
+ 100.times do |i|
35
+ ary << ["test#{i}", i]
36
+ end
37
+ ary.sort_by{rand}.each do |e, w|
38
+ te.add_weighted_elem(e, w.to_f)
39
+ end
40
+ p te.estimate
41
+ end
42
+
43
+ def test_quantile
44
+ te = SZaru::QuantileEstimator::Int64.new(10)
45
+ ary = []
46
+ 1000.times do |i|
47
+ # i.times do |j|
48
+ ary << i
49
+ end
50
+ rand_ary = ary.sort_by{rand}
51
+ # $stdout.sync = true
52
+ # puts "start"
53
+ rand_ary.each_with_index do |e, i|
54
+ te.add_elem(e)
55
+ end
56
+ p te.estimate
57
+ end
58
+
59
+
60
+
61
+ # test_topheap
62
+ # test_sketch
63
+ test_unique
64
+ test_top
65
+ test_top2
66
+ test_quantile
67
+
68
+
@@ -0,0 +1,189 @@
1
+ #--
2
+ # SZaru: Porting of excellent Sawzall aggregators.
3
+ #++
4
+ #:include:overview.rd
5
+
6
+
7
+ # SZaru namespace
8
+ module SZaru
9
+ # Statistical estimators for the total number of unique data items.
10
+ class UniqueEstimator
11
+ # Create a UniqueEstimator object.
12
+ #
13
+ # _max_elems_ is a tuning parameter.
14
+ # If _max_elems_ is bigger, the estimation becomes more accurate but consuming more memory.
15
+ def initialize(max_elems)
16
+ # (native code)
17
+ end
18
+
19
+ # Add a new element to this entry.
20
+ # _element_ must be String object.
21
+ def add_elem(elem)
22
+ # (native code)
23
+ end
24
+
25
+ # Return the estimated number of unique entries.
26
+ def estimate
27
+ # (native code)
28
+ end
29
+ end # UniqueEstimator
30
+
31
+ # Statistical samplings that record the 'top N' data items.
32
+ module TopEstimator
33
+ # TopEstimator of that weight is int32
34
+ class Int32
35
+ # Create a TopEstimator::Int32 object.
36
+ #
37
+ # _top_elems_ is a number of top elements to be estimate.
38
+ def initialize(top_elems)
39
+ # (native code)
40
+ end
41
+
42
+ # Add a new element to this entry.
43
+ # _element_ must be String object.
44
+ def add_elem(elem)
45
+ # (native code)
46
+ end
47
+
48
+ # Add a new weighted element to this entry.
49
+ # _element_ must be String object.
50
+ # _weight_ must be a Fixnum object.
51
+ def add_weighted_elem(elem, weight)
52
+ # (native code)
53
+ end
54
+
55
+ # Return a top elements with weight.
56
+ # Example: [["abc", 7], ["def", 3]]
57
+ def estimate
58
+ # (native code)
59
+ end
60
+ end # Int32
61
+
62
+ # TopEstimator of that weight is int64
63
+ class Int64
64
+ # Create a TopEstimator::Int64 object.
65
+ #
66
+ # _top_elems_ is a number of top elements to be estimate.
67
+ def initialize(top_elems)
68
+ # (native code)
69
+ end
70
+
71
+ # Add a new element to this entry.
72
+ # _element_ must be String object.
73
+ def add_elem(elem)
74
+ # (native code)
75
+ end
76
+
77
+ # Add a new weighted element to this entry.
78
+ # _element_ must be String object.
79
+ # _weight_ must be a Fixnum object.
80
+ def add_weighted_elem(elem, weight)
81
+ # (native code)
82
+ end
83
+
84
+ # Return a top elements with weight.
85
+ # Example: [["abc", 7], ["def", 3]]
86
+ def estimate
87
+ # (native code)
88
+ end
89
+ end # Int64
90
+
91
+ # TopEstimator of that weight is Double
92
+ class Double
93
+ # Create a TopEstimator::Double object.
94
+ #
95
+ # _top_elems_ is a number of top elements to be estimate.
96
+ def initialize(top_elems)
97
+ # (native code)
98
+ end
99
+
100
+ # Add a new element to this entry.
101
+ # _element_ must be String object.
102
+ def add_elem(elem)
103
+ # (native code)
104
+ end
105
+
106
+ # Add a new weighted element to this entry.
107
+ # _element_ must be String object.
108
+ # _weight_ must be a Float object.
109
+ def add_weighted_elem(elem, weight)
110
+ # (native code)
111
+ end
112
+
113
+ # Return a top elements with weight.
114
+ # Example: [["abc", 7.0], ["def", 3.0]]
115
+ def estimate
116
+ # (native code)
117
+ end
118
+ end # Double
119
+ end # TopEstimator
120
+
121
+ # Approximate N-tiles for data items from an ordered domain.
122
+ module QuantileEstimator
123
+ # QuantileEstimator whose elements are int32
124
+ class Int32
125
+ # Create a QuantileEstimator::Int32 object.
126
+ #
127
+ # _num_quantiles_ is a number of tiles to be estimate.
128
+ def initialize(num_quantiles)
129
+ # (native code)
130
+ end
131
+
132
+ # Add a new element to this entry.
133
+ # _element_ must be Fixnum object.
134
+ def add_elem(elem)
135
+ # (native code)
136
+ end
137
+
138
+ # Return a estimated N tiles.
139
+ # Example: [0, 3, 7, 9]
140
+ def estimate()
141
+ # (native code)
142
+ end
143
+ end # Int32
144
+
145
+ # QuantileEstimator whose elements are int64
146
+ class Int64
147
+ # Create a QuantileEstimator::Int64 object.
148
+ #
149
+ # _num_quantiles_ is a number of tiles to be estimate.
150
+ def initialize(num_quantiles)
151
+ # (native code)
152
+ end
153
+
154
+ # Add a new element to this entry.
155
+ # _element_ must be Fixnum object.
156
+ def add_elem(elem)
157
+ # (native code)
158
+ end
159
+
160
+ # Return a estimated N tiles.
161
+ # Example: [0, 3, 7, 9]
162
+ def estimate()
163
+ # (native code)
164
+ end
165
+ end # Int64
166
+
167
+ # QuantileEstimator whose elements are double
168
+ class Double
169
+ # Create a QuantileEstimator::Double object.
170
+ #
171
+ # _num_quantiles_ is a number of tiles to be estimate.
172
+ def initialize(num_quantiles)
173
+ # (native code)
174
+ end
175
+
176
+ # Add a new element to this entry.
177
+ # _element_ must be a Float object.
178
+ def add_elem(elem)
179
+ # (native code)
180
+ end
181
+
182
+ # Return a estimated N tiles.
183
+ # Example: [0.0, 3.2, 6.8, 9.5]
184
+ def estimate()
185
+ # (native code)
186
+ end
187
+ end # Double
188
+ end # QuantileEstimator
189
+ end # SZaru
@@ -0,0 +1,72 @@
1
+ require File.join(File.dirname(__FILE__), "../ext/szaru")
2
+
3
+ include SZaru
4
+ quantile_estimators = [QuantileEstimator::Int32, QuantileEstimator::Int64, QuantileEstimator::Double]
5
+
6
# Coerces +value+ to the numeric type of the estimator class at position
# +index+ in the list under test: Integer for index 0 or 1, Float for
# index 2.  Any other index yields nil.
def convert_value_from_index(index, value)
  return value.to_i if [0, 1].include?(index)
  return value.to_f if 2 == index
  nil
end
14
+
15
+ quantile_estimators.each_with_index do |quantile_estimator, te_index|
16
+ describe quantile_estimator do
17
+ it "return [0] if no addition exists" do
18
+ te = quantile_estimator.new(10)
19
+ te.estimate.should == [0]
20
+ end
21
+
22
+ it "return [min, max] if quantile_elems is 0" do
23
+ te = quantile_estimator.new(0)
24
+ te.add_elem(convert_value_from_index(te_index, 10))
25
+ te.add_elem(convert_value_from_index(te_index, 7))
26
+ te.estimate.should == [7, 10]
27
+ end
28
+
29
+ it "return exact quantile when the number of elements is small than quantile_elems" do
30
+ te = quantile_estimator.new(10)
31
+ n_elemnts = 5
32
+ n_elemnts.times do |i|
33
+ te.add_elem(convert_value_from_index(te_index, i))
34
+ end
35
+ quantile_elements = te.estimate
36
+ expexcted_values = [0, 0, 1, 1, 2, 2, 3, 3, 4, 4]
37
+ expexcted = expexcted_values.map{|value| convert_value_from_index(te_index, value)}
38
+ quantile_elements.should == expexcted
39
+ end
40
+
41
+ it "return same result when calling estimate twice" do
42
+ te = quantile_estimator.new(10)
43
+ n_elemnts = 5
44
+ n_elemnts.times do |i|
45
+ te.add_elem(convert_value_from_index(te_index, i))
46
+ end
47
+ # first call
48
+ quantile_elements = te.estimate
49
+ # second call
50
+ quantile_elements = te.estimate
51
+ expexcted_values = [0, 0, 1, 1, 2, 2, 3, 3, 4, 4]
52
+ expexcted = expexcted_values.map{|value| convert_value_from_index(te_index, value)}
53
+ quantile_elements.should == expexcted
54
+ end
55
+
56
+ it "return approximate number when the number of elements is greater than quantile_elems" do
57
+ qe = quantile_estimator.new(11)
58
+ elems = (0 .. 1000).to_a.sort_by{rand}
59
+ elems.each do |elem|
60
+ qe.add_elem(convert_value_from_index(te_index, elem))
61
+ end
62
+ quantiles = qe.estimate
63
+ quantiles.length.should == 11
64
+ quantiles.each_with_index do |tile, index|
65
+ exact = index * 100
66
+ diff = (tile - exact).abs
67
+ diff.should < 10
68
+ end
69
+ end
70
+
71
+ end
72
+ end
@@ -0,0 +1,110 @@
1
+ require File.join(File.dirname(__FILE__), "../ext/szaru")
2
+
3
+ include SZaru
4
+ top_estimators = [TopEstimator::Int32, TopEstimator::Int64, TopEstimator::Double]
5
+
6
# Converts +value+ to the weight type of the estimator class at position
# +index+ in the list under test: Integer for index 0 or 1, Float for
# index 2.  Returns nil for any other index.
def convert_value_from_index(index, value)
  if 0 == index || 1 == index
    value.to_i
  elsif 2 == index
    value.to_f
  end
end
14
+
15
+ top_estimators.each_with_index do |top_estimator, te_index|
16
+ describe top_estimator do
17
+ it "return [] if no addition exists" do
18
+ te = top_estimator.new(10)
19
+ te.estimate.should == []
20
+ end
21
+
22
+ it "return [] if top_elems is 0" do
23
+ te = top_estimator.new(0)
24
+ te.add_elem("test")
25
+ te.estimate.should == []
26
+ end
27
+
28
+ it "return exact number when the number of elements is small than top_elems" do
29
+ te = top_estimator.new(10)
30
+ n_elemnts = 5
31
+ n_elemnts.times do |i|
32
+ te.add_elem("test#{i}")
33
+ te.add_weighted_elem("test#{i}", convert_value_from_index(te_index, i))
34
+ end
35
+ top_elements = te.estimate
36
+ top_elements.length.should == n_elemnts
37
+ n_elemnts.times do |i|
38
+ # check element
39
+ top_elements[i][0].should == "test#{n_elemnts - i - 1}"
40
+ # check weight
41
+ exact_weight = convert_value_from_index(te_index, n_elemnts - i)
42
+ top_elements[i][1].should == exact_weight
43
+ end
44
+ end
45
+
46
+ it "return same result when calling estimate twice" do
47
+ te = top_estimator.new(10)
48
+ n_elemnts = 5
49
+ n_elemnts.times do |i|
50
+ te.add_elem("test#{i}")
51
+ te.add_weighted_elem("test#{i}", convert_value_from_index(te_index, i))
52
+ end
53
+ # first call
54
+ top_elements = te.estimate
55
+ # second call
56
+ top_elements = te.estimate
57
+ top_elements.length.should == n_elemnts
58
+ n_elemnts.times do |i|
59
+ # check element
60
+ top_elements[i][0].should == "test#{n_elemnts - i - 1}"
61
+ # check weight
62
+ exact_weight = convert_value_from_index(te_index, n_elemnts - i)
63
+ top_elements[i][1].should == exact_weight
64
+ end
65
+ end
66
+
67
+ it "return approximate number when the number of elements is greater than top_elems" do
68
+ te = top_estimator.new(10)
69
+ n_large_elemnts = 30
70
+ n_small_elemnts = 1000
71
+ # create input stream
72
+ elems = []
73
+ # large element x_i occurs x_i^2 times.
74
+ n_large_elemnts.times do |i|
75
+ (i * i).times do
76
+ elems << i
77
+ end
78
+ end
79
+ # small element y_i occurs less than 5 times.
80
+ n_small_elemnts.times do |i|
81
+ rand(5).times do
82
+ elems << i
83
+ end
84
+ end
85
+ # run the input stream 2 times in random order
86
+ 2.times do
87
+ elems.sort_by{ rand }.each do |j|
88
+ te.add_elem("test#{j}")
89
+ end
90
+ end
91
+ # check estimation
92
+ top_elements = te.estimate
93
+ top_elements.length.should == 10
94
+ 10.times do |i|
95
+ exact_index = n_large_elemnts - i - 1
96
+ # check element
97
+ top_elements[i][0] =~ /test(\d*)/
98
+ estimated_index = $1.to_i
99
+ diff = (exact_index - estimated_index).abs
100
+ diff.should < 3
101
+ # check weight
102
+ exact_weight = convert_value_from_index(te_index, 2 * exact_index * exact_index)
103
+ diff = top_elements[i][1] - exact_weight
104
+ error = (diff / exact_weight.to_f).abs
105
+ error.should < 0.1
106
+ end
107
+ end
108
+
109
+ end
110
+ end
@@ -0,0 +1,39 @@
1
+ require File.join(File.dirname(__FILE__), "../ext/szaru")
2
+
3
+ include SZaru
4
+ describe UniqueEstimator do
5
+ it "return 0 if no addition exists" do
6
+ ue = UniqueEstimator.new(10)
7
+ ue.estimate.should == 0
8
+ end
9
+
10
+ it "return 0 if max_elems is 0" do
11
+ ue = UniqueEstimator.new(0)
12
+ ue.add_elem("test")
13
+ ue.estimate.should == 0
14
+ end
15
+
16
+ it "return exact number when the number of elements is small than max_elems" do
17
+ ue = UniqueEstimator.new(10)
18
+ 5.times do |i|
19
+ ue.add_elem("test#{i}")
20
+ end
21
+ ue.estimate.should == 5
22
+ end
23
+
24
+ it "return approximate number when the number of elements is greater than max_elems" do
25
+ ue = UniqueEstimator.new(10)
26
+ n_unique = 997
27
+ elems = Array.new(n_unique){|i| ("test#{i}") }
28
+ # add elems to ue 2 times in random order
29
+ 2.times do
30
+ elems.sort_by{ rand }.each do |elm|
31
+ ue.add_elem(elm)
32
+ end
33
+ end
34
+ diff = ue.estimate - n_unique
35
+ error_rate = ( diff / n_unique.to_f).abs
36
+ error_rate.should < 0.1
37
+ end
38
+
39
+ end
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: szaru
3
+ version: !ruby/object:Gem::Version
4
+ hash: 25
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 1
10
+ version: 0.1.1
11
+ platform: ruby
12
+ authors:
13
+ - Yuji Kaneda
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-11-13 00:00:00 +09:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: |
23
+ Portings of excellent Sawzall aggregators.
24
+
25
+ email: llamerada@gmail.com
26
+ executables: []
27
+
28
+ extensions:
29
+ - ext/extconf.rb
30
+ extra_rdoc_files: []
31
+
32
+ files:
33
+ - ext/rb_szaru.cc
34
+ - ext/converter.h
35
+ - ext/szaru.so
36
+ - ext/Makefile
37
+ - ext/extconf.rb
38
+ - ext/rb_szaru.o
39
+ - test/quantile_spec.rb
40
+ - test/top_spec.rb
41
+ - test/unique_spec.rb
42
+ - sample/sample.rb
43
+ - overview.rd
44
+ - szaru-doc.rb
45
+ has_rdoc: true
46
+ homepage: http://llamerada.github.com/SZaru/
47
+ licenses: []
48
+
49
+ post_install_message:
50
+ rdoc_options:
51
+ - szaru-doc.rb
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ hash: 3
60
+ segments:
61
+ - 0
62
+ version: "0"
63
+ required_rubygems_version: !ruby/object:Gem::Requirement
64
+ none: false
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ hash: 3
69
+ segments:
70
+ - 0
71
+ version: "0"
72
+ requirements: []
73
+
74
+ rubyforge_project:
75
+ rubygems_version: 1.3.7
76
+ signing_key:
77
+ specification_version: 3
78
+ summary: Portings of excellent Sawzall aggregators
79
+ test_files:
80
+ - test/quantile_spec.rb
81
+ - test/top_spec.rb
82
+ - test/unique_spec.rb