szaru 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,157 @@
1
+
2
# Build configuration for the szaru Ruby extension (mkmf-style Makefile).
#
# Every variable below uses recursive assignment (=) on purpose: many of them
# refer to variables that are only defined further down (exec_prefix uses
# prefix, mandir uses datarootdir, ...), which simple assignment (:=) would
# expand to an empty string.

SHELL = /bin/sh

#### Start of system configuration section. ####

# Source and Ruby header locations.
srcdir = .
topdir = /usr/local/lib/ruby/1.8/i686-linux
hdrdir = $(topdir)
VPATH = $(srcdir):$(topdir):$(hdrdir)

# Installation directories.  DESTDIR allows staged installs.
exec_prefix = $(prefix)
prefix = $(DESTDIR)/usr/local
sharedstatedir = $(prefix)/com
mandir = $(datarootdir)/man
psdir = $(docdir)
oldincludedir = $(DESTDIR)/usr/include
localedir = $(datarootdir)/locale
bindir = $(exec_prefix)/bin
libexecdir = $(exec_prefix)/libexec
sitedir = $(libdir)/ruby/site_ruby
htmldir = $(docdir)
vendorarchdir = $(vendorlibdir)/$(sitearch)
includedir = $(prefix)/include
infodir = $(datarootdir)/info
vendorlibdir = $(vendordir)/$(ruby_version)
sysconfdir = $(prefix)/etc
libdir = $(exec_prefix)/lib
sbindir = $(exec_prefix)/sbin
rubylibdir = $(libdir)/ruby/$(ruby_version)
docdir = $(datarootdir)/doc/$(PACKAGE)
dvidir = $(docdir)
vendordir = $(libdir)/ruby/vendor_ruby
datarootdir = $(prefix)/share
pdfdir = $(docdir)
archdir = $(rubylibdir)/$(arch)
sitearchdir = $(sitelibdir)/$(sitearch)
datadir = $(datarootdir)
localstatedir = $(prefix)/var
sitelibdir = $(sitedir)/$(ruby_version)

# Toolchain.  The only source file is C++ (rb_szaru.cc), so the C++ driver is
# named explicitly instead of relying on make's built-in default for CXX.
CC = gcc
CXX = g++
LIBRUBY = $(LIBRUBY_A)
LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
LIBRUBYARG_SHARED = -Wl,-R -Wl,$(libdir) -L$(libdir)
LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static

RUBY_EXTCONF_H =
CFLAGS = -fPIC -g -O2 $(cflags)
# Built from topdir/hdrdir/srcdir so that overriding one of them on the
# command line also changes the header search path.
INCFLAGS = -I. -I$(topdir) -I$(hdrdir) -I$(srcdir)
DEFS = -D_FILE_OFFSET_BITS=64
CPPFLAGS = -D_FILE_OFFSET_BITS=64
CXXFLAGS = $(CFLAGS)
ldflags = -L. -rdynamic -Wl,-export-dynamic
dldflags =
archflag =
DLDFLAGS = $(ldflags) $(dldflags) $(archflag)
# Link with the C++ driver: $(CC) -shared does not add the C++ runtime
# (libstdc++) that the objects compiled from rb_szaru.cc need.
LDSHARED = $(CXX) -shared
AR = ar
EXEEXT =

# Ruby interpreter identity and helper commands.
RUBY_INSTALL_NAME = ruby
RUBY_SO_NAME = ruby
arch = i686-linux
sitearch = i686-linux
ruby_version = 1.8
ruby = /usr/local/bin/ruby
RUBY = $(ruby)
RM = rm -f
MAKEDIRS = mkdir -p
INSTALL = /usr/bin/install -c
INSTALL_PROG = $(INSTALL) -m 0755
INSTALL_DATA = $(INSTALL) -m 644
COPY = cp

#### End of system configuration section. ####

preload =

libpath = . $(libdir)
LIBPATH = -L. -L$(libdir) -Wl,-R$(libdir)
DEFFILE =

CLEANFILES = mkmf.log
DISTCLEANFILES =

# What gets built: rb_szaru.cc -> rb_szaru.o -> szaru.so (linked to libszaru).
extout =
extout_prefix =
target_prefix =
LOCAL_LIBS =
LIBS = -lszaru -lrt -ldl -lcrypt -lm -lc
SRCS = rb_szaru.cc
OBJS = rb_szaru.o
TARGET = szaru
DLLIB = $(TARGET).so
EXTSTATIC =
STATIC_LIB =

# Where the build products are installed.
BINDIR = $(bindir)
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
RUBYLIBDIR = $(sitelibdir)$(target_prefix)
RUBYARCHDIR = $(sitearchdir)$(target_prefix)

TARGET_SO = $(DLLIB)
CLEANLIBS = $(TARGET).so $(TARGET).il? $(TARGET).tds $(TARGET).map
CLEANOBJS = *.o *.a *.s[ol] *.pdb *.exp *.bak
105
+
106
# None of these targets name a real file; declaring them phony keeps a stray
# file called e.g. "clean" or "install" from silently disabling the target.
.PHONY: all static clean distclean realclean
.PHONY: install install-so install-rb install-rb-default
.PHONY: pre-install-rb pre-install-rb-default
.PHONY: site-install site-install-so site-install-rb

# Default target: build the extension shared object.
all: $(DLLIB)
static: $(STATIC_LIB)

# Cleanup is best effort, hence the '-' prefix on the $(RM) lines.
clean:
	@-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES)

distclean: clean
	@-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
	@-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)

realclean: distclean
install: install-so install-rb

install-so: $(RUBYARCHDIR)
install-so: $(RUBYARCHDIR)/$(DLLIB)
# The directory is created inside this recipe as well: it is otherwise only a
# sibling prerequisite of install-so, so with "make -j" the copy could start
# before the directory exists.
$(RUBYARCHDIR)/$(DLLIB): $(DLLIB)
	$(MAKEDIRS) $(RUBYARCHDIR)
	$(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
install-rb: pre-install-rb install-rb-default
install-rb-default: pre-install-rb-default
pre-install-rb: Makefile
pre-install-rb-default: Makefile
$(RUBYARCHDIR):
	$(MAKEDIRS) $@

site-install: site-install-so site-install-rb
site-install-so: install-so
site-install-rb: install-rb

# Suffix rules are kept (rather than pattern rules) for portability to
# non-GNU make implementations.
.SUFFIXES: .c .m .cc .cxx .cpp .C .o

.cc.o:
	$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<

.cxx.o:
	$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<

.cpp.o:
	$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<

.C.o:
	$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<

.c.o:
	$(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) -c $<

# Link the extension.  The Makefile itself is a prerequisite so that changed
# flags trigger a relink.
$(DLLIB): $(OBJS) Makefile
	@-$(RM) $@
	$(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)



# ruby.h and defines.h are located through VPATH; converter.h is the local
# header included by rb_szaru.cc, so editing it must rebuild the object.
$(OBJS): ruby.h defines.h converter.h
@@ -0,0 +1,69 @@
1
+ // Copyright 2010 Yuji Kaneda
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ // ------------------------------------------------------------------------
15
+
16
+ // specialization of Conveter only for int32_t, int64_t, double
17
+ template< typename Value >
18
+ class Converter {
19
+ public:
20
+ static inline VALUE ToRuby(Value value);
21
+ static inline Value FromRuby(VALUE value);
22
+ static inline void CheckType(VALUE value);
23
+ };
24
+
25
+ template<> inline
26
+ VALUE Converter<int32_t>::ToRuby(int32_t value){
27
+ return INT2FIX(value);
28
+ }
29
+
30
+ template<> inline
31
+ VALUE Converter<int64_t>::ToRuby(int64_t value){
32
+ return LONG2FIX(value);
33
+ }
34
+
35
+ template<> inline
36
+ VALUE Converter<double>::ToRuby(double value){
37
+ return rb_float_new(value);
38
+ }
39
+
40
+ template<> inline
41
+ int32_t Converter<int32_t>::FromRuby(VALUE value){
42
+ return FIX2INT(value);
43
+ }
44
+
45
+ template<> inline
46
+ int64_t Converter<int64_t>::FromRuby(VALUE value){
47
+ return FIX2LONG(value);
48
+ }
49
+
50
+ template<> inline
51
+ double Converter<double>::FromRuby(VALUE value){
52
+ return RFLOAT(value)->value;
53
+ }
54
+
55
+ template<> inline
56
+ void Converter<int32_t>::CheckType(VALUE value){
57
+ Check_Type(value, T_FIXNUM);
58
+ }
59
+
60
+ template<> inline
61
+ void Converter<int64_t>::CheckType(VALUE value){
62
+ Check_Type(value, T_FIXNUM);
63
+ }
64
+
65
+ template<> inline
66
+ void Converter<double>::CheckType(VALUE value){
67
+ Check_Type(value, T_FLOAT);
68
+ }
69
+
@@ -0,0 +1,5 @@
1
# Build configuration for the szaru extension: checks for libszaru and
# writes the Makefile.
require "mkmf"

# Lets the user point at a non-standard SZaru installation
# (--with-szaru-dir, --with-szaru-include, --with-szaru-lib).
dir_config('szaru')

# rb_szaru.cc is C++ but the generated Makefile links with the C driver, so
# ask for the C++ runtime explicitly. The result is ignored on purpose:
# toolchains without a separate libstdc++ simply skip it.
have_library('stdc++')

# Fail loudly instead of silently producing no Makefile, which would only
# surface later as a confusing "no Makefile" error from make.
unless have_library('szaru')
  abort "szaru: libszaru was not found. Install SZaru or pass --with-szaru-dir=<prefix>."
end

create_makefile('szaru')
@@ -0,0 +1,268 @@
1
+ // Copyright 2010 Yuji Kaneda
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ // ------------------------------------------------------------------------
15
+
16
+ // Ruby Binding of SZaru
17
+
18
+ #include <ruby.h>
19
+ #include <szaru.h>
20
+ // local include file
21
+ #include "converter.h"
22
+
23
+ extern "C" {
24
+ void Init_szaru(void);
25
+ }
26
+
27
+ namespace {
28
+
29
+ template<typename Value>
30
+ class RubyQuantileEstimator {
31
+ private:
32
+ static void
33
+ Free(SZaru::QuantileEstimator<Value> **ptr)
34
+ {
35
+ if (*ptr) {
36
+ delete *ptr;
37
+ }
38
+ }
39
+
40
+ static VALUE
41
+ Alloc(VALUE klass)
42
+ {
43
+ SZaru::QuantileEstimator<Value> **ptr = ALLOC(SZaru::QuantileEstimator<Value>*);
44
+ *ptr = NULL;
45
+ return Data_Wrap_Struct(klass, 0, Free, ptr);
46
+ }
47
+
48
+ static VALUE
49
+ Initialize(VALUE self, VALUE maxElems)
50
+ {
51
+ SZaru::QuantileEstimator<Value> **ptr;
52
+ Data_Get_Struct(self, SZaru::QuantileEstimator<Value>*, ptr);
53
+ *ptr = SZaru::QuantileEstimator<Value>::Create(NUM2LONG(maxElems));
54
+ return Qnil;
55
+ }
56
+
57
+ static VALUE
58
+ AddElem(VALUE self, VALUE elem)
59
+ {
60
+ SZaru::QuantileEstimator<Value> **qe;
61
+ Converter<Value>::CheckType(elem);
62
+ Data_Get_Struct(self, SZaru::QuantileEstimator<Value>*, qe);
63
+ (*qe)->AddElem(Converter<Value>::FromRuby(elem));
64
+ return Qnil;
65
+ }
66
+
67
+ static VALUE
68
+ Estimate(VALUE self)
69
+ {
70
+ SZaru::QuantileEstimator<Value> **qe;
71
+ Data_Get_Struct(self, SZaru::QuantileEstimator<Value>*, qe);
72
+ std::vector<Value> quantiles;
73
+ (*qe)->Estimate(quantiles);
74
+ VALUE ary = rb_ary_new2(quantiles.size());
75
+ for (int i = 0; i < quantiles.size(); i++) {
76
+ rb_ary_push(ary, Converter<Value>::ToRuby(quantiles[i]));
77
+ }
78
+ return ary;
79
+ }
80
+
81
+ public:
82
+
83
+ static VALUE
84
+ Define(VALUE superModule, const char *name)
85
+ {
86
+ VALUE cQuantileEstimator = rb_define_class_under(superModule, name, rb_cObject);
87
+ rb_define_alloc_func(cQuantileEstimator, Alloc);
88
+ rb_define_private_method(cQuantileEstimator, "initialize",
89
+ RUBY_METHOD_FUNC(Initialize), 1);
90
+ rb_define_method(cQuantileEstimator, "add_elem",
91
+ RUBY_METHOD_FUNC(AddElem), 1);
92
+ rb_define_method(cQuantileEstimator, "estimate",
93
+ RUBY_METHOD_FUNC(Estimate), 0);
94
+ return cQuantileEstimator;
95
+ }
96
+ };
97
+
98
+
99
+ template< typename Value >
100
+ class RubyTopEstimator {
101
+ private:
102
+
103
+ static void
104
+ Free(SZaru::TopEstimator<Value> **ptr)
105
+ {
106
+ if (*ptr) {
107
+ delete *ptr;
108
+ }
109
+ }
110
+
111
+ static VALUE
112
+ Alloc(VALUE klass)
113
+ {
114
+ SZaru::TopEstimator<Value> **ptr = ALLOC(SZaru::TopEstimator<Value>*);
115
+ *ptr = NULL;
116
+ return Data_Wrap_Struct(klass, 0, Free, ptr);
117
+ }
118
+
119
+ static VALUE
120
+ Initialize(VALUE self, VALUE maxElems)
121
+ {
122
+ SZaru::TopEstimator<Value> **ptr;
123
+ Check_Type(maxElems, T_FIXNUM);
124
+ Data_Get_Struct(self, SZaru::TopEstimator<Value>*, ptr);
125
+ *ptr = SZaru::TopEstimator<Value>::Create(NUM2LONG(maxElems));
126
+ return Qnil;
127
+ }
128
+
129
+ static VALUE
130
+ AddElem(VALUE self, VALUE elem)
131
+ {
132
+ SZaru::TopEstimator<Value> **te;
133
+ Check_Type(elem, T_STRING);
134
+ Data_Get_Struct(self, SZaru::TopEstimator<Value>*, te);
135
+ (*te)->AddElem(std::string(RSTRING_PTR(elem), RSTRING_LEN(elem)));
136
+ return Qnil;
137
+ }
138
+
139
+ static VALUE
140
+ AddWeightedElem(VALUE self, VALUE elem, VALUE weight)
141
+ {
142
+ SZaru::TopEstimator<Value> **te;
143
+ Check_Type(elem, T_STRING);
144
+ Converter<Value>::CheckType(weight);
145
+ Data_Get_Struct(self, SZaru::TopEstimator<Value>*, te);
146
+ (*te)->AddWeightedElem(std::string(RSTRING_PTR(elem), RSTRING_LEN(elem)),
147
+ Converter<Value>::FromRuby(weight));
148
+ return Qnil;
149
+ }
150
+
151
+ static VALUE
152
+ Estimate(VALUE self)
153
+ {
154
+ SZaru::TopEstimator<Value> **te;
155
+ Data_Get_Struct(self, SZaru::TopEstimator<Value>*, te);
156
+ std::vector<typename SZaru::TopEstimator<Value>::Elem> topElems;
157
+ (*te)->Estimate(topElems);
158
+ VALUE ary = rb_ary_new2(topElems.size());
159
+ for (int i = 0; i < topElems.size(); i++) {
160
+ rb_ary_push(ary, rb_ary_new3(2,
161
+ rb_str_new(topElems[i].value.c_str(), topElems[i].value.size()),
162
+ Converter<Value>::ToRuby(topElems[i].weight)));
163
+ }
164
+ return ary;
165
+ }
166
+
167
+ public:
168
+ static VALUE
169
+ Define(VALUE superModule, const char *name)
170
+ {
171
+ VALUE cTopEstimator = rb_define_class_under(superModule, name, rb_cObject);
172
+ rb_define_alloc_func(cTopEstimator, Alloc);
173
+ rb_define_private_method(cTopEstimator, "initialize",
174
+ RUBY_METHOD_FUNC(Initialize), 1);
175
+ rb_define_method(cTopEstimator, "add_elem",
176
+ RUBY_METHOD_FUNC(AddElem), 1);
177
+ rb_define_method(cTopEstimator, "add_weighted_elem",
178
+ RUBY_METHOD_FUNC(AddWeightedElem), 2);
179
+ rb_define_method(cTopEstimator, "estimate",
180
+ RUBY_METHOD_FUNC(Estimate), 0);
181
+ return cTopEstimator;
182
+ }
183
+
184
+ };
185
+
186
+
187
+ class RubyUniqueEstimator {
188
+ private:
189
+ static void
190
+ Free(SZaru::UniqueEstimator **ptr)
191
+ {
192
+ if (*ptr) {
193
+ delete *ptr;
194
+ }
195
+ }
196
+
197
+ static VALUE
198
+ Alloc(VALUE klass)
199
+ {
200
+ SZaru::UniqueEstimator **ptr = ALLOC(SZaru::UniqueEstimator*);
201
+ *ptr = NULL;
202
+ return Data_Wrap_Struct(klass, 0, Free, ptr);
203
+ }
204
+
205
+ static VALUE
206
+ Initialize(VALUE self, VALUE maxElems)
207
+ {
208
+ SZaru::UniqueEstimator **ptr;
209
+ Data_Get_Struct(self, SZaru::UniqueEstimator*, ptr);
210
+ *ptr = SZaru::UniqueEstimator::Create(NUM2LONG(maxElems));
211
+ return Qnil;
212
+ }
213
+
214
+ static VALUE
215
+ AddElem(VALUE self, VALUE elem)
216
+ {
217
+ SZaru::UniqueEstimator **ue;
218
+ Check_Type(elem, T_STRING);
219
+ Data_Get_Struct(self, SZaru::UniqueEstimator*, ue);
220
+ (*ue)->AddElemInCIF(RSTRING_PTR(elem), RSTRING_LEN(elem));
221
+ return Qnil;
222
+ }
223
+
224
+ static VALUE
225
+ Estimate(VALUE self)
226
+ {
227
+ SZaru::UniqueEstimator **ue;
228
+ Data_Get_Struct(self, SZaru::UniqueEstimator*, ue);
229
+ uint64_t unique = (*ue)->Estimate();
230
+ return LONG2NUM(unique);
231
+ }
232
+
233
+ public:
234
+ static VALUE
235
+ Define(VALUE superModule, const char *name) {
236
+ VALUE cUniqueEstimator = rb_define_class_under(superModule, name, rb_cObject);
237
+ rb_define_alloc_func(cUniqueEstimator, Alloc);
238
+ rb_define_private_method(cUniqueEstimator, "initialize",
239
+ RUBY_METHOD_FUNC(Initialize), 1);
240
+ rb_define_method(cUniqueEstimator, "add_elem",
241
+ RUBY_METHOD_FUNC(AddElem), 1);
242
+ rb_define_method(cUniqueEstimator, "estimate",
243
+ RUBY_METHOD_FUNC(Estimate), 0);
244
+ return cUniqueEstimator;
245
+ }
246
+ };
247
+
248
+ }
249
+
250
+
251
+ void
252
+ Init_szaru(void){
253
+ VALUE mSZaru = rb_define_module("SZaru");
254
+ RubyUniqueEstimator::Define(mSZaru, "UniqueEstimator");
255
+
256
+ // TopEstimator
257
+ VALUE mTopEstimator = rb_define_module_under(mSZaru, "TopEstimator");
258
+ RubyTopEstimator<double>::Define(mTopEstimator, "Double");
259
+ RubyTopEstimator<int32_t>::Define(mTopEstimator, "Int32");
260
+ RubyTopEstimator<int64_t>::Define(mTopEstimator, "Int64");
261
+
262
+ // QuantileEstimator
263
+ VALUE mQuantileEstimator = rb_define_module_under(mSZaru, "QuantileEstimator");
264
+ RubyQuantileEstimator<double>::Define(mQuantileEstimator, "Double");
265
+ RubyQuantileEstimator<int32_t>::Define(mQuantileEstimator, "Int32");
266
+ RubyQuantileEstimator<int64_t>::Define(mQuantileEstimator, "Int64");
267
+
268
+ }
Binary file
Binary file
@@ -0,0 +1,33 @@
1
+ = Ruby Binding of SZaru
2
+
3
+ == Introduction
4
+ {SZaru}[http://llamerada.github.com/SZaru/] is a library to use {Sawzall}[http://code.google.com/p/szl/] aggregators in pure C++, Ruby and Python.
5
+ Currently, I have implemented the following 3 aggregators:
6
+ [Top] Statistical samplings that record the 'top N' data items based on CountSketch algorithm from "Finding Frequent Items in Data Streams", Moses Charikar, Kevin Chen and Martin Farach-Colton, 2002.
7
+ [Unique] Statistical estimators for the total number of unique data items.
8
+ [Quantile] Approximate N-tiles for data items from an ordered domain based on the following paper: Munro & Paterson, "Selection and Sorting with Limited Storage", Theoretical Computer Science, Vol 12, p 315-323, 1980.
9
+
10
+ == Example
11
+ require "szaru"
12
+ unq_est = SZaru::UniqueEstimator.new(10)
13
+ 1000.times do |i|
14
+ unq_est.add_elem(i.to_s + "test")
15
+ end
16
+ puts unq_est.estimate # => 913
17
+
18
+ == License
19
+
20
+ Copyright 2010 Yuji Kaneda
21
+
22
+ Licensed under the Apache License, Version 2.0 (the "License");
23
+ you may not use this file except in compliance with the License.
24
+ You may obtain a copy of the License at
25
+
26
+ http://www.apache.org/licenses/LICENSE-2.0
27
+
28
+ Unless required by applicable law or agreed to in writing, software
29
+ distributed under the License is distributed on an "AS IS" BASIS,
30
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
31
+ See the License for the specific language governing permissions and
32
+ limitations under the License.
33
+
@@ -0,0 +1,68 @@
1
# Prefer the installed gem; fall back to the extension built in ../ext.
begin
  require "rubygems"
  require "szaru"
rescue LoadError
  # A bare `rescue` only catches StandardError, and LoadError is not a
  # StandardError, so the fallback has to name LoadError explicitly.
  puts "load local library"
  require File.join(File.dirname(__FILE__), "../ext/szaru")
end
8
+
9
# Adds 1000 distinct strings to a UniqueEstimator created with 10 and prints
# the estimate.
def test_unique
  estimator = SZaru::UniqueEstimator.new(10)
  (0...1000).each { |n| estimator.add_elem(n.to_s + "test") }
  puts estimator.estimate
end
16
+
17
# Adds "test<i>" i times for i in 0...100, in random order, to an Int32
# TopEstimator created with 10 and prints the estimate.
def test_top
  estimator = SZaru::TopEstimator::Int32.new(10)
  words = []
  (0...100).each do |i|
    i.times { words << "test#{i}" }
  end
  words.sort_by { rand }.each { |word| estimator.add_elem(word) }
  p estimator.estimate
end
30
+
31
# Adds "test<i>" once with weight i (as a Float) for i in 0...100, in random
# order, to a Double TopEstimator created with 10 and prints the estimate.
def test_top2
  estimator = SZaru::TopEstimator::Double.new(10)
  weighted = (0...100).map { |i| ["test#{i}", i] }
  weighted.sort_by { rand }.each do |name, weight|
    estimator.add_weighted_elem(name, weight.to_f)
  end
  p estimator.estimate
end
42
+
43
# Adds the integers 0...1000 in random order to an Int64 QuantileEstimator
# created with 10 and prints the estimate.
def test_quantile
  estimator = SZaru::QuantileEstimator::Int64.new(10)
  shuffled = (0...1000).to_a.sort_by { rand }
  shuffled.each { |value| estimator.add_elem(value) }
  p estimator.estimate
end
58
+
59
+
60
+
61
+ # test_topheap
62
+ # test_sketch
63
+ test_unique
64
+ test_top
65
+ test_top2
66
+ test_quantile
67
+
68
+
@@ -0,0 +1,189 @@
1
+ #--
2
+ # SZaru: Porting of excellent Sawzall aggregators.
3
+ #++
4
+ #:include:overview.rd
5
+
6
+
7
+ # SZaru namespace
8
+ module SZaru
9
+ # Statistical estimators for the total number of unique data items.
10
+ class UniqueEstimator
11
+ # Create a UniqueEstimator object.
12
+ #
13
+ # _max_elems_ is a tuning parameter.
14
+ # A bigger _max_elems_ makes the estimation more accurate but consumes more memory.
15
+ def initialize(max_elems)
16
+ # (native code)
17
+ end
18
+
19
+ # Add a new element to this entry.
20
+ # _element_ must be String object.
21
+ def add_elem(elem)
22
+ # (native code)
23
+ end
24
+
25
+ # Return the estimated number of unique entries.
26
+ def estimate
27
+ # (native code)
28
+ end
29
+ end # UniqueEstimator
30
+
31
+ # Statistical samplings that record the 'top N' data items.
32
+ module TopEstimator
33
+ # TopEstimator of that weight is int32
34
+ class Int32
35
+ # Create a TopEstimator::Int32 object.
36
+ #
37
+ # _top_elems_ is a number of top elements to be estimate.
38
+ def initialize(top_elems)
39
+ # (native code)
40
+ end
41
+
42
+ # Add a new element to this entry.
43
+ # _element_ must be String object.
44
+ def add_elem(elem)
45
+ # (native code)
46
+ end
47
+
48
+ # Add a new weighted element to this entry.
49
+ # _element_ must be String object.
50
+ # _weight_ must be a Fixnum object.
51
+ def add_weighted_elem(elem, weight)
52
+ # (native code)
53
+ end
54
+
55
+ # Return a top elements with weight.
56
+ # Example: [["abc", 7], ["def", 3]]
57
+ def estimate
58
+ # (native code)
59
+ end
60
+ end # Int32
61
+
62
+ # TopEstimator of that weight is int64
63
+ class Int64
64
+ # Create a TopEstimator::Int64 object.
65
+ #
66
+ # _top_elems_ is a number of top elements to be estimate.
67
+ def initialize(top_elems)
68
+ # (native code)
69
+ end
70
+
71
+ # Add a new element to this entry.
72
+ # _element_ must be String object.
73
+ def add_elem(elem)
74
+ # (native code)
75
+ end
76
+
77
+ # Add a new weighted element to this entry.
78
+ # _element_ must be String object.
79
+ # _weight_ must be a Fixnum object.
80
+ def add_weighted_elem(elem, weight)
81
+ # (native code)
82
+ end
83
+
84
+ # Return a top elements with weight.
85
+ # Example: [["abc", 7], ["def", 3]]
86
+ def estimate
87
+ # (native code)
88
+ end
89
+ end # Int64
90
+
91
+ # TopEstimator of that weight is Double
92
+ class Double
93
+ # Create a TopEstimator::Double object.
94
+ #
95
+ # _top_elems_ is a number of top elements to be estimate.
96
+ def initialize(top_elems)
97
+ # (native code)
98
+ end
99
+
100
+ # Add a new element to this entry.
101
+ # _element_ must be String object.
102
+ def add_elem(elem)
103
+ # (native code)
104
+ end
105
+
106
+ # Add a new weighted element to this entry.
107
+ # _element_ must be String object.
108
+ # _weight_ must be a Float object.
109
+ def add_weighted_elem(elem, weight)
110
+ # (native code)
111
+ end
112
+
113
+ # Return a top elements with weight.
114
+ # Example: [["abc", 7.0], ["def", 3.0]]
115
+ def estimate
116
+ # (native code)
117
+ end
118
+ end # Double
119
+ end # TopEstimator
120
+
121
+ # Approximate N-tiles for data items from an ordered domain.
122
+ module QuantileEstimator
123
+ # QuantileEstimator whose elements are int32
124
+ class Int32
125
+ # Create a QuantileEstimator::Int32 object.
126
+ #
127
+ # _num_quantiles_ is a number of tiles to be estimate.
128
+ def initialize(num_quantiles)
129
+ # (native code)
130
+ end
131
+
132
+ # Add a new element to this entry.
133
+ # _element_ must be Fixnum object.
134
+ def add_elem(elem)
135
+ # (native code)
136
+ end
137
+
138
+ # Return a estimated N tiles.
139
+ # Example: [0, 3, 7, 9]
140
+ def estimate()
141
+ # (native code)
142
+ end
143
+ end # Int32
144
+
145
+ # QuantileEstimator whose elements are int64
146
+ class Int64
147
+ # Create a QuantileEstimator::Int64 object.
148
+ #
149
+ # _num_quantiles_ is a number of tiles to be estimate.
150
+ def initialize(num_quantiles)
151
+ # (native code)
152
+ end
153
+
154
+ # Add a new element to this entry.
155
+ # _element_ must be Fixnum object.
156
+ def add_elem(elem)
157
+ # (native code)
158
+ end
159
+
160
+ # Return a estimated N tiles.
161
+ # Example: [0, 3, 7, 9]
162
+ def estimate()
163
+ # (native code)
164
+ end
165
+ end # Int64
166
+
167
+ # QuantileEstimator whose elements are double
168
+ class Double
169
+ # Create a QuantileEstimator::Double object.
170
+ #
171
+ # _num_quantiles_ is a number of tiles to be estimate.
172
+ def initialize(num_quantiles)
173
+ # (native code)
174
+ end
175
+
176
+ # Add a new element to this entry.
177
+ # _element_ must be a Float object.
178
+ def add_elem(elem)
179
+ # (native code)
180
+ end
181
+
182
+ # Return a estimated N tiles.
183
+ # Example: [0.0, 3.2, 6.8, 9.5]
184
+ def estimate()
185
+ # (native code)
186
+ end
187
+ end # Double
188
+ end # QuantileEstimator
189
+ end # SZaru
@@ -0,0 +1,72 @@
1
+ require File.join(File.dirname(__FILE__), "../ext/szaru")
2
+
3
+ include SZaru
4
+ quantile_estimators = [QuantileEstimator::Int32, QuantileEstimator::Int64, QuantileEstimator::Double]
5
+
6
# Converts +value+ to the element type of the estimator at position +index+
# in the estimator list: Integer for 0 and 1, Float for 2, nil otherwise.
def convert_value_from_index(index, value)
  return value.to_i if index == 0 || index == 1
  return value.to_f if index == 2
  nil
end
14
+
15
# Runs the same examples against every QuantileEstimator class; te_index
# selects the matching element type in convert_value_from_index.
quantile_estimators.each_with_index do |quantile_estimator, te_index|
  describe quantile_estimator do
    it "return [0] if no addition exists" do
      quantile_estimator.new(10).estimate.should == [0]
    end

    it "return [min, max] if quantile_elems is 0" do
      estimator = quantile_estimator.new(0)
      [10, 7].each do |value|
        estimator.add_elem(convert_value_from_index(te_index, value))
      end
      estimator.estimate.should == [7, 10]
    end

    it "return exact quantile when the number of elements is small than quantile_elems" do
      estimator = quantile_estimator.new(10)
      (0...5).each do |i|
        estimator.add_elem(convert_value_from_index(te_index, i))
      end
      quantiles = estimator.estimate
      expected = [0, 0, 1, 1, 2, 2, 3, 3, 4, 4].map do |value|
        convert_value_from_index(te_index, value)
      end
      quantiles.should == expected
    end

    it "return same result when calling estimate twice" do
      estimator = quantile_estimator.new(10)
      (0...5).each do |i|
        estimator.add_elem(convert_value_from_index(te_index, i))
      end
      # first call; its result is intentionally discarded
      estimator.estimate
      # second call
      quantiles = estimator.estimate
      expected = [0, 0, 1, 1, 2, 2, 3, 3, 4, 4].map do |value|
        convert_value_from_index(te_index, value)
      end
      quantiles.should == expected
    end

    it "return approximate number when the number of elements is greater than quantile_elems" do
      estimator = quantile_estimator.new(11)
      (0 .. 1000).to_a.sort_by { rand }.each do |elem|
        estimator.add_elem(convert_value_from_index(te_index, elem))
      end
      quantiles = estimator.estimate
      quantiles.length.should == 11
      quantiles.each_with_index do |tile, index|
        # the i-th of 11 tiles over 0..1000 is exactly i * 100
        (tile - index * 100).abs.should < 10
      end
    end

  end
end
@@ -0,0 +1,110 @@
1
+ require File.join(File.dirname(__FILE__), "../ext/szaru")
2
+
3
+ include SZaru
4
+ top_estimators = [TopEstimator::Int32, TopEstimator::Int64, TopEstimator::Double]
5
+
6
# Converts +value+ to the weight type of the estimator at position +index+
# in the estimator list: Integer for 0 and 1, Float for 2, nil otherwise.
def convert_value_from_index(index, value)
  return value.to_i if index == 0 || index == 1
  return value.to_f if index == 2
  nil
end
14
+
15
# Runs the same examples against every TopEstimator class; te_index selects
# the matching weight type in convert_value_from_index.
top_estimators.each_with_index do |top_estimator, te_index|
  describe top_estimator do
    it "return [] if no addition exists" do
      top_estimator.new(10).estimate.should == []
    end

    it "return [] if top_elems is 0" do
      estimator = top_estimator.new(0)
      estimator.add_elem("test")
      estimator.estimate.should == []
    end

    it "return exact number when the number of elements is small than top_elems" do
      estimator = top_estimator.new(10)
      count = 5
      # "test<i>" ends up with total weight i + 1 (one plain add plus weight i)
      (0...count).each do |i|
        estimator.add_elem("test#{i}")
        estimator.add_weighted_elem("test#{i}", convert_value_from_index(te_index, i))
      end
      ranking = estimator.estimate
      ranking.length.should == count
      count.times do |rank|
        # check element
        ranking[rank][0].should == "test#{count - rank - 1}"
        # check weight
        ranking[rank][1].should == convert_value_from_index(te_index, count - rank)
      end
    end

    it "return same result when calling estimate twice" do
      estimator = top_estimator.new(10)
      count = 5
      (0...count).each do |i|
        estimator.add_elem("test#{i}")
        estimator.add_weighted_elem("test#{i}", convert_value_from_index(te_index, i))
      end
      # first call; its result is intentionally discarded
      estimator.estimate
      # second call
      ranking = estimator.estimate
      ranking.length.should == count
      count.times do |rank|
        # check element
        ranking[rank][0].should == "test#{count - rank - 1}"
        # check weight
        ranking[rank][1].should == convert_value_from_index(te_index, count - rank)
      end
    end

    it "return approximate number when the number of elements is greater than top_elems" do
      estimator = top_estimator.new(10)
      large_count = 30
      small_count = 1000
      # create input stream
      stream = []
      # large element x_i occurs x_i^2 times.
      (0...large_count).each do |i|
        (i * i).times { stream << i }
      end
      # small element y_i occurs less than 5 times.
      (0...small_count).each do |i|
        rand(5).times { stream << i }
      end
      # run input stream 2 times in random order
      2.times do
        stream.sort_by { rand }.each { |j| estimator.add_elem("test#{j}") }
      end
      # check estimation
      ranking = estimator.estimate
      ranking.length.should == 10
      10.times do |rank|
        exact_index = large_count - rank - 1
        # check element
        ranking[rank][0] =~ /test(\d*)/
        estimated_index = $1.to_i
        (exact_index - estimated_index).abs.should < 3
        # check weight
        exact_weight = convert_value_from_index(te_index, 2 * exact_index * exact_index)
        relative_error = ((ranking[rank][1] - exact_weight) / exact_weight.to_f).abs
        relative_error.should < 0.1
      end
    end

  end
end
@@ -0,0 +1,39 @@
1
+ require File.join(File.dirname(__FILE__), "../ext/szaru")
2
+
3
+ include SZaru
4
# Examples for SZaru::UniqueEstimator.
describe UniqueEstimator do
  it "return 0 if no addition exists" do
    UniqueEstimator.new(10).estimate.should == 0
  end

  it "return 0 if max_elems is 0" do
    estimator = UniqueEstimator.new(0)
    estimator.add_elem("test")
    estimator.estimate.should == 0
  end

  it "return exact number when the number of elements is small than max_elems" do
    estimator = UniqueEstimator.new(10)
    (0...5).each { |i| estimator.add_elem("test#{i}") }
    estimator.estimate.should == 5
  end

  it "return approximate number when the number of elements is greater than max_elems" do
    estimator = UniqueEstimator.new(10)
    n_unique = 997
    names = (0...n_unique).map { |i| "test#{i}" }
    # add every name twice, each pass in a fresh random order
    2.times do
      names.sort_by { rand }.each { |name| estimator.add_elem(name) }
    end
    error_rate = ((estimator.estimate - n_unique) / n_unique.to_f).abs
    error_rate.should < 0.1
  end

end
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: szaru
3
+ version: !ruby/object:Gem::Version
4
+ hash: 25
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 1
10
+ version: 0.1.1
11
+ platform: ruby
12
+ authors:
13
+ - Yuji Kaneda
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-11-13 00:00:00 +09:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: |
23
+ Portings of excellent Sawzall aggregators.
24
+
25
+ email: llamerada@gmail.com
26
+ executables: []
27
+
28
+ extensions:
29
+ - ext/extconf.rb
30
+ extra_rdoc_files: []
31
+
32
+ files:
33
+ - ext/rb_szaru.cc
34
+ - ext/converter.h
35
+ - ext/szaru.so
36
+ - ext/Makefile
37
+ - ext/extconf.rb
38
+ - ext/rb_szaru.o
39
+ - test/quantile_spec.rb
40
+ - test/top_spec.rb
41
+ - test/unique_spec.rb
42
+ - sample/sample.rb
43
+ - overview.rd
44
+ - szaru-doc.rb
45
+ has_rdoc: true
46
+ homepage: http://llamerada.github.com/SZaru/
47
+ licenses: []
48
+
49
+ post_install_message:
50
+ rdoc_options:
51
+ - szaru-doc.rb
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ hash: 3
60
+ segments:
61
+ - 0
62
+ version: "0"
63
+ required_rubygems_version: !ruby/object:Gem::Requirement
64
+ none: false
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ hash: 3
69
+ segments:
70
+ - 0
71
+ version: "0"
72
+ requirements: []
73
+
74
+ rubyforge_project:
75
+ rubygems_version: 1.3.7
76
+ signing_key:
77
+ specification_version: 3
78
+ summary: Portings of excellent Sawzall aggregators
79
+ test_files:
80
+ - test/quantile_spec.rb
81
+ - test/top_spec.rb
82
+ - test/unique_spec.rb