rfpset 0.0.1

data/.gitignore ADDED
@@ -0,0 +1,4 @@
+ *.gem
+ .bundle
+ Gemfile.lock
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source "http://rubygems.org"
+
+ # Specify your gem's dependencies in rfpset.gemspec
+ gemspec
data/README.markdown ADDED
@@ -0,0 +1,41 @@
+ FPSet - Fast, Persistent Sets
+ =============================
+
+ FPSet is a specialized library for performing large set
+ intersections against data that is too large to fit into memory. It
+ does this by storing the sets on disk in an ordered binary format and
+ performing the intersection while streaming just enough data from
+ disk. The result is a memory-friendly, fast set intersection that's
+ appropriate for very large sets.
+
+ To use:
+
+ Ahead of time, presumably offline in a monthly cron job or the like,
+ we build our sets:
+
+ ```ruby
+ set_names.each do |name|
+   strings = fetch_set_named(name)
+   FPSet.to_file(strings, name)
+ end
+ ```
+
+ Then, presumably at runtime, we can do our big set intersections:
+
+ ```ruby
+ common_terms = FPSet.intersect_files(set_names)
+ ```
+
+ To slurp in a set from just one of the files:
+
+ ```ruby
+ set = FPSet.from_file(set_names[0])
+ ```
+
+ This is a Bundler-created gem. To build and install it, run:
+
+ ```bash
+ gem build rfpset.gemspec
+ gem install rfpset-0.0.1.gem
+ ```
+
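The README above sketches the approach: each set lives on disk in sorted order, so an intersection is a streaming merge that only ever needs one candidate element per file in memory. As a rough illustration of that idea (not the gem's C implementation, which works on gzip-compressed, length-prefixed records), here is a minimal Ruby sketch of intersecting pre-sorted, de-duplicated lists with one cursor per list; `sorted_intersection` is a hypothetical helper, not part of the gem's API:

```ruby
# Illustrative only: a k-way intersection of pre-sorted, de-duplicated arrays,
# advancing one cursor per input, much like the streaming merge FPSet performs.
def sorted_intersection(sorted_lists)
  return [] if sorted_lists.empty?
  cursors = Array.new(sorted_lists.size, 0)
  result = []
  loop do
    return result if cursors[0] >= sorted_lists[0].size
    candidate = sorted_lists[0][cursors[0]]
    all_match = true
    sorted_lists.each_with_index do |list, i|
      next if i.zero?
      # skip everything smaller than the current candidate
      cursors[i] += 1 while cursors[i] < list.size && list[cursors[i]] < candidate
      return result if cursors[i] >= list.size
      all_match = false if list[cursors[i]] != candidate
    end
    result << candidate if all_match
    cursors[0] += 1
  end
end

sorted_intersection([[1, 3, 5, 7], [3, 4, 5], [0, 3, 5, 9]]) # => [3, 5]
```

Because every input is already sorted, the whole pass is linear in the total input size and never materializes more than one element per list at a time, which is what makes the disk-streaming version practical for sets that don't fit in memory.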
data/Rakefile ADDED
@@ -0,0 +1,38 @@
+ require "bundler/gem_tasks"
+ require 'rake/testtask'
+ require 'rake/clean'
+
+ NAME = 'rfpset'
+
+ # rule to build the extension: this says
+ # that the extension should be rebuilt
+ # after any change to the files in ext
+ file "lib/#{NAME}/#{NAME}.bundle" =>
+     Dir.glob("ext/#{NAME}/*{.rb,.c}") do
+   Dir.chdir("ext/#{NAME}") do
+     # this does essentially the same thing
+     # as what RubyGems does
+     ruby "extconf.rb"
+     sh "make"
+   end
+   cp "ext/#{NAME}/#{NAME}.bundle", "lib/#{NAME}"
+ end
+
+ # make the :test task depend on the shared
+ # object, so it will be built automatically
+ # before running the tests
+ task :test => "lib/#{NAME}/#{NAME}.bundle"
+
+ # use 'rake clean' and 'rake clobber' to
+ # easily delete generated files
+ CLEAN.include('ext/**/*{.o,.log,.bundle}')
+ CLEAN.include('ext/**/Makefile')
+ CLOBBER.include('lib/**/*.bundle')
+
+ # the same as before
+ Rake::TestTask.new do |t|
+   t.libs << 'test'
+ end
+
+ desc "Run tests"
+ task :default => :test
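The Rakefile above hand-rolls the extension build with a `file` task that shells out to `extconf.rb` and `make`. A common alternative (not used by this gem) is the rake-compiler gem, whose `ExtensionTask` generates the compile tasks; a minimal sketch, assuming rake-compiler is installed:

```ruby
# Hypothetical alternative using rake-compiler instead of the hand-written file task.
require 'rake/extensiontask'

Rake::ExtensionTask.new('rfpset') do |ext|
  ext.lib_dir = 'lib/rfpset'   # place the built shared object under lib/rfpset/
end

task :test => :compile          # build the extension before running the tests
```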
data/bench.rb ADDED
@@ -0,0 +1,49 @@
+ require 'rubygems'
+ require 'rfpset'
+
+ # generate some random data
+ $alphabet = Array('a'..'z')
+ def generate_ngrams(count, width)
+   count.times.map do
+     (width.times.map { $alphabet[Random.rand($alphabet.size)] }).join('')
+   end
+ end
+
+ # make n big sets
+ #num_sets = 7
+ puts "num_sets, size, fpset_write, fpset_intersect, array_intersect, set_size"
+ (1..7).each do |num_sets|
+   (1..5).each do |pre_size|
+
+     size = 30000 * pre_size
+
+     fpset_write = 0
+     sets = num_sets.times.map do |ii|
+       ngrams = generate_ngrams(size, 4)
+       start = Time.now
+       FPSet.to_file(ngrams, ii.to_s)
+       stop = Time.now
+       fpset_write += (stop - start)
+       ngrams
+     end
+
+     start = Time.now
+     join = FPSet.intersect_files(num_sets.times.map { |x| x.to_s })
+     stop = Time.now
+     fpset_intersect = stop - start
+
+     set_size = join.size
+
+     start = Time.now
+     result = sets.reduce do |last, current|
+       last & current
+     end
+
+     stop = Time.now
+     array_intersect = stop - start
+
+     puts "#{num_sets}, #{size}, #{(fpset_write*1000)}, #{(fpset_intersect*1000)}, #{(array_intersect*1000)}, #{set_size}"
+
+   end
+ end
+
data/ext/rfpset/extconf.rb ADDED
@@ -0,0 +1,8 @@
+ require 'mkmf'
+
+ # $CFLAGS << ' -g -pg -ggdb '
+ # $LDFLAGS << ' -g -pg '
+
+ #have_library('zlib', 'zlibVersion')
+ $LDFLAGS << ' -lz '
+ create_makefile('rfpset/rfpset')
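This extconf links zlib by appending `-lz` to `$LDFLAGS` directly rather than probing for the library. A more defensive variant (a sketch, not what the gem ships) would let mkmf check for the header and library and fail with a clear message when zlib is absent:

```ruby
# Hypothetical stricter extconf: abort early if zlib isn't available.
require 'mkmf'

unless have_header('zlib.h') && have_library('z', 'gzopen')
  abort 'rfpset requires the zlib development headers and library'
end

create_makefile('rfpset/rfpset')
```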
data/ext/rfpset/rfpset.c ADDED
@@ -0,0 +1,254 @@
+ #include <ruby.h>
+
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include <zlib.h>
+
+ typedef struct {
+   size_t size;
+   size_t reserved_size;
+   char data[0];
+ } blob;
+
+ typedef char* bytes;
+
+ /**
+  * Read COUNT bytes from SRC into DATA. Return 1 if all bytes were
+  * read successfully or 0 if it wasn't possible to read the requested
+  * number of bytes.
+  */
+ int read_confidently(gzFile src, size_t count, bytes data) {
+   return gzread(src, data, count) == (int)count;
+ }
+
+ blob* blob_make(size_t reserved_size) {
+   blob* new_blob = malloc(sizeof(blob) + reserved_size);
+   new_blob->size = 0;
+   new_blob->reserved_size = reserved_size;
+   return new_blob;
+ }
+
+ blob* blob_ensure_reserved_size(blob* datum, size_t reserved_size) {
+   if(reserved_size > datum->reserved_size) {
+     datum = realloc(datum, sizeof(blob) + reserved_size);
+     datum->reserved_size = reserved_size;
+   }
+   return datum;
+ }
+
+ /**
+  * Read a blob from SRC into DATUM. May realloc DATUM if it is not large
+  * enough to contain the blob, so the returned value should always be
+  * used in place of DATUM. Returns NULL if the read was impossible.
+  */
+ blob* blob_read(gzFile src, blob* datum) {
+   if(!read_confidently(src, sizeof(size_t), (char*)datum)) return NULL;
+   datum = blob_ensure_reserved_size(datum, datum->size);
+   if(!read_confidently(src, datum->size, datum->data)) return NULL;
+   return datum;
+ }
+
+ /**
+  * Writes STRING to DST. Returns bytes written if success or 0 if
+  * failure. (A blob always has size > 0)
+  */
+ int rstring_write(gzFile dst, VALUE string) {
+   size_t size = RSTRING_LEN(string);
+   gzwrite(dst, &size, sizeof(size_t));
+   return gzwrite(dst, RSTRING_PTR(string), RSTRING_LEN(string));
+ }
+
+ int rstring_compare(const void* va, const void* vb) {
+   VALUE a = *(VALUE*)va;
+   VALUE b = *(VALUE*)vb;
+
+   size_t size_a = RSTRING_LEN(a);
+   size_t size_b = RSTRING_LEN(b);
+
+   if(size_a < size_b) return -1;
+   if(size_a > size_b) return 1;
+   return memcmp(RSTRING_PTR(a), RSTRING_PTR(b), size_a);
+ }
+
+ int blob_compare(const void * va, const void * vb) {
+   blob* a = *(blob**)va;
+   blob* b = *(blob**)vb;
+
+   if(a->size < b->size) return -1;
+   if(a->size > b->size) return 1;
+   return memcmp(a->data, b->data, a->size);
+ }
+
+ /**
+  * Sort an array of Ruby strings in place
+  */
+ void rstring_sort_array(VALUE* strings, size_t count) {
+   qsort(strings, count, sizeof(VALUE), rstring_compare);
+ }
+
+ VALUE blob_intersect_files(gzFile* files, int file_count) {
+   VALUE result = rb_ary_new();
+
+   if(file_count == 0) return result;
+
+   int master_idx = 0;
+   int ii = 0;
+   blob* master_blob = blob_make(512);
+   blob* next_blob = blob_make(512);
+
+   // bootstrap
+   master_blob = blob_read(files[0], master_blob);
+
+   // until a file runs out of data
+   while(1) {
+     int all_match = 1;
+     int end_of_file = 0;
+
+     for(ii = 0; ii < file_count; ++ii) {
+       if(ii == master_idx) continue;
+
+       // read blobs from this file until they aren't less than the
+       // master blob
+       int compare_result = 0;
+       while(1) {
+         next_blob = blob_read(files[ii], next_blob);
+         if(next_blob == NULL) {
+           end_of_file = 1;
+           break;
+         } else {
+           compare_result = blob_compare(&next_blob, &master_blob);
+           if(compare_result >= 0) break;
+         }
+       }
+
+       // if any file ever reaches the end while we're looking it means
+       // that we've found the entire intersection
+       if(end_of_file) {
+         all_match = 0;
+         break;
+       }
+
+       // if we ever get a non-zero compare result then that means the
+       // current candidate is a failure and we have a new candidate to
+       // try
+       if(compare_result != 0) {
+         all_match = 0;
+         break;
+       }
+     }
+
+     // finish bailing out on end of file
+     if(end_of_file) break;
+
+     // store the match if we had one
+     if(all_match) {
+       rb_ary_push(result, rb_str_new(master_blob->data,
+                                      master_blob->size));
+     } else {
+       // if we didn't have a match then whichever blob failed first
+       // becomes the new master and we try again
+       blob* temp = master_blob;
+       master_blob = next_blob;
+       next_blob = temp;
+       master_idx = ii;
+     }
+   }
+
+   free(master_blob);
+   free(next_blob);
+
+   return result;
+ }
+
+ static VALUE rfpset_spit_array(VALUE self, VALUE array, VALUE filename) {
+   gzFile out = gzopen(RSTRING_PTR(filename), "wb2");
+   if(out == NULL) return rb_fix_new(-1);
+
+   long ii;
+   long size = RARRAY_LEN(array);
+
+   // sort the array in place
+   VALUE* values = RARRAY_PTR(array);
+   rstring_sort_array(values, size);
+
+   // spit them at the disk, freeing as we go
+   VALUE last_value = 0;
+   for(ii = 0; ii < size; ++ii) {
+     if(!last_value
+        || RSTRING_LEN(values[ii]) != RSTRING_LEN(last_value)
+        || memcmp(RSTRING_PTR(values[ii]), RSTRING_PTR(last_value),
+                  RSTRING_LEN(last_value)) != 0) {
+       // this blob is unique. Write it
+       rstring_write(out, values[ii]);
+       last_value = values[ii];
+     }
+   }
+
+   gzclose(out);
+
+   // return the number of blobs written
+   return rb_fix_new(size);
+ }
+
+ VALUE rfpset_slurp_array(VALUE self, VALUE filename) {
+   gzFile in = gzopen(RSTRING_PTR(filename), "rb");
+   if(in == NULL) return rb_fix_new(-1);
+
+   VALUE array = rb_ary_new();
+
+   blob* next_blob = blob_make(512);
+   while((next_blob = blob_read(in, next_blob)) != NULL) {
+     rb_ary_push(array, rb_str_new(next_blob->data, next_blob->size));
+   }
+   gzclose(in);
+   free(next_blob);
+
+   return array;
+ }
+
+ VALUE rfpset_intersect_files(VALUE self, VALUE filenames) {
+   long file_count = RARRAY_LEN(filenames);
+   VALUE* values = RARRAY_PTR(filenames);
+   int ii;
+
+   if(file_count == 1) {
+     return rfpset_slurp_array(self, values[0]);
+   }
+
+   gzFile* files = malloc(sizeof(gzFile) * file_count);
+
+   // open all the files
+   for(ii = 0; ii < file_count; ++ii) {
+     const char* name = RSTRING_PTR(values[ii]);
+     gzFile file = gzopen(name, "rb");
+     if(file == NULL) break; // failure!
+     files[ii] = file;
+   }
+
+   // make sure they all opened
+   if(ii < file_count) {
+     // close them all
+     int jj;
+     for(jj = 0; jj < ii; ++jj) {
+       gzclose(files[jj]);
+     }
+     return rb_fix_new(-1);
+   }
+
+   VALUE array = blob_intersect_files(files, file_count);
+
+   // close the files
+   for(ii = 0; ii < file_count; ++ii) {
+     gzclose(files[ii]);
+   }
+
+   return array;
+ }
+
+ void Init_rfpset() {
+   VALUE klass = rb_define_class("FPSetInternal", rb_cObject);
+   rb_define_singleton_method(klass, "spit_array", rfpset_spit_array, 2);
+   rb_define_singleton_method(klass, "slurp_array", rfpset_slurp_array, 1);
+   rb_define_singleton_method(klass, "intersect_files", rfpset_intersect_files, 1);
+ }
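As the C above shows, a set-file is a gzip stream (opened with `gzopen(..., "wb2")`) of records, each a native `size_t` length followed by that many bytes of a marshalled Ruby string, written in sorted order with duplicates skipped. A hedged Ruby sketch of a reader for that layout, assuming a 64-bit build where `size_t` is 8 bytes in native byte order (`each_record` is an illustrative helper, not part of the gem):

```ruby
# Illustrative reader for the rfpset set-file layout described above.
# Assumes size_t is 8 bytes in native byte order (typical 64-bit builds).
require 'zlib'

def each_record(path)
  Zlib::GzipReader.open(path) do |gz|
    loop do
      header = gz.read(8)
      break if header.nil? || header.bytesize < 8
      length = header.unpack('Q').first  # native-endian 64-bit length prefix
      yield gz.read(length)
    end
  end
end

# Each record is a Marshal.dump'd value, so FPSet.from_file is roughly:
#   Set.new(enum_for(:each_record, "numbers.dat").map { |s| Marshal.load(s) })
```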
data/lib/rfpset/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Rfpset
+   VERSION = "0.0.1"
+ end
data/lib/rfpset.rb ADDED
@@ -0,0 +1,77 @@
+ require "rfpset/version"
+ require "rfpset/rfpset"
+ require 'set'
+
+ module FPSet
+   # Create a new set-file
+   #
+   # Example:
+   #   >> FPSet.to_file([1,2,5,3,5,4], "numbers.dat")
+   #
+   # Arguments:
+   #   data: (Enumerable, can contain duplicates)
+   #   filename: (String)
+
+   def to_file(data, filename)
+     array = (data.collect { |d| Marshal.dump(d) }).to_a
+     result = FPSetInternal.spit_array(array, filename)
+     raise "does the file #{filename} exist?" if result == -1
+     return result
+   end
+   module_function :to_file
+
+   # Create a new set-file. Mutates provided data to save memory.
+   #
+   # Example:
+   #   >> arr = [1,2,5,3,5,4]
+   #   >> FPSet.to_file!(arr, "numbers.dat")
+   #   >> arr = nil # array is full of garbage now
+   #
+   # Arguments:
+   #   data: (Array, will be mutated)
+   #   filename: (String)
+
+   def to_file!(data, filename)
+     return to_file(data, filename) if not data.kind_of?(Array)
+
+     data.collect! { |d| Marshal.dump(d) }
+     result = FPSetInternal.spit_array(data, filename)
+     raise "does the file #{filename} exist?" if result == -1
+     return result
+   end
+   module_function :to_file!
+
+   # Slurp a set-file from disk into a Ruby set.
+   #
+   # Example:
+   #   >> set = FPSet.from_file("numbers.dat")
+   #
+   # Arguments:
+   #   filename: (String)
+
+   def from_file(filename)
+     result = FPSetInternal.slurp_array(filename)
+     raise "does the file #{filename} exist?" if result == -1
+     return Set.new( result.map { |s| Marshal.load(s) } )
+   end
+   module_function :from_file
+
+   # Compute the intersection of set-files
+   #
+   # Example:
+   #   >> set = FPSet.intersect_files(["numbers1.dat", "numbers2.dat"])
+   #
+   # Arguments:
+   #   filenames: (Enumerable of Strings)
+
+   def intersect_files(filenames)
+     array = Array(filenames.collect { |f| f.to_s })
+     result = FPSetInternal.intersect_files(array)
+     if result == -1 then
+       names = array.join(", ")
+       raise "do all files exist? (#{names})"
+     end
+     return result.map { |s| Marshal.load(s) }
+   end
+   module_function :intersect_files
+ end
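Since `FPSet.to_file` marshals each element before handing it to the C layer, set-files can hold any Marshal-able Ruby objects, and intersections are computed on the marshalled bytes and decoded on the way back out. A small usage sketch (the file names are illustrative):

```ruby
require 'rfpset'

# Duplicates are dropped when the file is written.
FPSet.to_file([[:a, 1], [:b, 2], [:b, 2], [:c, 3]], "left.dat")
FPSet.to_file([[:b, 2], [:c, 3], [:d, 4]], "right.dat")

FPSet.intersect_files(["left.dat", "right.dat"])  # => [[:b, 2], [:c, 3]]
FPSet.from_file("left.dat")                       # => #<Set: {[:a, 1], [:b, 2], [:c, 3]}>
```

The order of `intersect_files` results follows the on-disk sort of the marshalled bytes rather than the order in which elements were originally supplied.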
data/rfpset.gemspec ADDED
@@ -0,0 +1,26 @@
+ # -*- encoding: utf-8 -*-
+ $:.push File.expand_path("../lib", __FILE__)
+ require "rfpset/version"
+
+ Gem::Specification.new do |s|
+   s.name = "rfpset"
+   s.version = Rfpset::VERSION
+   s.authors = ["Brian Taylor"]
+   s.email = ["el.wubo@gmail.com"]
+   s.homepage = "http://www.50ply.com/blog/2012/07/21/introducing-fast/"
+   s.summary = %q{Fast, persistent sets}
+   s.description = %q{Fast, persistent sets supporting efficient intersections of many very large sets.}
+
+   s.rubyforge_project = "rfpset"
+
+   s.files = `git ls-files`.split("\n")
+   s.extensions = ['ext/rfpset/extconf.rb']
+
+   s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
+   s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+   s.require_paths = ["lib"]
+
+   # specify any dependencies here; for example:
+   # s.add_development_dependency "rspec"
+   # s.add_runtime_dependency "rest-client"
+ end
data/test/test_fpset.rb ADDED
@@ -0,0 +1,44 @@
+ require 'test/unit'
+ require 'rfpset'
+ require 'set'
+
+ class FPSetTest < Test::Unit::TestCase
+
+   def test_primitive
+     data = ["one", "two", "three"]
+     testfile = "test.dat"
+     testfile2 = "test2.dat"
+
+     assert_equal 3, FPSetInternal.spit_array(data, testfile)
+
+     new_data = FPSetInternal.slurp_array(testfile)
+     assert_equal data, new_data
+
+     data2 = ["three", "four", "five"]
+     assert_equal 3, FPSetInternal.spit_array(data2, testfile2)
+
+     intersect = FPSetInternal.intersect_files([testfile, testfile2])
+     assert_equal 1, intersect.size
+     assert_equal "three", intersect[0]
+   end
+
+   def test_porcelain
+     test1 = "test.dat"
+     test2 = "test2.dat"
+
+     FPSet.to_file(Array(1..5).concat(Array(1..5)), test1)
+     FPSet.to_file(3..6, test2)
+     assert_equal 3, FPSet.intersect_files([test1, test2]).size
+     assert_equal (1..5).to_set, FPSet.from_file(test1)
+     assert_equal Array(1..5), FPSet.intersect_files([test1])
+
+     # are more interesting types preserved?
+     test3_data = 5.times.collect { |i| [i.to_s] }
+     test3 = "test3.dat"
+     FPSet.to_file(test3_data, test3)
+     assert_equal test3_data.to_set, FPSet.from_file(test3)
+     FPSet.to_file!(Array.new(test3_data), test3)
+     assert_equal test3_data.to_set, FPSet.from_file(test3)
+   end
+ end
+
metadata ADDED
@@ -0,0 +1,59 @@
+ --- !ruby/object:Gem::Specification
+ name: rfpset
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+   prerelease:
+ platform: ruby
+ authors:
+ - Brian Taylor
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-07-21 00:00:00.000000000Z
+ dependencies: []
+ description: Fast, persistent sets supporting efficient intersections of many very
+   large sets.
+ email:
+ - el.wubo@gmail.com
+ executables: []
+ extensions:
+ - ext/rfpset/extconf.rb
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - README.markdown
+ - Rakefile
+ - bench.rb
+ - ext/rfpset/extconf.rb
+ - ext/rfpset/rfpset.c
+ - lib/rfpset.rb
+ - lib/rfpset/version.rb
+ - rfpset.gemspec
+ - test/test_fpset.rb
+ homepage: http://www.50ply.com/blog/2012/07/21/introducing-fast/
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project: rfpset
+ rubygems_version: 1.8.10
+ signing_key:
+ specification_version: 3
+ summary: Fast, persistent sets
+ test_files:
+ - test/test_fpset.rb