rfpset 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so the downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in rfpset.gemspec
gemspec
data/README.markdown ADDED
@@ -0,0 +1,41 @@
1
+ FPSet - Fast, Persistent Sets
2
+ =============================
3
+
4
+ FPSet is a very specialized library for performing large set
5
+ intersections against data that is too large to fit into memory. It
6
+ does this by storing the sets on disk in an ordered binary format and
7
+ performing the intersection as it streams just-enough data from
8
+ disk. The result is a very memory friendly and performant set
9
+ intersection that's appropriate for very large sets.
10
+
11
+ To use:
12
+
13
+ Ahead of time, presumably off-line in a monthly cron-job or something,
14
+ we build our sets:
15
+
16
+ ``` ruby
17
+ setNames.each do |name|
18
+ strings = fetch_set_named(name)
19
+ FPSet.to_file(strings, name)
20
+ end
21
+ ```
22
+
23
+ Then, presumably at runtime, we can do our big set intersections:
24
+
25
+ ``` ruby
26
+ common_terms = FPSet.intersect_files(setNames)
27
+ ```
28
+
29
+ To slurp in a set from just one of the files:
30
+
31
+ ```ruby
32
+ set = FPSet.from_file(setNames[0])
33
+ ```
34
+
35
+ This is a bundler created gem. To build and install just run:
36
+
37
+ ``` bash
38
+ gem build rfpset.gemspec
39
+ gem install rfpset-0.0.1.gem
40
+ ```
41
+
data/Rakefile ADDED
@@ -0,0 +1,38 @@
1
require "bundler/gem_tasks"
require 'rake/testtask'
require 'rake/clean'
require 'rbconfig'

NAME = 'rfpset'

# Compiled extensions use a platform specific suffix ("bundle" on macOS,
# "so" on Linux, ...), so ask Ruby for it instead of hard-coding one.
dlext = RbConfig::CONFIG['DLEXT']
ext_dir = "ext/#{NAME}"
extension = "lib/#{NAME}/#{NAME}.#{dlext}"

# rule to build the extension: this says
# that the extension should be rebuilt
# after any change to the files in ext
file extension => Dir.glob("#{ext_dir}/*{.rb,.c}") do
  Dir.chdir(ext_dir) do
    # this does essentially the same thing
    # as what RubyGems does
    ruby "extconf.rb"
    sh "make"
  end
  cp "#{ext_dir}/#{NAME}.#{dlext}", "lib/#{NAME}"
end

# make the :test task depend on the shared
# object, so it will be built automatically
# before running the tests
task :test => extension

# use 'rake clean' and 'rake clobber' to
# easily delete generated files
CLEAN.include("ext/**/*{.o,.log,.#{dlext}}")
CLEAN.include('ext/**/Makefile')
CLOBBER.include("lib/**/*.#{dlext}")

# defines the :test task; 'test' is added to the load path for the tests
Rake::TestTask.new do |t|
  t.libs << 'test'
end

desc "Run tests"
task :default => :test
data/bench.rb ADDED
@@ -0,0 +1,49 @@
1
require 'rubygems'
require 'rfpset'

# Characters the random n-grams are drawn from.
ALPHABET = Array('a'..'z').freeze

# Number of set files written by the largest benchmark round.
MAX_SETS = 7

# Build COUNT random strings, each WIDTH characters long.
def generate_ngrams(count, width)
  Array.new(count) { Array.new(width) { ALPHABET.sample }.join }
end

# Run the given block and return the wall-clock seconds it took.
def seconds_taken
  start = Time.now
  yield
  Time.now - start
end

puts "num_sets, size, fpset_write, fpset_intersect, array_intersect, set_size"
(1..MAX_SETS).each do |num_sets|
  # every set is written to a file named after its index ("0", "1", ...)
  filenames = Array.new(num_sets) { |ii| ii.to_s }

  (1..5).each do |pre_size|
    size = 30000 * pre_size

    # only the time spent inside FPSet.to_file counts as write time,
    # not the time spent generating the random data
    fpset_write = 0
    sets = filenames.map do |filename|
      ngrams = generate_ngrams(size, 4)
      fpset_write += seconds_taken { FPSet.to_file(ngrams, filename) }
      ngrams
    end

    join = nil
    fpset_intersect = seconds_taken { join = FPSet.intersect_files(filenames) }
    set_size = join.size

    # the same intersection done in memory with Array#&, for comparison
    array_intersect = seconds_taken do
      sets.reduce { |last, current| last & current }
    end

    puts "#{num_sets}, #{size}, #{fpset_write * 1000}, #{fpset_intersect * 1000}, #{array_intersect * 1000}, #{set_size}"
  end
end

# remove the set files the benchmark wrote into the working directory
MAX_SETS.times do |ii|
  File.delete(ii.to_s) if File.exist?(ii.to_s)
end
49
+
@@ -0,0 +1,8 @@
1
require 'mkmf'

# $CFLAGS << ' -g -pg -ggdb '
# $LDFLAGS << ' -g -pg '

# The extension includes <zlib.h> and calls the gz* functions, so probe for
# both the header and the library and stop with a clear message at configure
# time instead of failing later in the compiler or linker.
# have_library adds the library to the link line when it is found.
unless have_header('zlib.h') && have_library('z', 'gzopen')
  abort 'rfpset needs zlib (zlib.h and libz) to build its extension'
end

create_makefile('rfpset/rfpset')
@@ -0,0 +1,254 @@
1
+ #include <ruby.h>
2
+
3
+ #include <stdio.h>
4
+ #include <stdlib.h>
5
+ #include <string.h>
6
+ #include <zlib.h>
7
+
8
/**
 * A length-prefixed chunk of bytes. The header is allocated together
 * with the payload in one block (see blob_make), and SIZE must stay
 * the first member because blob_read reads the on-disk length
 * straight into the start of the struct.
 */
typedef struct {
  size_t size;          /* number of payload bytes currently held in data */
  size_t reserved_size; /* payload capacity requested for data */
  char data[];          /* payload; C99 flexible array member */
} blob;
13
+
14
+ typedef char* bytes;
15
+
16
+ /**
17
+ * Read COUNT bytes from SRC into DATA. Return 1 if all bytes were
18
+ * read successfully or 0 if it wasn't possible to read the requested
19
+ * number of bytes.
20
+ */
21
+ int read_confidently(gzFile src, size_t count, bytes data) {
22
+ return gzread(src, data, count);
23
+ }
24
+
25
+ blob* blob_make(size_t reserved_size) {
26
+ blob* new_blob = malloc(sizeof(blob) + reserved_size);
27
+ new_blob->size = 0;
28
+ new_blob->reserved_size = reserved_size;
29
+ return new_blob;
30
+ }
31
+
32
+ blob* blob_ensure_reserved_size(blob* datum, size_t reserved_size) {
33
+ if(reserved_size > datum->reserved_size) {
34
+ datum = realloc(datum, reserved_size);
35
+ datum->reserved_size = reserved_size;
36
+ }
37
+ return datum;
38
+ }
39
+
40
+ /**
41
+ * Read a blob from SRC into DATUM. May realloc DATUM- if not large
42
+ * enough to contain the blob so the returned value should always be
43
+ * used in place of DATUM. Returns null if read was impossible.
44
+ */
45
+ blob* blob_read(gzFile src, blob* datum) {
46
+ if(!read_confidently(src, sizeof(size_t), (char*)datum)) return NULL;
47
+ datum = blob_ensure_reserved_size(datum, datum->size);
48
+ if(!read_confidently(src, datum->size, datum->data)) return NULL;
49
+ return datum;
50
+ }
51
+
52
+ /**
53
+ * Writes DATUM to DST. Returns bytes written if success or 0 if
54
+ * failure. (A blob always has size > 0)
55
+ */
56
+ int rstring_write(gzFile dst, VALUE string) {
57
+ size_t size = RSTRING_LEN(string);
58
+ gzwrite(dst, &size, sizeof(size_t));
59
+ return gzwrite(dst, RSTRING_PTR(string), RSTRING_LEN(string));
60
+ }
61
+
62
+ int rstring_compare(const void* va, const void* vb) {
63
+ VALUE a = *(VALUE*)va;
64
+ VALUE b = *(VALUE*)vb;
65
+
66
+ size_t size_a = RSTRING_LEN(a);
67
+ size_t size_b = RSTRING_LEN(b);
68
+
69
+ if(size_a < size_b) return -1;
70
+ if(size_a > size_b) return 1;
71
+ return memcmp(RSTRING_PTR(a), RSTRING_PTR(b), size_a);
72
+ }
73
+
74
+ int blob_compare(const void * va, const void * vb) {
75
+ blob* a = *(blob**)va;
76
+ blob* b = *(blob**)vb;
77
+
78
+ if(a->size < b->size) return -1;
79
+ if(a->size > b->size) return 1;
80
+ return memcmp(a->data, b->data, a->size);
81
+ }
82
+
83
+ /**
84
+ * Sort an array of blobs in place
85
+ */
86
+ int rstring_sort_array(VALUE* strings, size_t count) {
87
+ qsort(strings, count, sizeof(VALUE), rstring_compare);
88
+ }
89
+
90
+ VALUE blob_intersect_files(gzFile* files, int file_count) {
91
+ VALUE result = rb_ary_new();
92
+
93
+ if(file_count == 0) return result;
94
+
95
+ int master_idx = 0;
96
+ int ii = 0;
97
+ blob* master_blob = blob_make(512);
98
+ blob* next_blob = blob_make(512);
99
+
100
+ // bootstrap
101
+ master_blob = blob_read(files[0], master_blob);
102
+
103
+ // until a file runs out of data
104
+ while(1) {
105
+ int all_match = 1;
106
+ int end_of_file = 0;
107
+
108
+ for(ii = 0; ii < file_count; ++ii) {
109
+ if(ii == master_idx) continue;
110
+
111
+ // read blobs from this file until they aren't less than the
112
+ // master blob
113
+ int compare_result = 0;
114
+ while(1) {
115
+ next_blob = blob_read(files[ii], next_blob);
116
+ if(next_blob == NULL) {
117
+ end_of_file = 1;
118
+ break;
119
+ } else {
120
+ compare_result = blob_compare(&next_blob, &master_blob);
121
+ if(compare_result >= 0) break;
122
+ }
123
+ }
124
+
125
+ // if any file ever reaches the end while we're looking it means
126
+ // that we've found the entire intersection
127
+ if(end_of_file) {
128
+ all_match = 0;
129
+ break;
130
+ }
131
+
132
+ // if we ever get a non-zero compare result then that means the
133
+ // current candidate is a failure and we have a new candidate to
134
+ // try
135
+ if(compare_result != 0) {
136
+ all_match = 0;
137
+ break;
138
+ }
139
+ }
140
+
141
+ // finish bailing out on end of file
142
+ if(end_of_file) break;
143
+
144
+ // store the match if we had one
145
+ if(all_match) {
146
+ rb_ary_push(result, rb_str_new(master_blob->data,
147
+ master_blob->size));
148
+ } else {
149
+ // if we didn't have a match then whichever blob failed first
150
+ // becomes the new master and we try again
151
+ blob* temp = master_blob;
152
+ master_blob = next_blob;
153
+ next_blob = temp;
154
+ master_idx = ii;
155
+ }
156
+ }
157
+
158
+ free(master_blob);
159
+ free(next_blob);
160
+
161
+ return result;
162
+ }
163
+
164
+ static VALUE rfpset_spit_array(VALUE self, VALUE array, VALUE filename) {
165
+ gzFile out = gzopen(RSTRING_PTR(filename), "wb2");
166
+ if(out == NULL) return rb_fix_new(-1);
167
+
168
+ long ii;
169
+ long size = RARRAY_LEN(array);
170
+
171
+ // sort the array in place
172
+ VALUE* values = RARRAY_PTR(array);
173
+ rstring_sort_array(values, size);
174
+
175
+ // spit them at the disk, freeing as we go
176
+ VALUE last_value = 0;
177
+ for(ii = 0; ii < size; ++ii) {
178
+ if(!last_value
179
+ || RSTRING_LEN(values[ii]) != RSTRING_LEN(last_value)
180
+ || memcmp(RSTRING_PTR(values[ii]), RSTRING_PTR(last_value),
181
+ RSTRING_LEN(last_value)) != 0) {
182
+ // this blob is unique. Write it
183
+ rstring_write(out, values[ii]);
184
+ last_value = values[ii];
185
+ }
186
+ }
187
+
188
+ gzclose(out);
189
+
190
+ // return the number of blobs written
191
+ return rb_fix_new(size);
192
+ }
193
+
194
+ VALUE rfpset_slurp_array(VALUE self, VALUE filename) {
195
+ gzFile in = gzopen(RSTRING_PTR(filename), "rb");
196
+ if(in == NULL) return rb_fix_new(-1);
197
+
198
+ VALUE array = rb_ary_new();
199
+
200
+ blob* next_blob = blob_make(512);
201
+ while((next_blob = blob_read(in, next_blob)) != NULL) {
202
+ rb_ary_push(array, rb_str_new(next_blob->data, next_blob->size));
203
+ }
204
+ gzclose(in);
205
+ free(next_blob);
206
+
207
+ return array;
208
+ }
209
+
210
+ VALUE rfpset_intersect_files(VALUE self, VALUE filenames) {
211
+ long file_count = RARRAY_LEN(filenames);
212
+ VALUE* values = RARRAY_PTR(filenames);
213
+ int ii;
214
+
215
+ if(file_count == 1) {
216
+ return rfpset_slurp_array(self, values[0]);
217
+ }
218
+
219
+ gzFile* files = malloc(sizeof(gzFile) * file_count);
220
+
221
+ // open all the files
222
+ for(ii = 0; ii < file_count; ++ii) {
223
+ const char* name = RSTRING_PTR(values[ii]);
224
+ gzFile file = gzopen(name, "rb");
225
+ if(file == NULL) break; // failure!
226
+ files[ii] = file;
227
+ }
228
+
229
+ // make sure they all opened
230
+ if(ii < file_count) {
231
+ // close them all
232
+ int jj;
233
+ for(jj = 0; jj < ii; ++jj) {
234
+ gzclose(files[jj]);
235
+ }
236
+ return rb_fix_new(-1);
237
+ }
238
+
239
+ VALUE array = blob_intersect_files(files, file_count);
240
+
241
+ // close the files
242
+ for(ii = 0; ii < file_count; ++ii) {
243
+ gzclose(files[ii]);
244
+ }
245
+
246
+ return array;
247
+ }
248
+
249
+ void Init_rfpset() {
250
+ VALUE klass = rb_define_class("FPSetInternal", rb_cObject);
251
+ rb_define_singleton_method(klass, "spit_array", rfpset_spit_array, 2);
252
+ rb_define_singleton_method(klass, "slurp_array", rfpset_slurp_array, 1);
253
+ rb_define_singleton_method(klass, "intersect_files", rfpset_intersect_files, 1);
254
+ }
@@ -0,0 +1,3 @@
1
module Rfpset
  # Version of the rfpset gem; read by rfpset.gemspec.
  # Frozen so the constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
data/lib/rfpset.rb ADDED
@@ -0,0 +1,77 @@
1
+ require "rfpset/version"
2
+ require "rfpset/rfpset"
3
+ require 'set'
4
+
5
# Public API of the gem. Elements are serialized with Marshal.dump and
# handed as strings to FPSetInternal (the C extension), which stores
# and intersects them on disk.
#
# Every filename is converted with #to_s before it reaches the
# extension, which expects real Strings.
module FPSet
  # Create a new set-file
  #
  # Example:
  #   >> FPSet.to_file([1,2,5,3,5,4], "numbers.dat")
  #
  # Arguments:
  #   data: (Enumerable, can contain duplicates)
  #   filename: (String)
  #
  # Returns whatever FPSetInternal.spit_array reports for the write.
  # Raises RuntimeError if the extension reports failure (-1).
  def to_file(data, filename)
    array = data.map { |d| Marshal.dump(d) }.to_a
    result = FPSetInternal.spit_array(array, filename.to_s)
    raise "does the file #{filename} exist?" if result == -1
    result
  end
  module_function :to_file

  # Create a new set-file. Mutates provided data to save memory.
  #
  # Example:
  #   >> arr = [1,2,5,3,5,4]
  #   >> FPSet.to_file!(arr, "numbers.dat")
  #   >> arr = nil # array is full of garbage now
  #
  # Arguments:
  #   data: (Array, will be mutated)
  #   filename: (String)
  #
  # Anything that is not an Array is passed on to to_file and left
  # untouched. Raises RuntimeError if the extension reports failure (-1).
  def to_file!(data, filename)
    return to_file(data, filename) unless data.kind_of?(Array)

    # replace every element with its serialized form in place
    data.map! { |d| Marshal.dump(d) }
    result = FPSetInternal.spit_array(data, filename.to_s)
    raise "does the file #{filename} exist?" if result == -1
    result
  end
  module_function :to_file!

  # Slurp a set-file from disk into a Ruby set.
  #
  # Example:
  #   >> set = FPSet.from_file("numbers.dat")
  #
  # Arguments:
  #   filename: (String)
  #
  # Raises RuntimeError if the extension reports failure (-1).
  def from_file(filename)
    result = FPSetInternal.slurp_array(filename.to_s)
    raise "does the file #{filename} exist?" if result == -1
    # NOTE: Marshal.load can instantiate arbitrary objects, so only read
    # set-files that come from a trusted source.
    Set.new(result.map { |s| Marshal.load(s) })
  end
  module_function :from_file

  # Compute the intersection of set-files
  #
  # Example:
  #   >> set = FPSet.intersect_files(["numbers1.dat", "numbers2.dat"])
  #
  # Arguments:
  #   filenames: (Enumerable of Strings)
  #
  # Returns an Array of the elements common to all files.
  # Raises RuntimeError if the extension reports failure (-1).
  def intersect_files(filenames)
    array = Array(filenames.map { |f| f.to_s })
    result = FPSetInternal.intersect_files(array)
    raise "do all files exist? (#{array.join(", ")})" if result == -1
    # NOTE: Marshal.load can instantiate arbitrary objects, so only read
    # set-files that come from a trusted source.
    result.map { |s| Marshal.load(s) }
  end
  module_function :intersect_files
end
data/rfpset.gemspec ADDED
@@ -0,0 +1,26 @@
1
# -*- encoding: utf-8 -*-
# Put this checkout's lib directory at the front of the load path so the
# version file below is the local one, not that of an installed rfpset.
lib = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require "rfpset/version"

Gem::Specification.new do |s|
  s.name        = "rfpset"
  s.version     = Rfpset::VERSION
  s.authors     = ["Brian Taylor"]
  s.email       = ["el.wubo@gmail.com"]
  s.homepage    = "http://www.50ply.com/blog/2012/07/21/introducing-fast/"
  s.summary     = %q{Fast, persistent sets}
  s.description = %q{Fast, persistent sets supporting efficient intersections of many very large sets.}

  # rubyforge_project= is deprecated in newer RubyGems; only set it where
  # the setter still exists.
  s.rubyforge_project = "rfpset" if s.respond_to?(:rubyforge_project=)

  # the packaged files are whatever git tracks, so the gem has to be built
  # from inside a git checkout
  s.files         = `git ls-files`.split("\n")
  s.extensions    = ['ext/rfpset/extconf.rb']

  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # specify any dependencies here; for example:
  # s.add_development_dependency "rspec"
  # s.add_runtime_dependency "rest-client"
end
@@ -0,0 +1,44 @@
1
+ require 'test/unit'
2
+ require 'rfpset'
3
+ require 'set'
4
+
5
# Round-trip tests for the C extension (FPSetInternal) and for the
# Marshal-based Ruby wrapper (FPSet).
class FPSetTest < Test::Unit::TestCase
  # set-files the tests write into the working directory
  TEST_FILES = %w[test.dat test2.dat test3.dat].freeze

  # Delete the files written by a test so runs do not leave them behind
  # or see each other's data.
  def teardown
    TEST_FILES.each { |path| File.delete(path) if File.exist?(path) }
  end

  # Exercises the raw string API of the extension.
  def test_primitive
    data = ["one", "two", "three"]
    testfile = "test.dat"
    testfile2 = "test2.dat"

    assert_equal 3, FPSetInternal.spit_array(data, testfile)

    # spit_array sorts its argument in place, so the array read back is
    # compared against the array as it is after the call
    new_data = FPSetInternal.slurp_array(testfile)
    assert_equal data, new_data

    data2 = ["three", "four", "five"]
    assert_equal 3, FPSetInternal.spit_array(data2, testfile2)

    intersect = FPSetInternal.intersect_files([testfile, testfile2])
    assert_equal 1, intersect.size
    assert_equal "three", intersect[0]
  end

  # Exercises the public FPSet API with non-string elements.
  def test_porcelain
    test1 = "test.dat"
    test2 = "test2.dat"

    # duplicates in the input must collapse to a single element
    FPSet.to_file(Array(1..5).concat(Array(1..5)), test1)
    FPSet.to_file(3..6, test2)
    assert_equal 3, FPSet.intersect_files([test1, test2]).size
    assert_equal((1..5).to_set, FPSet.from_file(test1))
    assert_equal Array(1..5), FPSet.intersect_files([test1])

    # are more interesting types preserved?
    test3_data = 5.times.collect { |i| [i.to_s] }
    test3 = "test3.dat"
    FPSet.to_file(test3_data, test3)
    assert_equal test3_data.to_set, FPSet.from_file(test3)
    # to_file! mutates its argument, so hand it a copy
    FPSet.to_file!(Array.new(test3_data), test3)
    assert_equal test3_data.to_set, FPSet.from_file(test3)
  end
end
44
+
metadata ADDED
@@ -0,0 +1,59 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rfpset
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Brian Taylor
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-07-21 00:00:00.000000000Z
13
+ dependencies: []
14
+ description: Fast, persistent sets supporting efficient intersections of many very
15
+ large sets.
16
+ email:
17
+ - el.wubo@gmail.com
18
+ executables: []
19
+ extensions:
20
+ - ext/rfpset/extconf.rb
21
+ extra_rdoc_files: []
22
+ files:
23
+ - .gitignore
24
+ - Gemfile
25
+ - README.markdown
26
+ - Rakefile
27
+ - bench.rb
28
+ - ext/rfpset/extconf.rb
29
+ - ext/rfpset/rfpset.c
30
+ - lib/rfpset.rb
31
+ - lib/rfpset/version.rb
32
+ - rfpset.gemspec
33
+ - test/test_fpset.rb
34
+ homepage: http://www.50ply.com/blog/2012/07/21/introducing-fast/
35
+ licenses: []
36
+ post_install_message:
37
+ rdoc_options: []
38
+ require_paths:
39
+ - lib
40
+ required_ruby_version: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ required_rubygems_version: !ruby/object:Gem::Requirement
47
+ none: false
48
+ requirements:
49
+ - - ! '>='
50
+ - !ruby/object:Gem::Version
51
+ version: '0'
52
+ requirements: []
53
+ rubyforge_project: rfpset
54
+ rubygems_version: 1.8.10
55
+ signing_key:
56
+ specification_version: 3
57
+ summary: Fast, persistent sets
58
+ test_files:
59
+ - test/test_fpset.rb