rfpset 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/README.markdown +41 -0
- data/Rakefile +38 -0
- data/bench.rb +49 -0
- data/ext/rfpset/extconf.rb +8 -0
- data/ext/rfpset/rfpset.c +254 -0
- data/lib/rfpset/version.rb +3 -0
- data/lib/rfpset.rb +77 -0
- data/rfpset.gemspec +26 -0
- data/test/test_fpset.rb +44 -0
- metadata +59 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.markdown
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
FPSet - Fast, Persistent Sets
|
2
|
+
=============================
|
3
|
+
|
4
|
+
FPSet is a very specialized library for performing large set
|
5
|
+
intersections against data that is too large to fit into memory. It
|
6
|
+
does this by storing the sets on disk in an ordered binary format and
|
7
|
+
performing the intersection as it streams just-enough data from
|
8
|
+
disk. The result is a very memory friendly and performant set
|
9
|
+
intersection that's appropriate for very large sets.
|
10
|
+
|
11
|
+
To use:
|
12
|
+
|
13
|
+
Ahead of time, presumably off-line in a monthly cron-job or something,
|
14
|
+
we build our sets:
|
15
|
+
|
16
|
+
``` ruby
|
17
|
+
setNames.each do |name|
|
18
|
+
strings = fetch_set_named(name)
|
19
|
+
FPSet.to_file(strings, name)
|
20
|
+
end
|
21
|
+
```
|
22
|
+
|
23
|
+
Then, presumably at runtime, we can do our big set intersections:
|
24
|
+
|
25
|
+
``` ruby
|
26
|
+
common_terms = FPSet.intersect_files(setNames)
|
27
|
+
```
|
28
|
+
|
29
|
+
To slurp in a set from just one of the files:
|
30
|
+
|
31
|
+
```ruby
|
32
|
+
set = FPSet.from_file(setNames[0])
|
33
|
+
```
|
34
|
+
|
35
|
+
This is a bundler created gem. To build and install just run:
|
36
|
+
|
37
|
+
``` bash
|
38
|
+
gem build rfpset.gemspec
|
39
|
+
gem install rfpset-0.0.1.gem
|
40
|
+
```
|
41
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
require "bundler/gem_tasks"
require 'rake/testtask'
require 'rake/clean'

# Gem/extension name, used to locate the C sources and built library.
NAME = 'rfpset'

# rule to build the extension: this says
# that the extension should be rebuilt
# after any change to the files in ext
#
# NOTE(review): ".bundle" is the Mac OS X shared-library suffix; on
# Linux mkmf builds a ".so", so this file task (and the :test
# prerequisite below) assume a Mac build — confirm before relying on
# this Rakefile elsewhere.
file "lib/#{NAME}/#{NAME}.bundle" =>
  Dir.glob("ext/#{NAME}/*{.rb,.c}") do
  Dir.chdir("ext/#{NAME}") do
    # this does essentially the same thing
    # as what RubyGems does when installing the gem
    ruby "extconf.rb"
    sh "make"
  end
  # copy the built library next to lib/rfpset.rb so `require
  # "rfpset/rfpset"` finds it
  cp "ext/#{NAME}/#{NAME}.bundle", "lib/#{NAME}"
end

# make the :test task depend on the shared
# object, so it will be built automatically
# before running the tests
task :test => "lib/#{NAME}/#{NAME}.bundle"

# use 'rake clean' and 'rake clobber' to
# easily delete generated files
CLEAN.include('ext/**/*{.o,.log,.bundle}')
CLEAN.include('ext/**/Makefile')
CLOBBER.include('lib/**/*.bundle')

# standard Rake::TestTask: runs test/test_*.rb with test/ on the load path
Rake::TestTask.new do |t|
  t.libs << 'test'
end

desc "Run tests"
task :default => :test
|
data/bench.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'rubygems'
require 'rfpset'

# Benchmark script: compares FPSet's file-based intersection against a
# plain in-memory Array#& across a sweep of set counts and set sizes.
# Emits one CSV row per combination; all times are in milliseconds.

# generate COUNT random lowercase strings, each WIDTH characters long
$alphabet = Array('a'..'z')
def generate_ngrams(count, width)
  count.times.map do
    (width.times.map { $alphabet[Random.rand($alphabet.size)] }).join('')
  end
end

# sweep the number of sets (1..7) and the per-set element count
# (30_000 .. 150_000)
puts "num_sets, size, fpset_write, fpset_intersect, array_intersect, set_size"
(1..7).each do |num_sets|
  (1..5).each do |pre_size|

    size = 30000 * pre_size

    # build each random set, writing it to a file named after its
    # index ("0", "1", ...) and accumulating total write time
    fpset_write = 0
    sets = num_sets.times.map do |ii|
      ngrams = generate_ngrams(size, 4)
      start = Time.now
      FPSet.to_file(ngrams, ii.to_s)
      stop = Time.now
      fpset_write += (stop - start)
      ngrams
    end

    # time the streaming on-disk intersection
    start = Time.now
    join = FPSet.intersect_files(num_sets.times.map { |x| x.to_s })
    stop = Time.now
    fpset_intersect = stop - start

    set_size = join.size

    # time the in-memory Array#& equivalent for comparison
    start = Time.now
    result = sets.reduce do |last, current|
      last & current
    end

    stop = Time.now
    array_intersect = stop - start

    puts "#{num_sets}, #{size}, #{(fpset_write*1000)}, #{(fpset_intersect*1000)}, #{(array_intersect*1000)}, #{set_size}"

  end
end
|
49
|
+
|
data/ext/rfpset/rfpset.c
ADDED
@@ -0,0 +1,254 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
|
3
|
+
#include <stdio.h>
|
4
|
+
#include <stdlib.h>
|
5
|
+
#include <string.h>
|
6
|
+
#include <zlib.h>
|
7
|
+
|
8
|
+
/**
 * Length-prefixed byte buffer ("blob").  SIZE is how many payload
 * bytes are in use; RESERVED_SIZE is how many are allocated.  The
 * payload lives inline immediately after the header.
 *
 * Fixed: `char data[0]` (a GNU zero-length-array extension) replaced
 * with a C99 flexible array member.  Layout and sizeof are unchanged;
 * allocation is still malloc(sizeof(blob) + payload).
 */
typedef struct {
  size_t size;          /* payload bytes currently in use */
  size_t reserved_size; /* payload bytes currently allocated */
  char data[];          /* inline payload (flexible array member) */
} blob;

typedef char* bytes;
|
15
|
+
|
16
|
+
/**
|
17
|
+
* Read COUNT bytes from SRC into DATA. Return 1 if all bytes were
|
18
|
+
* read successfully or 0 if it wasn't possible to read the requested
|
19
|
+
* number of bytes.
|
20
|
+
*/
|
21
|
+
int read_confidently(gzFile src, size_t count, bytes data) {
|
22
|
+
return gzread(src, data, count);
|
23
|
+
}
|
24
|
+
|
25
|
+
blob* blob_make(size_t reserved_size) {
|
26
|
+
blob* new_blob = malloc(sizeof(blob) + reserved_size);
|
27
|
+
new_blob->size = 0;
|
28
|
+
new_blob->reserved_size = reserved_size;
|
29
|
+
return new_blob;
|
30
|
+
}
|
31
|
+
|
32
|
+
/**
 * Grow DATUM so its payload can hold at least RESERVED_SIZE bytes.
 * May realloc, so callers must always use the returned pointer in
 * place of DATUM.
 *
 * Bug fix: the original passed RESERVED_SIZE alone to realloc,
 * omitting the sizeof(blob) header.  Every grown buffer was therefore
 * undersized by two size_t fields, and blob_read would write that
 * many bytes past the end of the allocation (heap corruption on any
 * record larger than the initial 512-byte reserve).
 */
blob* blob_ensure_reserved_size(blob* datum, size_t reserved_size) {
  if(reserved_size > datum->reserved_size) {
    /* NOTE(review): realloc's result is unchecked, matching the rest
       of this file's allocation style; an OOM here crashes on the
       next line rather than being reported. */
    datum = realloc(datum, sizeof(blob) + reserved_size);
    datum->reserved_size = reserved_size;
  }
  return datum;
}
|
39
|
+
|
40
|
+
/**
 * Read a blob from SRC into DATUM. May realloc DATUM- if not large
 * enough to contain the blob so the returned value should always be
 * used in place of DATUM. Returns null if read was impossible
 * (normally end-of-file).
 *
 * NOTE(review): on the NULL return the buffer is not freed, and the
 * call sites all assign the result over their only pointer to it, so
 * every failed read leaks the buffer — worse, if the realloc below
 * moved it, no caller-side fix can recover the new address.  TODO:
 * free DATUM before returning NULL (and audit callers for double
 * frees) or return the buffer through an out-parameter.
 */
blob* blob_read(gzFile src, blob* datum) {
  /* Read the size_t length prefix directly into datum->size, which is
     the struct's first field. */
  if(!read_confidently(src, sizeof(size_t), (char*)datum)) return NULL;
  datum = blob_ensure_reserved_size(datum, datum->size);
  if(!read_confidently(src, datum->size, datum->data)) return NULL;
  return datum;
}
|
51
|
+
|
52
|
+
/**
 * Write the Ruby string STRING to DST as one length-prefixed record:
 * a native size_t byte count followed by the raw bytes.  Returns the
 * number of payload bytes written, or 0 on failure (gzwrite's
 * convention; records written by this gem always have size > 0 since
 * they are Marshal dumps).
 *
 * NOTE(review): the raw size_t prefix makes the file format depend on
 * the writer's word size and endianness — set-files are not portable
 * across architectures.  Confirm before sharing files between hosts.
 */
int rstring_write(gzFile dst, VALUE string) {
  size_t size = RSTRING_LEN(string);
  gzwrite(dst, &size, sizeof(size_t));
  return gzwrite(dst, RSTRING_PTR(string), RSTRING_LEN(string));
}
|
61
|
+
|
62
|
+
/**
 * qsort comparator for an array of Ruby string VALUEs: shorter
 * strings sort first, equal-length strings compare bytewise.  This
 * must order identically to blob_compare, because files are written
 * in this order and later merged with blob_compare's order.
 */
int rstring_compare(const void* va, const void* vb) {
  VALUE a = *(VALUE*)va;
  VALUE b = *(VALUE*)vb;

  size_t size_a = RSTRING_LEN(a);
  size_t size_b = RSTRING_LEN(b);

  if(size_a < size_b) return -1;
  if(size_a > size_b) return 1;
  return memcmp(RSTRING_PTR(a), RSTRING_PTR(b), size_a);
}
|
73
|
+
|
74
|
+
/**
 * Comparator for two blob pointers: shorter blobs sort first,
 * equal-length blobs compare bytewise.  Mirrors rstring_compare so
 * that on-disk order (written via rstring_compare) agrees with the
 * merge order used during intersection.
 */
int blob_compare(const void * va, const void * vb) {
  blob* a = *(blob**)va;
  blob* b = *(blob**)vb;

  if(a->size < b->size) return -1;
  if(a->size > b->size) return 1;
  return memcmp(a->data, b->data, a->size);
}
|
82
|
+
|
83
|
+
/**
|
84
|
+
* Sort an array of blobs in place
|
85
|
+
*/
|
86
|
+
int rstring_sort_array(VALUE* strings, size_t count) {
|
87
|
+
qsort(strings, count, sizeof(VALUE), rstring_compare);
|
88
|
+
}
|
89
|
+
|
90
|
+
/**
 * Core streaming intersection.  Every stream in FILES holds blobs in
 * ascending blob_compare order.  One blob is the current "master"
 * candidate; each other file is scanned forward until its next blob
 * is >= the master.  If every file produces an exact match the
 * candidate is part of the intersection; otherwise the first blob
 * that beat the candidate becomes the new candidate.  As soon as any
 * file reaches EOF the entire intersection is known.
 *
 * Returns a new Ruby array of the intersection's raw record bytes.
 *
 * Fix: if the first file is empty (or unreadable), the bootstrap read
 * leaves master_blob NULL and the original crashed dereferencing it
 * inside blob_compare; an empty source now yields an empty result.
 */
VALUE blob_intersect_files(gzFile* files, int file_count) {
  VALUE result = rb_ary_new();

  if(file_count == 0) return result;

  int master_idx = 0;
  int ii = 0;
  blob* master_blob = blob_make(512);
  blob* next_blob = blob_make(512);

  // bootstrap: first blob of the first file is the first candidate
  master_blob = blob_read(files[0], master_blob);

  // added guard: no first blob means an empty intersection
  if(master_blob == NULL) {
    free(next_blob);
    return result;
  }

  // until a file runs out of data
  while(1) {
    int all_match = 1;
    int end_of_file = 0;

    for(ii = 0; ii < file_count; ++ii) {
      if(ii == master_idx) continue;

      // read blobs from this file until they aren't less than the
      // master blob
      int compare_result = 0;
      while(1) {
        next_blob = blob_read(files[ii], next_blob);
        if(next_blob == NULL) {
          end_of_file = 1;
          break;
        } else {
          compare_result = blob_compare(&next_blob, &master_blob);
          if(compare_result >= 0) break;
        }
      }

      // if any file ever reaches the end while we're looking it means
      // that we've found the entire intersection
      if(end_of_file) {
        all_match = 0;
        break;
      }

      // if we ever get a non-zero compare result then that means the
      // current candidate is a failure and we have a new candidate to
      // try
      if(compare_result != 0) {
        all_match = 0;
        break;
      }
    }

    // finish bailing out on end of file.  NOTE(review): next_blob is
    // NULL here — its buffer was lost inside blob_read (see that
    // function's leak note) and the free below is free(NULL).
    if(end_of_file) break;

    // store the match if we had one
    if(all_match) {
      rb_ary_push(result, rb_str_new(master_blob->data,
                                     master_blob->size));
    } else {
      // if we didn't have a match then whichever blob failed first
      // becomes the new master and we try again
      blob* temp = master_blob;
      master_blob = next_blob;
      next_blob = temp;
      master_idx = ii;
    }
  }

  free(master_blob);
  free(next_blob);

  return result;
}
|
163
|
+
|
164
|
+
/**
 * FPSetInternal.spit_array(array, filename) — write ARRAY (Ruby
 * strings) to FILENAME as a sorted, de-duplicated, gzip-level-2
 * stream of length-prefixed records.
 *
 * Returns -1 (as a Fixnum) if the file could not be opened for
 * writing; otherwise the length of ARRAY.  NOTE(review): duplicates
 * are skipped on write but NOT subtracted from the returned count, so
 * the result is the input length, not the records written.  The Ruby
 * wrapper and the tests rely on this, so it is documented rather than
 * changed.
 *
 * Side effect: sorts ARRAY in place through its backing store — the
 * caller's array is mutated (see FPSet.to_file!, which exploits this).
 */
static VALUE rfpset_spit_array(VALUE self, VALUE array, VALUE filename) {
  gzFile out = gzopen(RSTRING_PTR(filename), "wb2");
  if(out == NULL) return rb_fix_new(-1);

  long ii;
  long size = RARRAY_LEN(array);

  // sort the array in place
  VALUE* values = RARRAY_PTR(array);
  rstring_sort_array(values, size);

  // spit them at the disk; sorting put duplicates adjacent, so
  // comparing against the previously written value de-duplicates
  VALUE last_value = 0;
  for(ii = 0; ii < size; ++ii) {
    if(!last_value
       || RSTRING_LEN(values[ii]) != RSTRING_LEN(last_value)
       || memcmp(RSTRING_PTR(values[ii]), RSTRING_PTR(last_value),
                 RSTRING_LEN(last_value)) != 0) {
      // this blob is unique. Write it
      rstring_write(out, values[ii]);
      last_value = values[ii];
    }
  }

  gzclose(out);

  // return the input length (not the unique-record count; see note above)
  return rb_fix_new(size);
}
|
193
|
+
|
194
|
+
/**
 * FPSetInternal.slurp_array(filename) — read every length-prefixed
 * record from the gzip set-file FILENAME into a new Ruby array of
 * strings, in file (i.e. sorted) order.
 *
 * Returns -1 (as a Fixnum) if the file could not be opened.
 *
 * NOTE(review): when blob_read hits EOF it returns NULL without
 * freeing the buffer, and the loop assignment overwrites next_blob
 * with that NULL — so the final free(next_blob) is free(NULL) and one
 * buffer leaks per call.  The root cause is blob_read's failure
 * contract; see the TODO there.
 */
VALUE rfpset_slurp_array(VALUE self, VALUE filename) {
  gzFile in = gzopen(RSTRING_PTR(filename), "rb");
  if(in == NULL) return rb_fix_new(-1);

  VALUE array = rb_ary_new();

  blob* next_blob = blob_make(512);
  while((next_blob = blob_read(in, next_blob)) != NULL) {
    rb_ary_push(array, rb_str_new(next_blob->data, next_blob->size));
  }
  gzclose(in);
  free(next_blob); /* next_blob is NULL here; see note above */

  return array;
}
|
209
|
+
|
210
|
+
/**
 * FPSetInternal.intersect_files(filenames) — intersect the set-files
 * named by FILENAMES (a Ruby array of strings).
 *
 * A single filename is simply slurped.  Returns -1 (as a Fixnum) if
 * any file fails to open (every file already opened is closed first);
 * otherwise a Ruby array of the intersection's record bytes.
 *
 * Fixed: the gzFile array was malloc'd but never freed — it leaked on
 * both the failure path and the success path.
 *
 * NOTE(review): an empty FILENAMES array falls through to
 * blob_intersect_files and returns an empty array, not the identity
 * of intersection — confirm that is the intended contract.
 */
VALUE rfpset_intersect_files(VALUE self, VALUE filenames) {
  long file_count = RARRAY_LEN(filenames);
  VALUE* values = RARRAY_PTR(filenames);
  int ii;

  if(file_count == 1) {
    return rfpset_slurp_array(self, values[0]);
  }

  gzFile* files = malloc(sizeof(gzFile) * file_count);

  // open all the files
  for(ii = 0; ii < file_count; ++ii) {
    const char* name = RSTRING_PTR(values[ii]);
    gzFile file = gzopen(name, "rb");
    if(file == NULL) break; // failure!
    files[ii] = file;
  }

  // make sure they all opened
  if(ii < file_count) {
    // close them all
    int jj;
    for(jj = 0; jj < ii; ++jj) {
      gzclose(files[jj]);
    }
    free(files); // fix: was leaked on the failure path
    return rb_fix_new(-1);
  }

  VALUE array = blob_intersect_files(files, file_count);

  // close the files
  for(ii = 0; ii < file_count; ++ii) {
    gzclose(files[ii]);
  }
  free(files); // fix: was leaked on the success path

  return array;
}
|
248
|
+
|
249
|
+
/**
 * Extension entry point, called by Ruby on `require "rfpset/rfpset"`.
 * Defines the FPSetInternal class whose singleton methods back the
 * pure-Ruby FPSet module in lib/rfpset.rb.
 */
void Init_rfpset() {
  VALUE klass = rb_define_class("FPSetInternal", rb_cObject);
  rb_define_singleton_method(klass, "spit_array", rfpset_spit_array, 2);
  rb_define_singleton_method(klass, "slurp_array", rfpset_slurp_array, 1);
  rb_define_singleton_method(klass, "intersect_files", rfpset_intersect_files, 1);
}
|
data/lib/rfpset.rb
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
require "rfpset/version"
|
2
|
+
require "rfpset/rfpset"
|
3
|
+
require 'set'
|
4
|
+
|
5
|
+
module FPSet
  # Create a new set-file
  #
  # Example:
  #   >> FPSet.to_file([1,2,5,3,5,4], "numbers.dat")
  #
  # Arguments:
  #   data: (Enumerable, can contain duplicates)
  #   filename: (String)
  #
  # Returns the number of elements handed to the writer (duplicates
  # included).  Raises if the file could not be opened for writing.

  def to_file(data, filename)
    array = (data.collect { |d| Marshal.dump(d) }).to_a
    result = FPSetInternal.spit_array(array, filename)
    # fix: the interpolation was corrupted; report the actual filename
    raise "does the file #{filename} exist?" if result == -1
    return result
  end
  module_function :to_file

  # Create a new set-file. Mutates provided data to save memory.
  #
  # Example:
  #   >> arr = [1,2,5,3,5,4]
  #   >> FPSet.to_file!(arr, "numbers.dat")
  #   >> arr = nil # array is full of garbage now
  #
  # Arguments:
  #   data: (Array, will be mutated)
  #   filename: (String)

  def to_file!(data, filename)
    # non-Arrays can't be mutated in place; fall back to the copying path
    return to_file(data, filename) if not data.kind_of?(Array)

    data.collect! { |d| Marshal.dump(d) }
    result = FPSetInternal.spit_array(data, filename)
    raise "does the file #{filename} exist?" if result == -1
    return result
  end
  module_function :to_file!

  # Slurp a set-file from disk into a Ruby set.
  #
  # Example:
  #   >> set = FPSet.from_file("numbers.dat")
  #
  # Arguments:
  #   filename: (String)
  #
  # NOTE(review): set-files are deserialized with Marshal.load, which
  # can execute arbitrary code on crafted input — only read set-files
  # you wrote yourself.

  def from_file(filename)
    result = FPSetInternal.slurp_array(filename)
    raise "does the file #{filename} exist?" if result == -1
    return Set.new( result.map { |s| Marshal.load(s) } )
  end
  module_function :from_file

  # Compute the intersection of set-files
  #
  # Example:
  #   >> set = FPSet.intersect_files(["numbers1.dat", "numbers2.dat"])
  #
  # Arguments:
  #   filenames: (Enumerable of Strings)
  #
  # Returns an Array (not a Set) of the common elements.

  def intersect_files(filenames)
    array = Array(filenames.collect { |f| f.to_s })
    result = FPSetInternal.intersect_files(array)
    if result == -1 then
      names = array.join(", ")
      raise "do all files exist? (#{names})"
    end
    return result.map { |s| Marshal.load(s) }
  end
  module_function :intersect_files
end
|
data/rfpset.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)
require "rfpset/version"

# Gem packaging for rfpset.  File lists come from `git ls-files`, so
# this gemspec must be evaluated inside a git checkout; the C
# extension is built at install time via ext/rfpset/extconf.rb.
Gem::Specification.new do |s|
  s.name        = "rfpset"
  s.version     = Rfpset::VERSION
  s.authors     = ["Brian Taylor"]
  s.email       = ["el.wubo@gmail.com"]
  s.homepage    = "http://www.50ply.com/blog/2012/07/21/introducing-fast/"
  s.summary     = %q{Fast, persistent sets}
  s.description = %q{Fast, persistent sets supporting efficient intersections of many very large sets.}

  s.rubyforge_project = "rfpset"

  s.files         = `git ls-files`.split("\n")
  s.extensions    = ['ext/rfpset/extconf.rb']

  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # specify any dependencies here; for example:
  # s.add_development_dependency "rspec"
  # s.add_runtime_dependency "rest-client"
end
|
data/test/test_fpset.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'rfpset'
|
3
|
+
require 'set'
|
4
|
+
|
5
|
+
class FPSetTest < Test::Unit::TestCase

  # Exercises the raw C extension (FPSetInternal) directly with plain
  # strings, bypassing the Marshal layer.
  def test_primitive
    data = ["one", "two", "three"]
    testfile = "test.dat"
    testfile2 = "test2.dat"

    # spit_array reports the input length, not the unique-record count
    assert_equal 3, FPSetInternal.spit_array(data, testfile)

    # NOTE(review): spit_array sorts its argument in place; this
    # comparison passes because ["one", "two", "three"] happens to
    # already be in length-then-bytes order — confirm before adding
    # fixtures that aren't.
    new_data = FPSetInternal.slurp_array(testfile)
    assert_equal data, new_data

    data2 = ["three", "four", "five"]
    assert_equal 3, FPSetInternal.spit_array(data2, testfile2)

    # only "three" is common to both files
    intersect = FPSetInternal.intersect_files([testfile, testfile2])
    assert_equal 1, intersect.size
    assert_equal "three", intersect[0]
  end

  # Exercises the public FPSet porcelain, including Marshal round
  # trips of non-string values.
  def test_porcelain
    test1 = "test.dat"
    test2 = "test2.dat"

    # duplicates are de-duplicated on disk; any Enumerable is accepted
    FPSet.to_file(Array(1..5).concat(Array(1..5)), test1)
    FPSet.to_file(3..6, test2)
    # {3, 4, 5} is the overlap of 1..5 and 3..6
    assert_equal 3, FPSet.intersect_files([test1, test2]).size
    assert_equal (1..5).to_set, FPSet.from_file(test1)
    # intersecting a single file just slurps it, as an Array
    assert_equal Array(1..5), FPSet.intersect_files([test1])

    # are more interesting types preserved?
    test3_data = 5.times.collect { |i| [i.to_s] }
    test3 = "test3.dat"
    FPSet.to_file(test3_data, test3)
    assert_equal test3_data.to_set, FPSet.from_file(test3)
    # the destructive variant mutates its argument, so hand it a copy
    FPSet.to_file!(Array.new(test3_data), test3)
    assert_equal test3_data.to_set, FPSet.from_file(test3)
  end
end
|
44
|
+
|
metadata
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rfpset
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Brian Taylor
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-07-21 00:00:00.000000000Z
|
13
|
+
dependencies: []
|
14
|
+
description: Fast, persistent sets supporting efficient intersections of many very
|
15
|
+
large sets.
|
16
|
+
email:
|
17
|
+
- el.wubo@gmail.com
|
18
|
+
executables: []
|
19
|
+
extensions:
|
20
|
+
- ext/rfpset/extconf.rb
|
21
|
+
extra_rdoc_files: []
|
22
|
+
files:
|
23
|
+
- .gitignore
|
24
|
+
- Gemfile
|
25
|
+
- README.markdown
|
26
|
+
- Rakefile
|
27
|
+
- bench.rb
|
28
|
+
- ext/rfpset/extconf.rb
|
29
|
+
- ext/rfpset/rfpset.c
|
30
|
+
- lib/rfpset.rb
|
31
|
+
- lib/rfpset/version.rb
|
32
|
+
- rfpset.gemspec
|
33
|
+
- test/test_fpset.rb
|
34
|
+
homepage: http://www.50ply.com/blog/2012/07/21/introducing-fast/
|
35
|
+
licenses: []
|
36
|
+
post_install_message:
|
37
|
+
rdoc_options: []
|
38
|
+
require_paths:
|
39
|
+
- lib
|
40
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
47
|
+
none: false
|
48
|
+
requirements:
|
49
|
+
- - ! '>='
|
50
|
+
- !ruby/object:Gem::Version
|
51
|
+
version: '0'
|
52
|
+
requirements: []
|
53
|
+
rubyforge_project: rfpset
|
54
|
+
rubygems_version: 1.8.10
|
55
|
+
signing_key:
|
56
|
+
specification_version: 3
|
57
|
+
summary: Fast, persistent sets
|
58
|
+
test_files:
|
59
|
+
- test/test_fpset.rb
|