rfpset 0.0.1

data/.gitignore ADDED
@@ -0,0 +1,4 @@
+ *.gem
+ .bundle
+ Gemfile.lock
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source "http://rubygems.org"
+
+ # Specify your gem's dependencies in rfpset.gemspec
+ gemspec
data/README.markdown ADDED
@@ -0,0 +1,41 @@
+ FPSet - Fast, Persistent Sets
+ =============================
+
+ FPSet is a specialized library for performing large set
+ intersections against data that is too large to fit into memory. It
+ does this by storing the sets on disk in an ordered binary format and
+ performing the intersection while streaming just enough data from
+ disk. The result is a memory-friendly, fast set intersection that's
+ appropriate for very large sets.
+
+ To use:
+
+ Ahead of time, presumably offline in a monthly cron job or the like,
+ we build our sets:
+
+ ```ruby
+ set_names.each do |name|
+   strings = fetch_set_named(name)
+   FPSet.to_file(strings, name)
+ end
+ ```
+
+ Then, presumably at runtime, we can do our big set intersections:
+
+ ```ruby
+ common_terms = FPSet.intersect_files(set_names)
+ ```
+
+ To slurp in a set from just one of the files:
+
+ ```ruby
+ set = FPSet.from_file(set_names[0])
+ ```
+
+ This is a Bundler-created gem. To build and install it, run:
+
+ ```bash
+ gem build rfpset.gemspec
+ gem install rfpset-0.0.1.gem
+ ```
+
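The README above sketches the approach: each set lives on disk in sorted order, so an intersection is a streaming merge that only ever needs one candidate element per file in memory. As a rough illustration of that idea (not the gem's C implementation, which works on gzip-compressed, length-prefixed records), here is a minimal Ruby sketch of intersecting pre-sorted, de-duplicated lists with one cursor per list; `sorted_intersection` is a hypothetical helper, not part of the gem's API:

```ruby
# Illustrative only: a k-way intersection of pre-sorted, de-duplicated arrays,
# advancing one cursor per input, much like the streaming merge FPSet performs.
def sorted_intersection(sorted_lists)
  return [] if sorted_lists.empty?
  cursors = Array.new(sorted_lists.size, 0)
  result = []
  loop do
    return result if cursors[0] >= sorted_lists[0].size
    candidate = sorted_lists[0][cursors[0]]
    all_match = true
    sorted_lists.each_with_index do |list, i|
      next if i.zero?
      # skip everything smaller than the current candidate
      cursors[i] += 1 while cursors[i] < list.size && list[cursors[i]] < candidate
      return result if cursors[i] >= list.size
      all_match = false if list[cursors[i]] != candidate
    end
    result << candidate if all_match
    cursors[0] += 1
  end
end

sorted_intersection([[1, 3, 5, 7], [3, 4, 5], [0, 3, 5, 9]]) # => [3, 5]
```

Because every input is already sorted, the whole pass is linear in the total input size and never materializes more than one element per list at a time, which is what makes the disk-streaming version practical for sets that don't fit in memory.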
data/Rakefile ADDED
@@ -0,0 +1,38 @@
+ require "bundler/gem_tasks"
+ require 'rake/testtask'
+ require 'rake/clean'
+
+ NAME = 'rfpset'
+
+ # rule to build the extension: this says
+ # that the extension should be rebuilt
+ # after any change to the files in ext
+ file "lib/#{NAME}/#{NAME}.bundle" =>
+     Dir.glob("ext/#{NAME}/*{.rb,.c}") do
+   Dir.chdir("ext/#{NAME}") do
+     # this does essentially the same thing
+     # as what RubyGems does
+     ruby "extconf.rb"
+     sh "make"
+   end
+   cp "ext/#{NAME}/#{NAME}.bundle", "lib/#{NAME}"
+ end
+
+ # make the :test task depend on the shared
+ # object, so it will be built automatically
+ # before running the tests
+ task :test => "lib/#{NAME}/#{NAME}.bundle"
+
+ # use 'rake clean' and 'rake clobber' to
+ # easily delete generated files
+ CLEAN.include('ext/**/*{.o,.log,.bundle}')
+ CLEAN.include('ext/**/Makefile')
+ CLOBBER.include('lib/**/*.bundle')
+
+ # the same as before
+ Rake::TestTask.new do |t|
+   t.libs << 'test'
+ end
+
+ desc "Run tests"
+ task :default => :test
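The Rakefile above hand-rolls the extension build with a `file` task that shells out to `extconf.rb` and `make`. A common alternative (not used by this gem) is the rake-compiler gem, whose `ExtensionTask` generates the compile tasks; a minimal sketch, assuming rake-compiler is installed:

```ruby
# Hypothetical alternative using rake-compiler instead of the hand-written file task.
require 'rake/extensiontask'

Rake::ExtensionTask.new('rfpset') do |ext|
  ext.lib_dir = 'lib/rfpset'   # place the built shared object under lib/rfpset/
end

task :test => :compile          # build the extension before running the tests
```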
data/bench.rb ADDED
@@ -0,0 +1,49 @@
+ require 'rubygems'
+ require 'rfpset'
+
+ # generate some random data
+ $alphabet = Array('a'..'z')
+ def generate_ngrams(count, width)
+   count.times.map do
+     (width.times.map { $alphabet[Random.rand($alphabet.size)] }).join('')
+   end
+ end
+
+ # make n big sets
+ #num_sets = 7
+ puts "num_sets, size, fpset_write, fpset_intersect, array_intersect, set_size"
+ (1..7).each do |num_sets|
+   (1..5).each do |pre_size|
+
+     size = 30000 * pre_size
+
+     fpset_write = 0
+     sets = num_sets.times.map do |ii|
+       ngrams = generate_ngrams(size, 4)
+       start = Time.now
+       FPSet.to_file(ngrams, ii.to_s)
+       stop = Time.now
+       fpset_write += (stop - start)
+       ngrams
+     end
+
+     start = Time.now
+     join = FPSet.intersect_files(num_sets.times.map { |x| x.to_s })
+     stop = Time.now
+     fpset_intersect = stop - start
+
+     set_size = join.size
+
+     start = Time.now
+     result = sets.reduce do |last, current|
+       last & current
+     end
+
+     stop = Time.now
+     array_intersect = stop - start
+
+     puts "#{num_sets}, #{size}, #{(fpset_write*1000)}, #{(fpset_intersect*1000)}, #{(array_intersect*1000)}, #{set_size}"
+
+   end
+ end
+
data/ext/rfpset/extconf.rb ADDED
@@ -0,0 +1,8 @@
+ require 'mkmf'
+
+ # $CFLAGS << ' -g -pg -ggdb '
+ # $LDFLAGS << ' -g -pg '
+
+ #have_library('zlib', 'zlibVersion')
+ $LDFLAGS << ' -lz '
+ create_makefile('rfpset/rfpset')
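This extconf links zlib by appending `-lz` to `$LDFLAGS` directly rather than probing for the library. A more defensive variant (a sketch, not what the gem ships) would let mkmf check for the header and library and fail with a clear message when zlib is absent:

```ruby
# Hypothetical stricter extconf: abort early if zlib isn't available.
require 'mkmf'

unless have_header('zlib.h') && have_library('z', 'gzopen')
  abort 'rfpset requires the zlib development headers and library'
end

create_makefile('rfpset/rfpset')
```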
data/ext/rfpset/rfpset.c ADDED
@@ -0,0 +1,254 @@
+ #include <ruby.h>
+
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include <zlib.h>
+
+ typedef struct {
+   size_t size;
+   size_t reserved_size;
+   char data[0];
+ } blob;
+
+ typedef char* bytes;
+
+ /**
+  * Read COUNT bytes from SRC into DATA. Return 1 if all bytes were
+  * read successfully or 0 if it wasn't possible to read the requested
+  * number of bytes.
+  */
+ int read_confidently(gzFile src, size_t count, bytes data) {
+   return gzread(src, data, count) == (int)count;
+ }
+
+ blob* blob_make(size_t reserved_size) {
+   blob* new_blob = malloc(sizeof(blob) + reserved_size);
+   new_blob->size = 0;
+   new_blob->reserved_size = reserved_size;
+   return new_blob;
+ }
+
+ blob* blob_ensure_reserved_size(blob* datum, size_t reserved_size) {
+   if(reserved_size > datum->reserved_size) {
+     datum = realloc(datum, sizeof(blob) + reserved_size);
+     datum->reserved_size = reserved_size;
+   }
+   return datum;
+ }
+
+ /**
+  * Read a blob from SRC into DATUM. May realloc DATUM if it is not large
+  * enough to contain the blob, so the returned value should always be
+  * used in place of DATUM. Returns NULL if the read was impossible.
+  */
+ blob* blob_read(gzFile src, blob* datum) {
+   if(!read_confidently(src, sizeof(size_t), (char*)datum)) return NULL;
+   datum = blob_ensure_reserved_size(datum, datum->size);
+   if(!read_confidently(src, datum->size, datum->data)) return NULL;
+   return datum;
+ }
+
+ /**
+  * Writes STRING to DST. Returns bytes written if success or 0 if
+  * failure. (A blob always has size > 0)
+  */
+ int rstring_write(gzFile dst, VALUE string) {
+   size_t size = RSTRING_LEN(string);
+   gzwrite(dst, &size, sizeof(size_t));
+   return gzwrite(dst, RSTRING_PTR(string), RSTRING_LEN(string));
+ }
+
+ int rstring_compare(const void* va, const void* vb) {
+   VALUE a = *(VALUE*)va;
+   VALUE b = *(VALUE*)vb;
+
+   size_t size_a = RSTRING_LEN(a);
+   size_t size_b = RSTRING_LEN(b);
+
+   if(size_a < size_b) return -1;
+   if(size_a > size_b) return 1;
+   return memcmp(RSTRING_PTR(a), RSTRING_PTR(b), size_a);
+ }
+
+ int blob_compare(const void * va, const void * vb) {
+   blob* a = *(blob**)va;
+   blob* b = *(blob**)vb;
+
+   if(a->size < b->size) return -1;
+   if(a->size > b->size) return 1;
+   return memcmp(a->data, b->data, a->size);
+ }
+
+ /**
+  * Sort an array of Ruby strings in place
+  */
+ void rstring_sort_array(VALUE* strings, size_t count) {
+   qsort(strings, count, sizeof(VALUE), rstring_compare);
+ }
+
+ VALUE blob_intersect_files(gzFile* files, int file_count) {
+   VALUE result = rb_ary_new();
+
+   if(file_count == 0) return result;
+
+   int master_idx = 0;
+   int ii = 0;
+   blob* master_blob = blob_make(512);
+   blob* next_blob = blob_make(512);
+
+   // bootstrap
+   master_blob = blob_read(files[0], master_blob);
+
+   // until a file runs out of data
+   while(1) {
+     int all_match = 1;
+     int end_of_file = 0;
+
+     for(ii = 0; ii < file_count; ++ii) {
+       if(ii == master_idx) continue;
+
+       // read blobs from this file until they aren't less than the
+       // master blob
+       int compare_result = 0;
+       while(1) {
+         next_blob = blob_read(files[ii], next_blob);
+         if(next_blob == NULL) {
+           end_of_file = 1;
+           break;
+         } else {
+           compare_result = blob_compare(&next_blob, &master_blob);
+           if(compare_result >= 0) break;
+         }
+       }
+
+       // if any file ever reaches the end while we're looking it means
+       // that we've found the entire intersection
+       if(end_of_file) {
+         all_match = 0;
+         break;
+       }
+
+       // if we ever get a non-zero compare result then that means the
+       // current candidate is a failure and we have a new candidate to
+       // try
+       if(compare_result != 0) {
+         all_match = 0;
+         break;
+       }
+     }
+
+     // finish bailing out on end of file
+     if(end_of_file) break;
+
+     // store the match if we had one
+     if(all_match) {
+       rb_ary_push(result, rb_str_new(master_blob->data,
+                                      master_blob->size));
+     } else {
+       // if we didn't have a match then whichever blob failed first
+       // becomes the new master and we try again
+       blob* temp = master_blob;
+       master_blob = next_blob;
+       next_blob = temp;
+       master_idx = ii;
+     }
+   }
+
+   free(master_blob);
+   free(next_blob);
+
+   return result;
+ }
+
+ static VALUE rfpset_spit_array(VALUE self, VALUE array, VALUE filename) {
+   gzFile out = gzopen(RSTRING_PTR(filename), "wb2");
+   if(out == NULL) return rb_fix_new(-1);
+
+   long ii;
+   long size = RARRAY_LEN(array);
+
+   // sort the array in place
+   VALUE* values = RARRAY_PTR(array);
+   rstring_sort_array(values, size);
+
+   // spit them at the disk, freeing as we go
+   VALUE last_value = 0;
+   for(ii = 0; ii < size; ++ii) {
+     if(!last_value
+        || RSTRING_LEN(values[ii]) != RSTRING_LEN(last_value)
+        || memcmp(RSTRING_PTR(values[ii]), RSTRING_PTR(last_value),
+                  RSTRING_LEN(last_value)) != 0) {
+       // this blob is unique. Write it
+       rstring_write(out, values[ii]);
+       last_value = values[ii];
+     }
+   }
+
+   gzclose(out);
+
+   // return the number of blobs written
+   return rb_fix_new(size);
+ }
+
+ VALUE rfpset_slurp_array(VALUE self, VALUE filename) {
+   gzFile in = gzopen(RSTRING_PTR(filename), "rb");
+   if(in == NULL) return rb_fix_new(-1);
+
+   VALUE array = rb_ary_new();
+
+   blob* next_blob = blob_make(512);
+   while((next_blob = blob_read(in, next_blob)) != NULL) {
+     rb_ary_push(array, rb_str_new(next_blob->data, next_blob->size));
+   }
+   gzclose(in);
+   free(next_blob);
+
+   return array;
+ }
+
+ VALUE rfpset_intersect_files(VALUE self, VALUE filenames) {
+   long file_count = RARRAY_LEN(filenames);
+   VALUE* values = RARRAY_PTR(filenames);
+   int ii;
+
+   if(file_count == 1) {
+     return rfpset_slurp_array(self, values[0]);
+   }
+
+   gzFile* files = malloc(sizeof(gzFile) * file_count);
+
+   // open all the files
+   for(ii = 0; ii < file_count; ++ii) {
+     const char* name = RSTRING_PTR(values[ii]);
+     gzFile file = gzopen(name, "rb");
+     if(file == NULL) break; // failure!
+     files[ii] = file;
+   }
+
+   // make sure they all opened
+   if(ii < file_count) {
+     // close them all
+     int jj;
+     for(jj = 0; jj < ii; ++jj) {
+       gzclose(files[jj]);
+     }
+     return rb_fix_new(-1);
+   }
+
+   VALUE array = blob_intersect_files(files, file_count);
+
+   // close the files
+   for(ii = 0; ii < file_count; ++ii) {
+     gzclose(files[ii]);
+   }
+
+   return array;
+ }
+
+ void Init_rfpset() {
+   VALUE klass = rb_define_class("FPSetInternal", rb_cObject);
+   rb_define_singleton_method(klass, "spit_array", rfpset_spit_array, 2);
+   rb_define_singleton_method(klass, "slurp_array", rfpset_slurp_array, 1);
+   rb_define_singleton_method(klass, "intersect_files", rfpset_intersect_files, 1);
+ }
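As the C above shows, a set-file is a gzip stream (opened with `gzopen(..., "wb2")`) of records, each a native `size_t` length followed by that many bytes of a marshalled Ruby string, written in sorted order with duplicates skipped. A hedged Ruby sketch of a reader for that layout, assuming a 64-bit build where `size_t` is 8 bytes in native byte order (`each_record` is an illustrative helper, not part of the gem):

```ruby
# Illustrative reader for the rfpset set-file layout described above.
# Assumes size_t is 8 bytes in native byte order (typical 64-bit builds).
require 'zlib'

def each_record(path)
  Zlib::GzipReader.open(path) do |gz|
    loop do
      header = gz.read(8)
      break if header.nil? || header.bytesize < 8
      length = header.unpack('Q').first  # native-endian 64-bit length prefix
      yield gz.read(length)
    end
  end
end

# Each record is a Marshal.dump'd value, so FPSet.from_file is roughly:
#   Set.new(enum_for(:each_record, "numbers.dat").map { |s| Marshal.load(s) })
```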
data/lib/rfpset/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Rfpset
+   VERSION = "0.0.1"
+ end
data/lib/rfpset.rb ADDED
@@ -0,0 +1,77 @@
+ require "rfpset/version"
+ require "rfpset/rfpset"
+ require 'set'
+
+ module FPSet
+   # Create a new set-file
+   #
+   # Example:
+   #   >> FPSet.to_file([1,2,5,3,5,4], "numbers.dat")
+   #
+   # Arguments:
+   #   data: (Enumerable, can contain duplicates)
+   #   filename: (String)
+
+   def to_file(data, filename)
+     array = (data.collect { |d| Marshal.dump(d) }).to_a
+     result = FPSetInternal.spit_array(array, filename)
+     raise "does the file #{filename} exist?" if result == -1
+     return result
+   end
+   module_function :to_file
+
+   # Create a new set-file. Mutates provided data to save memory.
+   #
+   # Example:
+   #   >> arr = [1,2,5,3,5,4]
+   #   >> FPSet.to_file!(arr, "numbers.dat")
+   #   >> arr = nil # array is full of garbage now
+   #
+   # Arguments:
+   #   data: (Array, will be mutated)
+   #   filename: (String)
+
+   def to_file!(data, filename)
+     return to_file(data, filename) if not data.kind_of?(Array)
+
+     data.collect! { |d| Marshal.dump(d) }
+     result = FPSetInternal.spit_array(data, filename)
+     raise "does the file #{filename} exist?" if result == -1
+     return result
+   end
+   module_function :to_file!
+
+   # Slurp a set-file from disk into a Ruby set.
+   #
+   # Example:
+   #   >> set = FPSet.from_file("numbers.dat")
+   #
+   # Arguments:
+   #   filename: (String)
+
+   def from_file(filename)
+     result = FPSetInternal.slurp_array(filename)
+     raise "does the file #{filename} exist?" if result == -1
+     return Set.new( result.map { |s| Marshal.load(s) } )
+   end
+   module_function :from_file
+
+   # Compute the intersection of set-files
+   #
+   # Example:
+   #   >> set = FPSet.intersect_files(["numbers1.dat", "numbers2.dat"])
+   #
+   # Arguments:
+   #   filenames: (Enumerable of Strings)
+
+   def intersect_files(filenames)
+     array = Array(filenames.collect { |f| f.to_s })
+     result = FPSetInternal.intersect_files(array)
+     if result == -1 then
+       names = array.join(", ")
+       raise "do all files exist? (#{names})"
+     end
+     return result.map { |s| Marshal.load(s) }
+   end
+   module_function :intersect_files
+ end
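Since `FPSet.to_file` marshals each element before handing it to the C layer, set-files can hold any Marshal-able Ruby objects, and intersections are computed on the marshalled bytes and decoded on the way back out. A small usage sketch (the file names are illustrative):

```ruby
require 'rfpset'

# Duplicates are dropped when the file is written.
FPSet.to_file([[:a, 1], [:b, 2], [:b, 2], [:c, 3]], "left.dat")
FPSet.to_file([[:b, 2], [:c, 3], [:d, 4]], "right.dat")

FPSet.intersect_files(["left.dat", "right.dat"])  # => [[:b, 2], [:c, 3]]
FPSet.from_file("left.dat")                       # => #<Set: {[:a, 1], [:b, 2], [:c, 3]}>
```

The order of `intersect_files` results follows the on-disk sort of the marshalled bytes rather than the order in which elements were originally supplied.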
data/rfpset.gemspec ADDED
@@ -0,0 +1,26 @@
+ # -*- encoding: utf-8 -*-
+ $:.push File.expand_path("../lib", __FILE__)
+ require "rfpset/version"
+
+ Gem::Specification.new do |s|
+   s.name = "rfpset"
+   s.version = Rfpset::VERSION
+   s.authors = ["Brian Taylor"]
+   s.email = ["el.wubo@gmail.com"]
+   s.homepage = "http://www.50ply.com/blog/2012/07/21/introducing-fast/"
+   s.summary = %q{Fast, persistent sets}
+   s.description = %q{Fast, persistent sets supporting efficient intersections of many very large sets.}
+
+   s.rubyforge_project = "rfpset"
+
+   s.files = `git ls-files`.split("\n")
+   s.extensions = ['ext/rfpset/extconf.rb']
+
+   s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
+   s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+   s.require_paths = ["lib"]
+
+   # specify any dependencies here; for example:
+   # s.add_development_dependency "rspec"
+   # s.add_runtime_dependency "rest-client"
+ end
data/test/test_fpset.rb ADDED
@@ -0,0 +1,44 @@
+ require 'test/unit'
+ require 'rfpset'
+ require 'set'
+
+ class FPSetTest < Test::Unit::TestCase
+
+   def test_primitive
+     data = ["one", "two", "three"]
+     testfile = "test.dat"
+     testfile2 = "test2.dat"
+
+     assert_equal 3, FPSetInternal.spit_array(data, testfile)
+
+     new_data = FPSetInternal.slurp_array(testfile)
+     assert_equal data, new_data
+
+     data2 = ["three", "four", "five"]
+     assert_equal 3, FPSetInternal.spit_array(data2, testfile2)
+
+     intersect = FPSetInternal.intersect_files([testfile, testfile2])
+     assert_equal 1, intersect.size
+     assert_equal "three", intersect[0]
+   end
+
+   def test_porcelain
+     test1 = "test.dat"
+     test2 = "test2.dat"
+
+     FPSet.to_file(Array(1..5).concat(Array(1..5)), test1)
+     FPSet.to_file(3..6, test2)
+     assert_equal 3, FPSet.intersect_files([test1, test2]).size
+     assert_equal (1..5).to_set, FPSet.from_file(test1)
+     assert_equal Array(1..5), FPSet.intersect_files([test1])
+
+     # are more interesting types preserved?
+     test3_data = 5.times.collect { |i| [i.to_s] }
+     test3 = "test3.dat"
+     FPSet.to_file(test3_data, test3)
+     assert_equal test3_data.to_set, FPSet.from_file(test3)
+     FPSet.to_file!(Array.new(test3_data), test3)
+     assert_equal test3_data.to_set, FPSet.from_file(test3)
+   end
+ end
+
metadata ADDED
@@ -0,0 +1,59 @@
+ --- !ruby/object:Gem::Specification
+ name: rfpset
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+   prerelease:
+ platform: ruby
+ authors:
+ - Brian Taylor
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-07-21 00:00:00.000000000Z
+ dependencies: []
+ description: Fast, persistent sets supporting efficient intersections of many very
+   large sets.
+ email:
+ - el.wubo@gmail.com
+ executables: []
+ extensions:
+ - ext/rfpset/extconf.rb
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - README.markdown
+ - Rakefile
+ - bench.rb
+ - ext/rfpset/extconf.rb
+ - ext/rfpset/rfpset.c
+ - lib/rfpset.rb
+ - lib/rfpset/version.rb
+ - rfpset.gemspec
+ - test/test_fpset.rb
+ homepage: http://www.50ply.com/blog/2012/07/21/introducing-fast/
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project: rfpset
+ rubygems_version: 1.8.10
+ signing_key:
+ specification_version: 3
+ summary: Fast, persistent sets
+ test_files:
+ - test/test_fpset.rb