RubyGems - squish - Versions diffs - 0.0.1 - Mend

squish 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

data/CHANGELOG ADDED

@@ -0,0 +1,7 @@
+=== Squish 0.0.1
+  * initial release
+  * kinda neat
+  * needs work
+  * currently melt-your-brain slow
+  * may not be useful until it's rewritten entirely in C
+  * but it's functional and it has an easy-to-use API

data/README ADDED

@@ -0,0 +1,4 @@
+Squish is a simple classification library that uses a modified Huffman
+compression algorithm to classify resources into buckets.  While it is
+orders of magnitude slower than a naive Bayes classifier, it is potentially
+more effective for certain types of data.

data/lib/squish.rb ADDED

@@ -0,0 +1,444 @@
+#--
+# Squish, Copyright (c) 2006 Robert Aman
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#++
+class Array
+  def evaporate #:nodoc:
+    # I didn't write this method.  I haven't looked at this code long
+    # enough to figure out what it does.
+    0.upto(size - 2) do |position|
+      self[position] = at(position) & at(position + 1)
+    end
+    self[size - 1] = []
+  end
+end
+module Squish
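+  # Classifies a document, based on an array of supplied buckets.  Returns
+  # the name of the bucket that compresses the document best (lowest ratio),
+  # or nil when no buckets are supplied.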
+  def self.classify(document, buckets)
+    best_result = nil
+    best_score = nil
+    for bucket in buckets
+      score = bucket.compress(document)
+      if best_score == nil || (score < best_score)
+        best_score = score
+        best_result = bucket.name
+      end
+    end
+    return best_result
+  end
+  # Classifies a document, based on an array of supplied buckets.
+  # The document is automatically added to the bucket after classification.
+  def self.classify!(document, buckets)
+    result = self.classify(document, buckets)
+    for bucket in buckets
+      bucket << document if bucket.name == result
+    end
+    return result
+  end
+  class Bucket
+    # Creates a new bucket with a given name.
+    def initialize(name)
+      # The name is what Squish.classify returns when this bucket wins.
+      @name = name
+    end
+    # Returns the name of the bucket.
+    def name
+      return @name
+    end
+    # Returns the list of documents contained within the bucket.  Each
+    # document is simply a Hash object.
+    def documents
+      if !defined?(@documents) || @documents.nil?
+        @documents = []
+      end
+      return @documents
+    end
+    # Magnets are Strings or Regexps which can be attached to a
+    # bucket.  They cause any incoming document that matches them to be very,
+    # very strongly attracted to the bucket that they are attached to.
+    # In essence, it makes the string that the magnet matches infinitely
+    # compressible by that bucket.
+    def magnets
+      if !defined?(@magnets) || @magnets.nil?
+        @magnets = []
+      end
+      return @magnets
+    end
+    # Adds a document to the bucket.  The supplied document must be a Hash.
+    # Suggested convention is to use a Hash such as this:
+    #
+    #  {
+    #    :name => "Bob Aman",
+    #    :email => "bob@sporkmonger.com",
+    #    :body => <<-TEXT
+    #      This is some example text from a hypothetical comment I left on
+    #      someone's blog.
+    #    TEXT
+    #  }
+    #
+    # Supplying a String will convert the string to the form:
+    #
+    #  {
+    #    :body => string
+    #  }
+    #
+    # Automatically invalidates the previously calculated bucket data.
+    def <<(document)
+      self.invalidate()
+      document = {:body => document} if document.kind_of?(String)
+      self.documents << document
+      return self.documents
+    end
+    # Returns the compression ratio for the given document with this bucket.
+    # The lower this number is, the better the fit.
+    #
+    # Supplying a String will convert the string to the form:
+    #
+    #  {
+    #    :body => string
+    #  }
+    #
+    def compress(document)
+      document = {:body => document} if document.kind_of?(String)
+      # Magically compress anything matched by the magnets to nothing.
+      # This strongly attracts the document to this bucket.
+      magnetized_document = document.dup
+      for magnet in self.magnets
+        regexp = nil
+        if magnet.kind_of?(String)
+          regexp = Regexp.new(Regexp.escape(magnet))
+        elsif magnet.kind_of?(Regexp)
+          regexp = magnet
+        end
+        if regexp != nil
+          for key, value in magnetized_document
+            value.gsub!(regexp, "")
+          end
+        end
+      end
+      document_bytes = Marshal.dump(
+        Squish.filter_document(magnetized_document))
+      document_compressed_binary = ""
+      document_compressed_bytes = ""
+      sorted_symbol_table =
+        self.symbol_table.sort { |a, b| b[0].size <=> a[0].size }
+      while document_bytes.size > 0
+        for symbol, coding in sorted_symbol_table
+          symbol_regexp = Regexp.new("^" + Regexp.escape(symbol))
+          if document_bytes =~ symbol_regexp
+            document_bytes.gsub!(symbol_regexp, "")
+            document_compressed_binary << coding
+            break
+          end
+        end
+      end
+      while document_compressed_binary != nil &&
+          document_compressed_binary.size > 0
+        document_compressed_bytes <<
+          document_compressed_binary[0...8].to_i(2).chr
+        document_compressed_binary = document_compressed_binary[8..-1]
+      end
+      return (document_compressed_bytes.size.to_f /
+        Marshal.dump(document).size.to_f)
+    end
+    # Invalidates the bucket compression data.  This method should be called
+    # any time the bucket's list of documents changes.  The << method calls
+    # this method automatically.
+    def invalidate
+      @tree = nil
+      @symbol_table = nil
+    end
+  protected
+    # Returns a list of documents that have been processed by the filter.
+    def processed_documents #:nodoc:
+      processed_documents = []
+      for document in self.documents
+        processed_documents << Squish.filter_document(document)
+      end
+      return processed_documents
+    end
+    # Returns the raw document list used for compression.
+    def raw #:nodoc:
+      return Marshal.dump(self.processed_documents)
+    end
+    # Returns a hash table of symbols and their huffman codings.
+    def symbol_table #:nodoc:
+      if !defined?(@symbol_table) || @symbol_table == nil
+        table = {}
+        self.build_table(self.tree, table)
+        @symbol_table = table
+      end
+      return @symbol_table
+    end
+    # Tree traversal helper method.
+    # Originally written by Aggelos Orfanakos
+    def build_table(root, table, bitstream=[]) #:nodoc:
+      if root.kind_of?(Squish::Internal)
+        bitstream.push '0'
+        self.build_table(root.left, table, bitstream)
+        bitstream[-1] = '1'
+        self.build_table(root.right, table, bitstream)
+        bitstream.pop
+      else
+        table[root.data] = bitstream.join
+      end
+    end
+    # Returns the huffman code tree for the bucket.  This is used to test
+    # whether a document is a good match for a bucket or not.
+    def tree #:nodoc:
+      if !defined?(@tree) || @tree == nil
+        # Adapted from code written by Aggelos Orfanakos
+        forest = []
+        self.symbol_weights.each do |pair|
+          forest.push(Leaf.new(*pair.reverse))
+        end
+        # Sort once, we'll try to keep the array sorted after this
+        forest.sort!
+        while forest.length > 1
+          a = forest.pop
+          b = forest.pop
+          new_node = Internal.new(a.weight + b.weight, a, b)
+          inserted = false
+          for i in 0...forest.size
+            index = forest.size - i - 1
+            if forest[index].weight > new_node.weight
+              forest.insert(index + 1, new_node)
+              inserted = true
+              break
+            end
+          end
+          forest.unshift(new_node) if !inserted
+        end
+        @tree = forest.first
+      end
+      return @tree
+    end
+    # Returns a hash of keys and values, where each key is a string that has
+    # occurred at least once in the source, and each value is the number of
+    # times its corresponding key has appeared.
+    def symbol_occurrences(source=(self.raw + Squish.all_bytes)) #:nodoc:
+      # This method can't help being a performance bottleneck.  Anything
+      # that can be done to improve it performance-wise will be much
+      # appreciated.
+      # I didn't write this method.  Someone on IRC improved on my original
+      # method that was quite terrible, and I haven't quite figured out how
+      # his replacement method works.
+      #
+      # Outline, as read from the code below:
+      #  1. Record, for every byte value, the positions at which it occurs.
+      #  2. For every position, list the distances to all later positions
+      #     holding the same byte.
+      #  3. For substring lengths 1 through 10, record the positions of each
+      #     substring, then narrow the distance lists with Array#evaporate
+      #     before moving on to the next length.
+      #  4. Replace each position list with the number of distinct positions.
+      symbol_occurrences = {}
+      # Byte value => positions of that byte within source.
+      char_positions = Hash.new { |h, k| h[k] = [] }
+      source.unpack('C*').each_with_index do |ch, pos|
+        char_positions[ch] << pos
+      end
+      # offsets[position] holds the distances from position to every later
+      # occurrence of the same byte.
+      offsets = Array.new(source.length)
+      char_positions.each do |char, positions|
+        positions.each do |position|
+          offsets[position] = (positions.dup.delete_if do |other_position|
+            other_position <= position
+          end).map { |other_position| other_position - position }
+        end
+      end
+      (1..10).each do |length|
+        offsets.each_with_index do |offset_list, position|
+          # Single bytes are always recorded; longer substrings only while
+          # the offset list for their start position is non-empty.
+          if length == 1 || (offset_list.size + 1 >= 2)
+            if symbol_occurrences[source[position, length]] == nil
+              symbol_occurrences[source[position, length]] = []
+            end
+            symbol_occurrences[source[position, length]] << position
+            symbol_occurrences[source[position, length]].concat(
+              offset_list.map { |offset| position + offset }
+            )
+          end
+        end
+        # Keep only the offsets shared by a position and its successor.
+        offsets.evaporate
+      end
+      # The same position can be recorded several times, so count each
+      # position once.
+      for key in symbol_occurrences.keys
+        symbol_occurrences[key] = symbol_occurrences[key].uniq.size
+      end
+      return symbol_occurrences
+    end
+    # Returns a hash of keys and values, where each key is a string that has
+    # occurred at least once in the source, and each value is a weighting
+    # of occurrences multiplied by the length of the key.
+    def symbol_weights(symbol_occurrences=self.symbol_occurrences) #:nodoc:
+      symbol_weights = {}
+      for key in symbol_occurrences.keys
+        symbol_weights[key] = (key.size * symbol_occurrences[key])
+      end
+      return symbol_weights
+    end
+  end
+  # Originally written by Aggelos Orfanakos
+  class Node # :nodoc:
+    include Comparable
+    attr_reader :weight
+    def initialize(weight)
+      @weight = weight
+    end
+    def <=>(other)
+      other.weight <=> @weight
+    end
+  end
+  # Originally written by Aggelos Orfanakos
+  class Internal < Node # :nodoc:
+    attr_reader :left, :right
+    def initialize(weight, left, right)
+      super(weight)
+      @left = left
+      @right = right
+    end
+  end
+  # Originally written by Aggelos Orfanakos
+  class Leaf < Node # :nodoc:
+    attr_reader :data
+    def initialize(weight, data)
+      super(weight)
+      @data = data
+    end
+  end
+  # Returns a string containing all possible bytes.  This is appended to the
+  # raw bucket dump to ensure that all bytes can be handled by the tree,
+  # since incoming documents may contain bytes not previously encountered
+  # within training data.
+  def self.all_bytes #:nodoc:
+    if !defined?(@all_bytes) || @all_bytes == nil
+      all_bytes = ""
+      for i in 0...256
+        all_bytes << i.chr
+      end
+      @all_bytes = all_bytes
+    end
+    return @all_bytes
+  end
+  # Filters an entire document (Hash)
+  def self.filter_document(document) #:nodoc:
+    filtered_document = {}
+    for key in document.keys
+      filtered_document[key] = filter_value(document[key])
+    end
+    return filtered_document
+  end
+  # Does a visual reduction of the characters contained within the value.
+  # This prevents "1337" speak from degrading the effectiveness of the
+  # algorithm in any way.  This is intentionally a VERY lossy algorithm, and
+  # isn't particularly efficient, but it works.  The main advantage of this
+  # algorithm is that while some information may be lost from legitimate
+  # documents, more patterns will be revealed in illegitimate documents,
+  # with ultimately more critical information revealed than is lost.
+  def self.filter_value(value) #:nodoc:
+    filtered_value = value.to_s.dup
+    # Remove whitespace because spammers sometimes insert extraneous
+    # whitespace, and the main algorithm shouldn't give false positives due
+    # to a lack of whitespace, but it may give false positives due to extra
+    # whitespace.
+    filtered_value.gsub!(/\s/, "")
+    filtered_value.gsub!(/~/, "-")
+    filtered_value.gsub!(/\|/, "I")
+    filtered_value.gsub!(/!/, "I")
+    filtered_value.gsub!(/1/, "I")
+    filtered_value.gsub!(/l/, "I")
+    filtered_value.gsub!(/\+/, "t")
+    filtered_value.gsub!(/3/, "e")
+    filtered_value.gsub!(/7/, "T")
+    filtered_value.gsub!(/@/, "a")
+    filtered_value.gsub!(/4/, "A")
+    filtered_value.gsub!(/8/, "B")
+    filtered_value.gsub!(/6/, "G")
+    filtered_value.gsub!(/\$/, "S")
+    filtered_value.gsub!(/0/, "O")
+    filtered_value.gsub!(/\(\)/, "O")
+    filtered_value.gsub!(/I\)/, "D")
+    filtered_value.gsub!(/\]\)/, "D")
+    filtered_value.gsub!(/\[\)/, "D")
+    filtered_value.gsub!(/I\*/, "P")
+    filtered_value.gsub!(/\]\*/, "P")
+    filtered_value.gsub!(/\*/, "a")
+    filtered_value.gsub!(/I2/, "R")
+    filtered_value.gsub!(/I=/, "F")
+    filtered_value.gsub!(/I\\I/, "N")
+    filtered_value.gsub!(/\`\//, "Y")
+    filtered_value.gsub!(/\/\\\/\\/, "M")
+    filtered_value.gsub!(/\\\/\\\//, "W")
+    filtered_value.gsub!(/\\\/\\\//, "W")
+    filtered_value.gsub!(/I\\\/I/, "M")
+    filtered_value.gsub!(/IVI/i, "M")
+    filtered_value.gsub!(/VV/, "W")
+    filtered_value.gsub!(/\\X\//, "W")
+    filtered_value.gsub!(/\/\\\//, "N")
+    filtered_value.gsub!(/\\\/\\/, "N")
+    filtered_value.gsub!(/\/V\\/i, "M")
+    filtered_value.gsub!(/\/V/i, "N")
+    filtered_value.gsub!(/\\N/, "W")
+    filtered_value.gsub!(/\\\//, "V")
+    filtered_value.gsub!(/\>\</, "X")
+    filtered_value.gsub!(/I-I/, "H")
+    filtered_value.gsub!(/\]-\[/, "H")
+    filtered_value.gsub!(/\}\{/, "H")
+    filtered_value.gsub!(/I_I/, "U")
+    filtered_value.gsub!(/I\</, "K")
+    filtered_value.gsub!(/\]\</, "K")
+    filtered_value.gsub!(/\(/, "C")
+    filtered_value.gsub!(/\//, "I")
+    filtered_value.gsub!(/\\/, "I")
+    filtered_value.downcase!
+    return filtered_value
+  end
+end

data/lib/squish/version.rb ADDED

@@ -0,0 +1,9 @@
+module Squish
+  module SQUISH_VERSION #:nodoc:
+    MAJOR = 0
+    MINOR = 0
+    TINY  = 1
+    STRING = [MAJOR, MINOR, TINY].join('.')
+  end
+end

data/rakefile ADDED

@@ -0,0 +1,252 @@
+require 'rubygems'
+require 'rake'
+require 'rake/testtask'
+require 'rake/rdoctask'
+require 'rake/packagetask'
+require 'rake/gempackagetask'
+require 'rake/contrib/rubyforgepublisher'
+require 'spec/rake/spectask'
+require File.join(File.dirname(__FILE__), 'lib/squish', 'version')
+# Human-readable project name; the package name is its lower-cased form.
+PKG_DISPLAY_NAME   = 'Squish'
+PKG_NAME           = PKG_DISPLAY_NAME.downcase
+# Version string taken from lib/squish/version.rb, which is required above.
+PKG_VERSION        = Squish::SQUISH_VERSION::STRING
+PKG_FILE_NAME      = "#{PKG_NAME}-#{PKG_VERSION}"
+RELEASE_NAME       = "REL #{PKG_VERSION}"
+RUBY_FORGE_PROJECT = PKG_NAME
+RUBY_FORGE_USER    = "sporkmonger"
+PKG_SUMMARY        = "Resource classification library."
+PKG_DESCRIPTION    = <<-TEXT
+Squish is a simple classification library that uses a modified Huffman
+compression algorithm to classify resources into buckets.  While it is
+orders of magnitude slower than a naive Bayes classifier, it is potentially
+more effective for certain types of data.
+TEXT
+# Files to package: everything under lib, spec, doc and vendor, top-level
+# entries starting with a capital letter, and the rakefile.  Paths matching
+# CVS, ending in "~", matching database.yml, or ending in .svn/_svn are
+# excluded.
+PKG_FILES = FileList[
+    "lib/**/*", "spec/**/*", "doc/**/*", "vendor/**/*", "[A-Z]*", "rakefile"
+].exclude(/\bCVS\b|~$/).exclude(/database\.yml/).exclude(/[_\.]svn$/)
+module Rake
+  def self.browse(filepath)
+    if RUBY_PLATFORM =~ /mswin/
+      system(filepath)
+    else
+      try_browsers = lambda do
+        result = true
+        if !(`which firefox 2>&1` =~ /no firefox/)
+          system("firefox #{filepath}")
+        elsif !(`which mozilla 2>&1` =~ /no mozilla/)
+          system("mozilla #{filepath}")
+        elsif !(`which netscape 2>&1` =~ /no netscape/)
+          system("netscape #{filepath}")
+        elsif !(`which links 2>&1` =~ /no links/)
+          system("links #{filepath}")
+        elsif !(`which lynx 2>&1` =~ /no lynx/)
+          system("lynx #{filepath}")
+        else
+          result = false
+        end
+        result
+      end
+      opened = false
+      if RUBY_PLATFORM =~ /darwin/
+        opened = true
+        system("open #{filepath}")
+      elsif !(`which gnome-open 2>&1` =~ /no gnome-open/)
+        success =
+          !(`gnome-open #{filepath} 2>&1` =~ /There is no default action/)
+        if !success
+          opened = try_browsers.call()
+        else
+          opened = true
+        end
+      else
+        opened = try_browsers.call()
+      end
+      if !opened
+        puts "Don't know how to browse to location."
+      end
+    end
+  end
+end
+# Running rake without arguments runs the specs (with coverage).
+task :default => [ "spec:run" ]
+# Gem specification assembled from the PKG_* constants defined above.
+gem_spec = Gem::Specification.new do |s|
+  s.name = PKG_NAME
+  s.version = PKG_VERSION
+  s.summary = PKG_SUMMARY
+  s.description = PKG_DESCRIPTION
+  s.files = PKG_FILES.to_a
+  s.has_rdoc = true
+  s.extra_rdoc_files = %w( README )
+  s.rdoc_options.concat ['--main',  'README']
+  # NOTE(review): rake and rspec are declared as regular dependencies even
+  # though lib/squish.rb requires neither; confirm this is intended.
+  s.add_dependency('rake', '>= 0.7.2')
+  s.add_dependency('rspec', '>= 0.7.1')
+  s.require_path = 'lib'
+  s.author = "Bob Aman"
+  s.email = "bob@sporkmonger.com"
+  s.homepage = "http://sporkmonger.com/"
+  s.rubyforge_project = "squish"
+end
+# Packaging task; tar and zip archives are requested as well.
+Rake::GemPackageTask.new(gem_spec) do |p|
+  p.gem_spec = gem_spec
+  p.need_tar = true
+  p.need_zip = true
+end
+# API documentation is generated into the doc directory.
+Rake::RDocTask.new do |rdoc|
+  rdoc.rdoc_dir = 'doc'
+  rdoc.title    = "Squish -- simple resource classification"
+  rdoc.options << '--line-numbers' << '--inline-source' <<
+    '--accessor' << 'cattr_accessor=object'
+  # An alternative template can be selected through the "template"
+  # environment variable.
+  rdoc.template = "#{ENV['template']}.rb" if ENV['template']
+  rdoc.rdoc_files.include('README', 'CHANGELOG', 'TODO', 'LICENSE')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end
+namespace :rcov do
+  desc 'Open the RCov code coverage report in a browser.'
+  task :browse do
+    if !File.exists?(File.expand_path(
+        File.dirname(__FILE__) + '/coverage/index.html'))
+      Rake::Task["spec:run"].invoke
+    end
+    Rake.browse(File.expand_path(
+      File.dirname(__FILE__) + '/coverage/index.html'))
+  end
+end
+namespace :spec do
+  desc "Run all the specs"
+  Spec::Rake::SpecTask.new(:run) do |t|
+    t.spec_files = FileList['spec/**/*_spec.rb']
+    t.spec_opts = ['--color']
+    t.rcov = true
+    t.rcov_opts = [
+      # Don't include the actual spec files in the coverage report
+      '--exclude', '"spec\/.*"'
+    ]
+  end
+  desc "Run all the specs"
+  Spec::Rake::SpecTask.new(:run_without_rcov) do |t|
+    t.spec_files = FileList['spec/**/*_spec.rb']
+    t.spec_opts = ['--color']
+  end
+  # desc "Start up autotest for RSpec"
+  # task :autospec do
+  #   require 'autotest'
+  #   require 'autotest/growl'
+  #   require 'autotest/redgreen'
+  #   require 'vendor/autospec/lib/autospec'
+  #   Autospec.run
+  # end
+  desc "Print Specdoc for all specs"
+  Spec::Rake::SpecTask.new(:doc) do |t|
+    t.spec_files = FileList[
+      'spec/**/*_spec.rb'
+    ]
+    t.spec_opts = ["--format", "specdoc"]
+  end
+  desc "Generate HTML Specdocs for all specs"
+  Spec::Rake::SpecTask.new(:html) do |t|
+    if !File.exists?(
+        File.expand_path(File.dirname(__FILE__) + '/doc/'))
+      puts "Creating doc folder..."
+      Dir.mkdir(File.expand_path(File.dirname(__FILE__) + '/doc/'))
+    end
+    if !File.exists?(
+        File.expand_path(File.dirname(__FILE__) + '/doc/specs/'))
+      puts "Creating specs folder..."
+      Dir.mkdir(File.expand_path(File.dirname(__FILE__) + '/doc/specs/'))
+    end
+    t.spec_files = FileList['spec/**/*_spec.rb']
+    t.spec_opts = ["--format", "html"]
+    t.out = File.expand_path(
+      File.dirname(__FILE__) + '/doc/specs/index.html')
+  end
+  desc 'Open the RSpec HTML specifications in a browser.'
+  task :browse => [ "spec:html" ] do
+    Rake.browse(File.expand_path(
+      File.dirname(__FILE__) + '/doc/specs/index.html'))
+  end
+end
+namespace :publish do
+  desc "Publish the coverage report"
+  task :coverage => [ "spec:run" ] do
+    Rake::SshDirPublisher.new(
+      "sporkmonger@sporkmonger.com",
+      "projects/squish/coverage/",
+      "coverage/"
+    ).upload
+  end
+  desc "Publish the specifications"
+  task :specs => [ "spec:html" ] do
+    Rake::SshDirPublisher.new(
+      "sporkmonger@sporkmonger.com",
+      "projects/squish/specs/",
+      "doc/specs/"
+    ).upload
+  end
+  desc "Publish the API documentation"
+  task :api => [ "rdoc" ] do
+    if !File.exists?(
+        File.expand_path(File.dirname(__FILE__) + '/doc/specs/'))
+      puts "Creating specs folder..."
+      Dir.mkdir(File.expand_path(File.dirname(__FILE__) + '/doc/specs/'))
+    end
+    Rake::SshDirPublisher.new(
+      "sporkmonger@sporkmonger.com",
+      "projects/squish/api/",
+      "doc/"
+    ).upload
+  end
+  desc "Runs all of the publishing tasks"
+  task :all => ["publish:coverage", "publish:api", "publish:specs"] do
+  end
+end
+task :lines do
+  lines, codelines, total_lines, total_codelines = 0, 0, 0, 0
+  for file_name in FileList["lib/**/*.rb"]
+    f = File.open(file_name)
+    while line = f.gets
+      lines += 1
+      next if line =~ /^\s*$/
+      next if line =~ /^\s*#/
+      codelines += 1
+    end
+    puts "L: #{sprintf("%4d", lines)}, LOC #{sprintf("%4d", codelines)} | #{file_name}"
+    total_lines     += lines
+    total_codelines += codelines
+    lines, codelines = 0, 0
+  end
+  puts "Total: Lines #{total_lines}, LOC #{total_codelines}"
+end

data/spec/squish/squish_spec.rb ADDED

@@ -0,0 +1,241 @@
+#--
+# Squish, Copyright (c) 2006 Robert Aman
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#++
+$:.unshift(File.expand_path(File.dirname(__FILE__) + '/../../lib'))
+$:.uniq!
+require 'squish'
+# Shared fixtures.  The buckets are created once when the file is loaded and
+# are reused by every context below, so documents added by one spec remain
+# visible to the specs that run after it.
+
+# Bucket trained on a single block of "lorem ipsum" filler text.
+lorem_bucket = Squish::Bucket.new("lorem")
+lorem_bucket << {
+  :body => <<-TEXT
+Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
+tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
+quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
+consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
+cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
+proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+  TEXT
+}
+# Bucket trained on three spam comments, plus a case-insensitive magnet.
+spam_bucket = Squish::Bucket.new("spam")
+spam_bucket << {
+  :name => "Disabled",
+  :url => "http://www.artpromcompany.com/",
+  :body => <<-TEXT
+I much have powerfully interested your resource. As you see on that be
+changed reference http://www.artpromcompany.com
+  TEXT
+}
+spam_bucket << {
+  :name => "gamble in poker casinos",
+  :url => "http://www.see-the-dealer.com/",
+  :body => <<-TEXT
+The eye has some latin bingo. Some interest has one new issue. Select craps
+is some rolling Revolution. One confident Revolution poked thanks to a
+middle-class side. Lesser keno is one socialist table. It’s social to be
+overhung! It’s violent to be sat! The eldest poker misread some keno
+conductively.
+  TEXT
+}
+spam_bucket << {
+  :name => "currency rates",
+  :url => "http://www.allinforex.com/",
+  :body => <<-TEXT
+It’s valid to be lent! The exchange is untactfully legal. One war has a
+capitalist forex investment. Some currency has this wonderful forex.
+Goodness, one integral forex rates sociably input excepting that religious
+forex investment. I mean, one face is less unemployed than a medical girl.
+  TEXT
+}
+spam_bucket.magnets << /viagra/i
+# Bucket trained on one legitimate comment, plus a literal String magnet.
+valid_bucket = Squish::Bucket.new("valid")
+valid_bucket << {
+  :name => "ninja",
+  :url => nil,
+  :body => <<-TEXT
+Hey Bob,
+This article sounds alot like you’re encouraging rather than discouraging
+strict OOP. Not what I heard from you last time we spoke...
+Dont bother constructing an elaborate counter-argument... I’m too ignorant
+to understand what’s going on on this site anyway, and probabaly wouldnt
+understand what you’re talking about :)
+– Reda
+p.s – how’s it going?
+  TEXT
+}
+valid_bucket.magnets << "Bob"
+# A newly constructed bucket reports its name and starts out with empty
+# document and magnet lists.
+context "An empty bucket" do
+  setup do
+    @empty = Squish::Bucket.new("empty")
+  end
+  specify "should have the correct name" do
+    @empty.name.should == "empty"
+  end
+  specify "should not have any documents" do
+    @empty.documents.should.be.empty
+  end
+  specify "should not have any magnets" do
+    @empty.magnets.should.be.empty
+  end
+end
+# Checks the shared lorem_bucket fixture: name, stored documents, no magnets.
+context "A bucket containing 'lorem ipsum' text" do
+  setup do
+    @lorem = lorem_bucket
+  end
+  specify "should have the correct name" do
+    @lorem.name.should == "lorem"
+  end
+  specify "should have at least one 'lorem ipsum' document" do
+    @lorem.documents.should.not.be.empty
+    # ">=" rather than "==": classify! in a later context adds documents to
+    # this shared bucket.
+    @lorem.documents.size.should >= 1
+  end
+  specify "should not have any magnets" do
+    @lorem.magnets.should.be.empty
+  end
+end
+# Checks the shared spam_bucket fixture: name, stored documents, one magnet.
+context "A bucket containing several spammy documents and a viagra magnet" do
+  setup do
+    @spam = spam_bucket
+  end
+  specify "should have the correct name" do
+    @spam.name.should == "spam"
+  end
+  specify "should have multiple documents" do
+    @spam.documents.should.not.be.empty
+    # ">=" rather than "==": a later context adds documents to this shared
+    # bucket.
+    @spam.documents.size.should >= 3
+  end
+  specify "should only have one magnet" do
+    @spam.magnets.should.not.be.empty
+    @spam.magnets.size.should == 1
+  end
+end
+# End-to-end classification against the three shared buckets.  Every call
+# goes through classify!, so each classified document is also added to the
+# bucket it was assigned to and influences the specs that follow.
+context "With an array of several buckets, Squish" do
+  setup do
+    @buckets = [lorem_bucket, spam_bucket, valid_bucket]
+  end
+  specify "should correctly classify valid documents" do
+    # Both documents contain "Bob", which is a magnet on the valid bucket.
+    Squish.classify!(
+      "Hi Bob, what have you been up to lately?  Anything interesting?",
+      @buckets
+    ).should == "valid"
+    Squish.classify!(
+      "Bob, I figured out what was wrong with your ruby program.",
+      @buckets
+    ).should == "valid"
+  end
+  specify "should correctly classify spam documents" do
+    # Give the bucket a little help here, since it doesn't have enough
+    # training data.
+    spam_bucket << "Check currancy rates online!"
+    spam_bucket << "Online poker casino!"
+    spam_bucket << "Penis enlargement!"
+    spam_bucket << "Cheap online pharmacy sells viagra and cialis!"
+    spam_bucket << "Amazing mortgage rates!  Buy your home for less!"
+    spam_bucket << "Viagra! Cialis!"
+    Squish.classify!(
+      "Invest money on the foreign exchange!",
+      @buckets
+    ).should == "spam"
+    Squish.classify!(
+      "Play bingo and poker online!  Make money!",
+      @buckets
+    ).should == "spam"
+    Squish.classify!(
+      "Enlarge your penis for cheap!  She will fall in love with you again!",
+      @buckets
+    ).should == "spam"
+    # Obfuscated spellings: these rely on the character filtering applied
+    # before compression.
+    Squish.classify!(
+      "\\/|agr@!",
+      @buckets
+    ).should == "spam"
+    Squish.classify!(
+      "V / a G r A",
+      @buckets
+    ).should == "spam"
+    Squish.classify!(
+      "Buy viagra and cialis!",
+      @buckets
+    ).should == "spam"
+  end
+  specify "should correctly classify 'lorem ipsum' documents" do
+    Squish.classify!(
+      %{
+       Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Fusce
+       tincidunt augue a augue. Ut nunc. Fusce porta, sem a luctus mattis,
+       dolor dui gravida diam, a eleifend augue nibh eget nibh. Duis eu
+       justo. In viverra enim a turpis. Nullam eros. Nullam vestibulum
+       nunc vel nisi. Vestibulum ante ipsum primis in faucibus orci luctus
+       et ultrices posuere cubilia Curae; Integer feugiat lorem ut dolor.
+       Cras eget nulla. Donec velit pede, posuere vel, iaculis quis, commodo
+       sit amet, diam. Praesent pharetra velit ac enim. Donec porta tortor
+       congue nunc. Duis eu enim sit amet nulla tincidunt bibendum.
+       Donec mollis.
+      },
+      @buckets
+    ).should == "lorem"
+    Squish.classify!(
+      "Lorem ipsum dolor sit amet.",
+      @buckets
+    ).should == "lorem"
+    Squish.classify!(
+      "Lorem ipsum, you scallywag!  Vestibulum ante ipsum and such!",
+      @buckets
+    ).should == "lorem"
+    Squish.classify!(
+      "Lorem ipsum, you scallywag!",
+      @buckets
+    ).should == "lorem"
+  end
+end

metadata ADDED

@@ -0,0 +1,72 @@
+--- !ruby/object:Gem::Specification
+rubygems_version: 0.9.0
+specification_version: 1
+name: squish
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+date: 2007-03-30 00:00:00 -04:00
+summary: Resource classification library.
+require_paths:
+- lib
+email: bob@sporkmonger.com
+homepage: http://sporkmonger.com/
+rubyforge_project: squish
+description: Squish is a simple classification library that uses a modified Huffman compression algorithm to classify resources into buckets.  While it is  orders of magnitude slower than a naive Bayes classifier, it is potentially more effective for certain types of data.
+autorequire:
+default_executable:
+bindir: bin
+has_rdoc: true
+required_ruby_version: !ruby/object:Gem::Version::Requirement
+  requirements:
+  - - ">"
+    - !ruby/object:Gem::Version
+      version: 0.0.0
+  version:
+platform: ruby
+signing_key:
+cert_chain:
+post_install_message:
+authors:
+- Bob Aman
+files:
+- lib/squish
+- lib/squish.rb
+- lib/squish/version.rb
+- spec/squish
+- spec/squish/squish_spec.rb
+- doc/specs
+- CHANGELOG
+- README
+- rakefile
+test_files: []
+rdoc_options:
+- --main
+- README
+extra_rdoc_files:
+- README
+executables: []
+extensions: []
+requirements: []
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rake
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Version::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.7.2
+    version:
+- !ruby/object:Gem::Dependency
+  name: rspec
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Version::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.7.1
+    version: