RubyGems - rsemantic - Versions diffs - 0.1.3 → 0.1.4 - Mend

rsemantic 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38)

data/{README.txt → README.md} +19 -10
data/lib/semantic.rb +8 -5
data/lib/semantic/compare.rb +4 -1
data/lib/semantic/corpus.rb +61 -0
data/lib/semantic/document.rb +39 -0
data/lib/semantic/matrix_transformer.rb +4 -5
data/lib/semantic/parser.rb +22 -10
data/lib/semantic/search.rb +22 -16
data/lib/semantic/search_result.rb +16 -0
data/lib/semantic/transform/lsa_transform.rb +47 -22
data/lib/semantic/transform/tf_idf_transform.rb +12 -23
data/lib/semantic/vector_space/builder.rb +29 -22
data/lib/semantic/vector_space/model.rb +14 -13
data/lib/semantic/version.rb +1 -1
data/lib/tasks/rspec.rake +13 -0
metadata +75 -107
data/Manifest.txt +0 -38
data/Rakefile +0 -9
data/config/hoe.rb +0 -69
data/config/requirements.rb +0 -15
data/gem_tasks/deployment.rake +0 -34
data/gem_tasks/environment.rake +0 -7
data/gem_tasks/examples.rake +0 -29
data/gem_tasks/fix_cr_lf.rake +0 -10
data/gem_tasks/gemspec.rake +0 -6
data/gem_tasks/rspec.rake +0 -33
data/gem_tasks/website.rake +0 -17
data/rsemantic.gemspec +0 -41
data/spec/semantic/compare_spec.rb +0 -16
data/spec/semantic/matrix_transformer_spec.rb +0 -51
data/spec/semantic/parser_spec.rb +0 -34
data/spec/semantic/search_spec.rb +0 -129
data/spec/semantic/transform/lsa_transform_spec.rb +0 -59
data/spec/semantic/transform/tf_idf_transform_spec.rb +0 -35
data/spec/semantic/vector_space/builder_spec.rb +0 -44
data/spec/semantic/vector_space/model_spec.rb +0 -22
data/spec/spec.opts +0 -2
data/spec/spec_helper.rb +0 -7

data/{README.txt → README.md} RENAMED Viewed

@@ -1,8 +1,6 @@
-= Rsemantic
+# Rsemantic
-* http://github.com/josephwilk/rsemantic
-== DESCRIPTION:
+[![Build Status](https://secure.travis-ci.org/josephwilk/rsemantic.png?branch=master)](http://travis-ci.org/josephwilk/rsemantic)
 A Ruby document vector search with flexible matrix transforms. Current supported transforms:
@@ -11,20 +9,31 @@ A Ruby document vector search with flexible matrix transforms. Current supported
 Documentation: http://github.com/josephwilk/rsemantic/wikis/home
-== REQUIREMENTS:
+## Requirements:
-* Linalg - http://rubyforge.org/projects/linalg/
+* GSL - http://www.gnu.org/software/gsl
 * stemmer - http://rubyforge.org/projects/stemmer/
-== INSTALL:
+## INSTALL:
+Note: 'brew install GSL' installs GSL 1.15, which is not yet supported by the gsl gem, so you have to switch your GSL version to 1.14.
+See here for details on how to do that with homebrew: http://bretthard.in/2012/03/getting-related_posts-lsi-and-gsl-to-work-in-jekyll/
+<pre><code>git clone git://github.com/josephwilk/rsemantic.git
+cd rsemantic
+brew install GSL
+bundle install
+</code></pre>
-* git clone git://github.com/josephwilk/rsemantic.git
+## Contributors
+* @josephwilk
+* @dominikhonnef
-== LICENSE
+## LICENSE
 (The MIT License)
-Copyright (c) 2008 Joseph Wilk
+Copyright (c) 2008-2012 Joseph Wilk
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the

data/lib/semantic.rb CHANGED Viewed

@@ -9,10 +9,13 @@ require "semantic/search"
 require "semantic/transform"
 require "semantic/version"
+require "semantic/corpus"
+require "semantic/document"
+require "semantic/search_result"
 require 'rubygems'
-require 'linalg'
-#http://rubyforge.org/projects/stemmer/
-#A processor for removing the commoner morphological and inflexional endings from words in English
+require 'gsl'
 require 'stemmer'
 require 'logger'
@@ -21,7 +24,7 @@ module Semantic
   class << self
     attr_writer :logger
   end
   def self.logger
     return @logger if @logger
     @logger = Logger.new(STDOUT)
@@ -29,5 +32,5 @@ module Semantic
     @logger.level = Logger::ERROR
     @logger
   end
 end

data/lib/semantic/compare.rb CHANGED Viewed

@@ -9,7 +9,10 @@ module Semantic
       def cosine(vector1, vector2)
         unless vector2.nil? or vector1.nil?
-          (vector2.dot(vector1)) / (vector1.norm * vector2.norm)
+          v1 = vector1.row
+          v2 = vector2
+          score =  (v1 * v2) / (vector1.norm * vector2.norm)
+          score.nan? ?  0.0 : score
         end
       end

data/lib/semantic/corpus.rb ADDED Viewed

@@ -0,0 +1,61 @@
+module Semantic
+  class Corpus
+    # @return [Array<Document>]
+    attr_reader :documents
+    # @param [Array<Document>] documents The {Document documents} to
+    #   index
+    # @param [Hash] options
+    # TODO document options
+    def initialize(documents = [], options = {})
+      @documents = documents
+      @options   = options
+      @search    = nil
+    end
+    # Adds a new {Document document} to the index.
+    #
+    # @param [Document] document
+    # @return [void]
+    def add_document(document)
+      @documents << document
+      document.corpora << self
+    end
+    alias_method :<<, :add_document
+    # Build the index. This is required to be able to search for words
+    # or compute related documents.
+    #
+    # If you add new documents, you have to rebuild the index.
+    #
+    # @return [void]
+    def build_index
+      @search = Semantic::Search.new(@documents.map(&:text), @options)
+    end
+    def search(*words)
+      # TODO raise if no index built yet
+      results = @search.search(words)
+      results.map.with_index { |result, index|
+        document = @documents[index]
+        Semantic::SearchResult.new(document, result)
+      }.sort
+    end
+    def find_related_document(document)
+      @search.related(@documents.index(document)).map.with_index { |result, index|
+        document = @documents[index]
+        Semantic::SearchResult.new(document, result)
+      }.sort
+    end
+    def find_keywords(document, num = 5)
+      # TODO allow limiting keywords to words that occur in this document
+    end
+    def to_s
+      "#<%s %d documents, @options=%s>" % [self.class.name, @documents.size, @options.inspect]
+    end
+  end
+end

data/lib/semantic/document.rb ADDED Viewed

@@ -0,0 +1,39 @@
+module Semantic
+  class Document
+    attr_reader :text
+    attr_reader :attributes
+    attr_reader :corpora
+    def initialize(text, attributes = {})
+      if text.respond_to?(:read)
+        @text = text.read
+      else
+        @text = text
+      end
+      @attributes = attributes
+      @corpora    = []
+    end
+    def to_s
+      "#<%s @attributes=%s>" % [self.class.name, @attributes.inspect]
+    end
+    def [](key)
+      @attributes[key]
+    end
+    # @todo document that it has to be part of at least one corpus
+    def related
+      results = {}
+      @corpora.each do |corpus|
+        results[corpus] = corpus.find_related_document(self)
+      end
+      results
+    end
+    def keywords(num = 5)
+    end
+  end
+end

data/lib/semantic/matrix_transformer.rb CHANGED Viewed

@@ -1,9 +1,8 @@
 module Semantic
   class MatrixTransformer
-    def initialize(options={})
-      @transforms = options[:transforms] || [:TFIDF, :LSA]
-      @options = options
+    def initialize(transforms)
+      @transforms = transforms
     end
     def apply_transforms(vector_space_model)
@@ -11,9 +10,9 @@ module Semantic
         begin
           transform_class = Semantic::Transform.const_get(transform)
           Semantic.logger.info("Applying #{transform} transform")
-          vector_space_model.matrix = transform_class.send(:transform, vector_space_model.matrix) if transform_class.respond_to?(:transform)
+          transform_class.transform!(vector_space_model.matrix)
           Semantic.logger.info(vector_space_model)
-        rescue Exception => e
+        rescue => e
           Semantic.logger.error("Error: Cannot perform transform: #{transform}")
           Semantic.logger.error(e)
         end

data/lib/semantic/parser.rb CHANGED Viewed

@@ -1,13 +1,17 @@
 require 'stemmer'
+require "set"
 module Semantic
   class Parser
-    def initialize
-      #English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
-      #TODO: nicer way to reference stop file location?
-      File.open(File.dirname(__FILE__)+'/../../resources/english.stop', 'r') do |file|
-        @stopwords = file.read().split()
+    def initialize(options = {})
+      # English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
+      # TODO: nicer way to reference stop file location?
+      @filter_stop_words = options[:filter_stop_words]
+      @stem_words        = options[:stem_words]
+      if @filter_stop_words
+        File.open(File.dirname(__FILE__) + '/../../resources/english.stop', 'r') do |file|
+          @stopwords = Set.new(file.read().split())
+        end
       end
     end
@@ -16,7 +20,7 @@ module Semantic
       remove_stop_words(word_list)
     end
-    #remove any nasty grammar tokens from string
+    # remove any nasty grammar tokens from string
     def clean(string)
       string = string.gsub(".","")
       string = string.gsub(/\s+/," ")
@@ -24,16 +28,24 @@ module Semantic
       return string
     end
-    #stop words are common words which have no search value
+    # stop words are common words which have no search value
     def remove_stop_words(list)
-      list.select {|word| word unless @stopwords.include? word }
+      if @filter_stop_words
+        list.select {|word| !@stopwords.include?(word) }
+      else
+        list
+      end
     end
     def tokenise_and_stem(string)
       string = clean(string)
       words = string.split(" ")
-      words.map {|word| word.stem }
+      if @stem_words
+        words.map {|word| Stemmer.stem_word(word) }
+      else
+        words
+      end
     end
   end

data/lib/semantic/search.rb CHANGED Viewed

@@ -1,35 +1,41 @@
 module Semantic
   class Search
-    def initialize(documents, options={})
-      Semantic.logger.level = options[:verbose] ? Logger::INFO : Logger::ERROR
-      @builder = VectorSpace::Builder.new(options)
-      @matrix_transformer = MatrixTransformer.new(options)
+    def initialize(documents, options = {})
+      options = {
+        :transforms => [:TFIDF, :LSA],
+        :verbose    => false,
+        :filter_stop_words => true,
+        :stem_words => true,
+      }.merge(options)
+      Semantic.logger.level = options[:verbose] ? Logger::INFO : Logger::ERROR
+      @builder = VectorSpace::Builder.new(:filter_stop_words => options[:filter_stop_words], :stem_words => options[:stem_words])
+      @matrix_transformer = MatrixTransformer.new(options[:transforms])
       @vector_space_model = @builder.build_document_matrix(documents)
       Semantic.logger.info(@vector_space_model)
       @vector_space_model = @matrix_transformer.apply_transforms(@vector_space_model)
     end
-    def related(documentId)
+    def related(document_id)
       ratings = []
-      for index in (0...@vector_space_model.ncol)
-        ratings << Compare.similarity(@vector_space_model.column(documentId), @vector_space_model.column(index))
+      @vector_space_model.each_column do |column|
+        ratings << Compare.similarity(@vector_space_model.column(document_id), column)
       end
       ratings
     end
-    def search(searchList)
+    def search(search_list)
       ratings = []
-      query_vector = @builder.build_query_vector(searchList)
-      for index in (0...@vector_space_model.ncol)
-        ratings << Compare.similarity(query_vector, @vector_space_model.column(index))
+      query_vector = @builder.build_query_vector(search_list)
+      @vector_space_model.each_column do |column|
+        ratings << Compare.similarity(query_vector.col, column)
       end
       ratings
     end
   end
 end

data/lib/semantic/search_result.rb ADDED Viewed

@@ -0,0 +1,16 @@
+module Semantic
+  class SearchResult
+    include Comparable
+    attr_reader :document
+    attr_reader :score
+    def initialize(document, score)
+      @document = document
+      @score    = score
+    end
+    def <=>(other)
+      @score <=> other.score
+    end
+  end
+end

data/lib/semantic/transform/lsa_transform.rb CHANGED Viewed

@@ -4,39 +4,64 @@ module Semantic
       class << self
-        def transform(matrix, number_of_dimensions_to_reduce = 1)
-          columns = matrix.num_columns
+        def transform!(matrix, rank = nil)
+          # TODO configurable rank
+          columns = matrix.size2
-          if number_of_dimensions_to_reduce <= columns #Its a valid reduction
-            u, sigma, vt = matrix.singular_value_decomposition
+          u, v, sigma = matrix.SV_decomp_mod
+          reduce_dimensions!(sigma, rank)
+          sigma = GSL::Matrix.diagonal(sigma)
-            sigma_prime = reduce_dimensions(number_of_dimensions_to_reduce, sigma)
+          GSL::Matrix.swap(matrix, u * sigma * v.transpose)
+        end
-            matrix_prime = u * sigma_prime * vt
+        private
+        def reduce_dimensions!(vector, rank)
+          # the vector is already sorted (biggest to smallest), so we
+          # only have to zero the elements we do not want
+          if rank.nil?
+            rank = determine_rank(vector)
           else
-            raise Exception, "dimension reduction cannot be greater than %s" % columns
+            rank = valid_rank(vector, rank)
           end
-          matrix_prime
+          num_to_zero_out = vector.size - rank
+          vector[rank, num_to_zero_out] = 0
         end
-        private
-        def reduce_dimensions(number_of_dimensions_to_reduce, matrix)
-          for diagonal_index in dimensions_to_be_reduced(matrix, number_of_dimensions_to_reduce)
-            matrix[diagonal_index, diagonal_index] = 0
+        def determine_rank(vector)
+          if vector.size <= 15
+            # for 15 or fewer documents, n-1 is usually the best we
+            # can do. LSA generally works better with bigger data
+            # sets.
+            rank = vector.size - 1
+          elsif vector.size <= 1000
+            # ~500 is a value known to work well for really big data
+            # sets, but for smaller ones it is probably too big, so
+            # we go for n/3 in this case.
+            rank = vector.size / 3
+          else
+            # if we have more than 1000 documents, using the magical
+            # number 500 (which can be found in various documents)
+            # seems to be the best guess for now.
+            rank = 500
           end
-          matrix
-        end
-        def dimensions_to_be_reduced(matrix, number_of_dimensions_to_reduce)
-          (diagonal_matrix_length(matrix) - number_of_dimensions_to_reduce)...diagonal_matrix_length(matrix)
         end
-        def diagonal_matrix_length(matrix)
-          matrix.num_columns < matrix.num_rows ? matrix.num_columns : matrix.num_rows
+        def valid_rank(vector, rank)
+          if rank <= 0
+            # for zero or negative ranks, drop |rank| dimensions (keep the rest)
+            rank = vector.size + rank
+          elsif rank > vector.size
+            # if the rank is > the vector size, limit it to that
+            rank = vector.size
+          else
+            rank
+          end
         end
       end
     end
   end
-end
+end