RubyGems - tf-idf-similarity - Versions diffs - 0.0.9 → 0.1.0 - Mend

tf-idf-similarity 0.0.9 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

data/.travis.yml +29 -0
data/Gemfile +4 -0
data/README.md +41 -29
data/lib/tf-idf-similarity.rb +12 -1
data/lib/tf-idf-similarity/document.rb +35 -28
data/lib/tf-idf-similarity/extras/document.rb +2 -125
data/lib/tf-idf-similarity/extras/tf_idf_model.rb +192 -0
data/lib/tf-idf-similarity/matrix_methods.rb +164 -0
data/lib/tf-idf-similarity/term_count_model.rb +78 -0
data/lib/tf-idf-similarity/tf_idf_model.rb +81 -0
data/lib/tf-idf-similarity/token.rb +34 -12
data/lib/tf-idf-similarity/version.rb +1 -1
data/spec/document_spec.rb +136 -0
data/spec/extras/tf_idf_model_spec.rb +269 -0
data/spec/spec_helper.rb +21 -0
data/spec/term_count_model_spec.rb +108 -0
data/spec/tf_idf_model_spec.rb +174 -0
data/spec/token_spec.rb +34 -0
data/td-idf-similarity.gemspec +3 -3
metadata +91 -63
data/lib/tf-idf-similarity/collection.rb +0 -205
data/lib/tf-idf-similarity/extras/collection.rb +0 -110

data/spec/token_spec.rb ADDED

@@ -0,0 +1,34 @@
+# coding: utf-8
+require 'spec_helper'
+describe TfIdfSimilarity::Token do
+  describe '#valid?' do
+    it 'should return false if all of its characters are numbers, punctuation or whitespace characters' do
+      TfIdfSimilarity::Token.new('1 2 3 ! @ #').valid?.should == false
+    end
+    it 'should return true if not all of its characters are numbers, punctuation or whitespace characters' do
+      TfIdfSimilarity::Token.new('1 2 3 ! @ # a').valid?.should == true
+    end
+  end
+  describe '#lowercase_filter' do
+    it 'should lowercase the token' do
+      TfIdfSimilarity::Token.new('HÉTÉROGÉNÉITÉ').lowercase_filter.should == 'hétérogénéité'
+    end
+  end
+  describe '#classic_filter' do
+    it 'should remove all periods' do
+      TfIdfSimilarity::Token.new('X.Y.Z.').classic_filter.should == 'XYZ'
+    end
+    it 'should remove ending possessives' do
+      TfIdfSimilarity::Token.new("foo's").classic_filter.should == 'foo'
+    end
+    it 'should not remove infix possessives' do
+      TfIdfSimilarity::Token.new("foo's bar").classic_filter.should == "foo's bar"
+    end
+  end
+end

data/td-idf-similarity.gemspec CHANGED

@@ -1,6 +1,5 @@
 # -*- encoding: utf-8 -*-
-$:.push File.expand_path("../lib", __FILE__)
-require "tf-idf-similarity/version"
+require File.expand_path('../lib/tf-idf-similarity/version', __FILE__)
 Gem::Specification.new do |s|
   s.name        = "tf-idf-similarity"
@@ -16,7 +15,8 @@ Gem::Specification.new do |s|
   s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
   s.require_paths = ["lib"]
-  s.add_runtime_dependency('unicode_utils')
+  s.add_runtime_dependency('unicode_utils') unless RUBY_VERSION < '1.9'
   s.add_development_dependency('rspec', '~> 2.10')
   s.add_development_dependency('rake')
+  s.add_development_dependency('coveralls')
 end

metadata CHANGED

@@ -1,71 +1,76 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: tf-idf-similarity
-version: !ruby/object:Gem::Version
-  version: 0.0.9
+version: !ruby/object:Gem::Version
+  hash: 27
   prerelease:
+  segments:
+  - 0
+  - 1
+  - 0
+  version: 0.1.0
 platform: ruby
-authors:
+authors:
 - Open North
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-01-07 00:00:00.000000000 Z
-dependencies:
-- !ruby/object:Gem::Dependency
-  name: unicode_utils
-  requirement: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ! '>='
-      - !ruby/object:Gem::Version
-        version: '0'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ! '>='
-      - !ruby/object:Gem::Version
-        version: '0'
-- !ruby/object:Gem::Dependency
+date: 2013-06-03 00:00:00 -04:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
   name: rspec
-  requirement: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ~>
-      - !ruby/object:Gem::Version
-        version: '2.10'
-  type: :development
   prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
+  requirement: &id001 !ruby/object:Gem::Requirement
     none: false
-    requirements:
+    requirements:
     - - ~>
-      - !ruby/object:Gem::Version
-        version: '2.10'
-- !ruby/object:Gem::Dependency
+      - !ruby/object:Gem::Version
+        hash: 23
+        segments:
+        - 2
+        - 10
+        version: "2.10"
+  type: :development
+  version_requirements: *id001
+- !ruby/object:Gem::Dependency
   name: rake
-  requirement: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
     none: false
-    requirements:
-    - - ! '>='
-      - !ruby/object:Gem::Version
-        version: '0'
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
   type: :development
+  version_requirements: *id002
+- !ruby/object:Gem::Dependency
+  name: coveralls
   prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
+  requirement: &id003 !ruby/object:Gem::Requirement
     none: false
-    requirements:
-    - - ! '>='
-      - !ruby/object:Gem::Version
-        version: '0'
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
+  type: :development
+  version_requirements: *id003
 description:
-email:
+email:
 - info@opennorth.ca
 executables: []
 extensions: []
 extra_rdoc_files: []
-files:
+files:
 - .gitignore
 - .travis.yml
 - .yardopts
@@ -75,36 +80,59 @@ files:
 - Rakefile
 - USAGE
 - lib/tf-idf-similarity.rb
-- lib/tf-idf-similarity/collection.rb
 - lib/tf-idf-similarity/document.rb
-- lib/tf-idf-similarity/extras/collection.rb
 - lib/tf-idf-similarity/extras/document.rb
+- lib/tf-idf-similarity/extras/tf_idf_model.rb
+- lib/tf-idf-similarity/matrix_methods.rb
+- lib/tf-idf-similarity/term_count_model.rb
+- lib/tf-idf-similarity/tf_idf_model.rb
 - lib/tf-idf-similarity/token.rb
 - lib/tf-idf-similarity/version.rb
+- spec/document_spec.rb
+- spec/extras/tf_idf_model_spec.rb
+- spec/spec_helper.rb
+- spec/term_count_model_spec.rb
+- spec/tf_idf_model_spec.rb
+- spec/token_spec.rb
 - td-idf-similarity.gemspec
+has_rdoc: true
 homepage: http://github.com/opennorth/tf-idf-similarity
 licenses: []
 post_install_message:
 rdoc_options: []
-require_paths:
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
+required_ruby_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - - ! '>='
-    - !ruby/object:Gem::Version
-      version: '0'
-required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - - ! '>='
-    - !ruby/object:Gem::Version
-      version: '0'
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.24
+rubygems_version: 1.6.2
 signing_key:
 specification_version: 3
 summary: Calculates the similarity between texts using tf*idf
-test_files: []
-has_rdoc:
+test_files:
+- spec/document_spec.rb
+- spec/extras/tf_idf_model_spec.rb
+- spec/spec_helper.rb
+- spec/term_count_model_spec.rb
+- spec/tf_idf_model_spec.rb
+- spec/token_spec.rb

data/lib/tf-idf-similarity/collection.rb DELETED

@@ -1,205 +0,0 @@
-# @todo Do speed comparison between these gsl and narray, to load fastest first.
-begin
-  require 'gsl'
-rescue LoadError
-  begin
-    require 'narray'
-  rescue LoadError
-    require 'matrix'
-  end
-end
-class TfIdfSimilarity::Collection
-  class CollectionError < StandardError; end
-  # The documents in the collection.
-  attr_reader :documents
-  # The number of times each term appears in all documents.
-  attr_reader :term_counts
-  # The number of documents each term appears in.
-  attr_reader :document_counts
-  def initialize
-    @documents       = []
-    @term_counts     = Hash.new 0
-    @document_counts = Hash.new 0
-  end
-  def <<(document)
-    document.term_counts.each do |term,count|
-      @term_counts[term]     += count
-      @document_counts[term] += 1
-    end
-    @documents << document
-  end
-  # @return [Array<String>] the set of the collection's terms with no duplicates
-  def terms
-    term_counts.keys
-  end
-  # @param [Hash] opts optional arguments
-  # @option opts [Symbol] :function one of :tfidf (default) or :bm25
-  #
-  # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
-  # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
-  # @see http://en.wikipedia.org/wiki/Vector_space_model
-  # @see http://en.wikipedia.org/wiki/Document-term_matrix
-  # @see http://en.wikipedia.org/wiki/Cosine_similarity
-  # @see http://en.wikipedia.org/wiki/Okapi_BM25
-  def similarity_matrix(opts = {})
-    if documents.empty?
-      raise CollectionError, "No documents in collection"
-    end
-    # Calculate tf*idf.
-    if stdlib?
-      idf = []
-      matrix = Matrix.build(terms.size, documents.size) do |i,j|
-        idf[i] ||= inverse_document_frequency(terms[i], opts)
-        idf[i] * term_frequency(documents[j], terms[i], opts)
-      end
-    else
-      matrix = initialize_matrix
-      terms.each_with_index do |term,i|
-        idf = inverse_document_frequency(term, opts)
-        documents.each_with_index do |document,j|
-          value = idf * term_frequency(document, term, opts)
-          # NArray puts the dimensions in a different order.
-          # @see http://narray.rubyforge.org/SPEC.en
-          if narray?
-            matrix[j, i] = value
-          else
-            matrix[i, j] = value
-          end
-        end
-      end
-    end
-    # Columns are normalized to unit vectors, so we can calculate the cosine
-    # similarity of all document vectors. BM25 doesn't normalize columns, but
-    # BM25 wasn't written with this use case in mind.
-    matrix = normalize matrix
-    if nmatrix?
-      matrix.transpose.dot matrix
-    else
-      matrix.transpose * matrix
-    end
-  end
-  # @param [Document] document a document
-  # @param [String] term a term
-  # @param [Hash] opts optional arguments
-  # @option opts [Symbol] :function one of :tfidf (default) or :bm25
-  # @return [Float] the term's frequency in the document
-  def term_frequency_inverse_document_frequency(document, term, opts = {})
-    inverse_document_frequency(term, opts) * term_frequency(document, term, opts)
-  end
-  alias_method :tfidf, :term_frequency_inverse_document_frequency
-  # @param [String] term a term
-  # @param [Hash] opts optional arguments
-  # @option opts [Symbol] :function one of :tfidf (default) or :bm25
-  # @return [Float] the term's inverse document frequency
-  def inverse_document_frequency(term, opts = {})
-    if opts[:function] == :bm25
-      Math.log (documents.size - document_counts[term] + 0.5) / (document_counts[term] + 0.5)
-    else
-      1 + Math.log(documents.size / (document_counts[term].to_f + 1))
-    end
-  end
-  alias_method :idf, :inverse_document_frequency
-  # @param [Document] document a document
-  # @param [String] term a term
-  # @param [Hash] opts optional arguments
-  # @option opts [Symbol] :function one of :tfidf (default) or :bm25
-  # @return [Float] the term's frequency in the document
-  #
-  # @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
-  def term_frequency(document, term, opts = {})
-    if opts[:function] == :bm25
-      (document.term_counts[term].to_i * 2.2) / (document.term_counts[term].to_i + 0.3 + 0.9 * document.size / average_document_size)
-    else
-      document.term_frequency term
-    end
-  end
-  alias_method :tf, :term_frequency
-  # @return [Float] the average document size, in terms
-  def average_document_size
-    if documents.empty?
-      raise CollectionError, "No documents in collection"
-    end
-    @average_document_size ||= documents.map(&:size).reduce(:+) / documents.size.to_f
-  end
-  # Resets the average document size.
-  #
-  # If you have already made a similarity matrix and are adding more documents,
-  # call this method before creating a new similarity matrix.
-  def reset_average_document_size!
-    @average_document_size = nil
-  end
-  # @param [Document] matrix a term-document matrix
-  # @return [GSL::Matrix,NMatrix,Matrix] a matrix in which all document vectors are unit vectors
-  #
-  # @note Lucene normalizes document length differently.
-  def normalize(matrix)
-    if gsl?
-      matrix.each_col(&:normalize!)
-    elsif narray?
-      # @see https://github.com/masa16/narray/issues/21
-      NMatrix.refer(matrix / NMath.sqrt((matrix ** 2).sum(1).reshape(documents.size, 1)))
-    elsif nmatrix?
-      # @see https://github.com/SciRuby/nmatrix/issues/38
-      (0...matrix.shape[1]).each do |j|
-        # @see https://github.com/SciRuby/nmatrix/pull/46
-        column = matrix.column(j)
-        norm = Math.sqrt(column.transpose.dot(column)[0, 0])
-        (0...m.shape[0]).each do |i|
-          m[i, j] /= norm
-        end
-      end
-      matrix.cast :yale, :float64
-    else
-      Matrix.columns matrix.column_vectors.map(&:normalize)
-    end
-  end
-private
-  # @return a matrix
-  def initialize_matrix
-    if gsl?
-      GSL::Matrix.alloc terms.size, documents.size
-    elsif narray?
-      NArray.float documents.size, terms.size
-    elsif nmatrix?
-      NMatrix.new(:list, [terms.size, documents.size], :float64)
-    end
-  end
-  # @return [Boolean] whether to use the GSL gem
-  def gsl?
-    @gsl     ||= Object.const_defined?(:GSL)
-  end
-  # @return [Boolean] whether to use the NArray gem
-  def narray?
-    @narray  ||= Object.const_defined?(:NArray) && !gsl?
-  end
-  # @return [Boolean] whether to use the NMatrix gem
-  def nmatrix?
-    @nmatrix ||= Object.const_defined?(:NMatrix) && !gsl? && !narray?
-  end
-  # @return [Boolean] whether to use the standard library
-  def stdlib?
-    @matrix  ||= Object.const_defined?(:Matrix)
-  end
-end