tf-idf-similarity 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/.rspec +2 -0
 - data/.travis.yml +13 -9
 - data/.yardopts +0 -1
 - data/Gemfile +3 -3
 - data/LICENSE +1 -1
 - data/README.md +49 -23
 - data/lib/tf-idf-similarity.rb +2 -6
 - data/lib/tf-idf-similarity/bm25_model.rb +1 -1
 - data/lib/tf-idf-similarity/document.rb +1 -1
 - data/lib/tf-idf-similarity/extras/tf_idf_model.rb +1 -1
 - data/lib/tf-idf-similarity/matrix_methods.rb +1 -1
 - data/lib/tf-idf-similarity/token.rb +3 -6
 - data/lib/tf-idf-similarity/version.rb +1 -1
 - data/spec/extras/tf_idf_model_spec.rb +3 -3
 - data/spec/spec_helper.rb +5 -1
 - data/spec/token_spec.rb +8 -0
 - data/td-idf-similarity.gemspec +6 -8
 - metadata +17 -20
 - data/USAGE +0 -1
 - data/ext/mkrf_conf.rb +0 -15
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA1:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 736ca4c4b93d14ea046cbc4bdae930c8b88082be
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 6b43e8356c59e0f48ac08f300186d4e12497368d
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 635ea3047ba54a951020f95ab7e9412adf07a39d6042b85e605fcd0517345d506690bac11ab05f7f20e16f80106e95e8002fd1ae2ab4e466a27cc4f143ac15d6
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: 693ac6c70f9daf3f0a1ed06ba693d170654e7c871702641d598e59a6fc69cbd5316e76441da062aca05d0b8f67c0ba0c958a4115e2c9da17316a2ebef2190738
         
     | 
    
        data/.rspec
    ADDED
    
    
    
        data/.travis.yml
    CHANGED
    
    | 
         @@ -1,21 +1,25 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            sudo: false
         
     | 
| 
       1 
2 
     | 
    
         
             
            language: ruby
         
     | 
| 
      
 3 
     | 
    
         
            +
            cache: bundler
         
     | 
| 
       2 
4 
     | 
    
         
             
            rvm:
         
     | 
| 
       3 
     | 
    
         
            -
              - 1.9.2
         
     | 
| 
       4 
5 
     | 
    
         
             
              - 1.9.3
         
     | 
| 
       5 
6 
     | 
    
         
             
              - 2.0.0
         
     | 
| 
       6 
7 
     | 
    
         
             
              - 2.1.0
         
     | 
| 
      
 8 
     | 
    
         
            +
              - 2.2.0
         
     | 
| 
       7 
9 
     | 
    
         
             
            env:
         
     | 
| 
       8 
10 
     | 
    
         
             
              - MATRIX_LIBRARY=gsl
         
     | 
| 
       9 
11 
     | 
    
         
             
              - MATRIX_LIBRARY=narray
         
     | 
| 
       10 
12 
     | 
    
         
             
              - MATRIX_LIBRARY=nmatrix
         
     | 
| 
       11 
13 
     | 
    
         
             
              - MATRIX_LIBRARY=matrix
         
     | 
| 
      
 14 
     | 
    
         
            +
            addons:
         
     | 
| 
      
 15 
     | 
    
         
            +
              apt:
         
     | 
| 
      
 16 
     | 
    
         
            +
                packages:
         
     | 
| 
      
 17 
     | 
    
         
            +
                - gsl-bin
         
     | 
| 
      
 18 
     | 
    
         
            +
                - libgsl0-dev
         
     | 
| 
      
 19 
     | 
    
         
            +
                # Installing ATLAS will install BLAS.
         
     | 
| 
      
 20 
     | 
    
         
            +
                - libatlas-dev
         
     | 
| 
      
 21 
     | 
    
         
            +
                - libatlas-base-dev
         
     | 
| 
      
 22 
     | 
    
         
            +
                - libatlas3gf-base
         
     | 
| 
       12 
23 
     | 
    
         
             
            before_install:
         
     | 
| 
       13 
24 
     | 
    
         
             
              - bundle config build.nmatrix --with-lapacklib
         
     | 
| 
       14 
     | 
    
         
            -
              -  
     | 
| 
       15 
     | 
    
         
            -
              - if [ $MATRIX_LIBRARY = 'gsl' ]; then sudo apt-get install gsl-bin libgsl0-dev; fi
         
     | 
| 
       16 
     | 
    
         
            -
              # Installing ATLAS will install BLAS.
         
     | 
| 
       17 
     | 
    
         
            -
              - if [ $MATRIX_LIBRARY = 'nmatrix' ]; then sudo apt-get install -qq libatlas-dev libatlas-base-dev libatlas3gf-base; fi
         
     | 
| 
       18 
     | 
    
         
            -
              - if [ $MATRIX_LIBRARY = 'nmatrix' ]; then export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/atlas; fi
         
     | 
| 
       19 
     | 
    
         
            -
            # Travis sometimes runs without Bundler.
         
     | 
| 
       20 
     | 
    
         
            -
            install: bundle
         
     | 
| 
       21 
     | 
    
         
            -
            script: bundle exec rake --trace
         
     | 
| 
      
 25 
     | 
    
         
            +
              - export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/atlas
         
     | 
    
        data/.yardopts
    CHANGED
    
    
    
        data/Gemfile
    CHANGED
    
    | 
         @@ -1,8 +1,8 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            source  
     | 
| 
      
 1 
     | 
    
         
            +
            source 'https://rubygems.org'
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
       3 
     | 
    
         
            -
            gem 'rb-gsl', '~> 1.16.0.2' 
     | 
| 
      
 3 
     | 
    
         
            +
            gem 'rb-gsl', '~> 1.16.0.2' if ENV['MATRIX_LIBRARY'] == 'gsl'
         
     | 
| 
       4 
4 
     | 
    
         
             
            gem 'narray', '~> 0.6.0.0' if ENV['MATRIX_LIBRARY'] == 'narray'
         
     | 
| 
       5 
     | 
    
         
            -
            gem 'nmatrix', '~> 0.1.0.rc5' 
     | 
| 
      
 5 
     | 
    
         
            +
            gem 'nmatrix', '~> 0.1.0.rc5' if ENV['MATRIX_LIBRARY'] == 'nmatrix' && RUBY_VERSION >= '1.9'
         
     | 
| 
       6 
6 
     | 
    
         | 
| 
       7 
7 
     | 
    
         
             
            # Specify your gem's dependencies in the gemspec
         
     | 
| 
       8 
8 
     | 
    
         
             
            gemspec
         
     | 
    
        data/LICENSE
    CHANGED
    
    
    
        data/README.md
    CHANGED
    
    | 
         @@ -1,12 +1,12 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            # Ruby Vector Space Model (VSM) with tf*idf weights
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
       3 
     | 
    
         
            -
            []( 
     | 
| 
       4 
     | 
    
         
            -
            [](https://badge.fury.io/rb/tf-idf-similarity)
         
     | 
| 
      
 4 
     | 
    
         
            +
            [](https://travis-ci.org/jpmckinney/tf-idf-similarity)
         
     | 
| 
      
 5 
     | 
    
         
            +
            [](https://gemnasium.com/jpmckinney/tf-idf-similarity)
         
     | 
| 
      
 6 
     | 
    
         
            +
            [](https://coveralls.io/r/jpmckinney/tf-idf-similarity)
         
     | 
| 
      
 7 
     | 
    
         
            +
            [](https://codeclimate.com/github/jpmckinney/tf-idf-similarity)
         
     | 
| 
       8 
8 
     | 
    
         | 
| 
       9 
     | 
    
         
            -
            Calculates the similarity between texts using a [bag-of-words]( 
     | 
| 
      
 9 
     | 
    
         
            +
            Calculates the similarity between texts using a [bag-of-words](https://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](https://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf*idf)](https://en.wikipedia.org/wiki/Tf–idf) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) (see below).
         
     | 
| 
       10 
10 
     | 
    
         | 
| 
       11 
11 
     | 
    
         
             
            ## Usage
         
     | 
| 
       12 
12 
     | 
    
         | 
| 
         @@ -24,13 +24,13 @@ document3 = TfIdfSimilarity::Document.new("Nam scelerisque dui sed leo...") 
     | 
|
| 
       24 
24 
     | 
    
         
             
            corpus = [document1, document2, document3]
         
     | 
| 
       25 
25 
     | 
    
         
             
            ```
         
     | 
| 
       26 
26 
     | 
    
         | 
| 
       27 
     | 
    
         
            -
            Create a document-term matrix using [Term Frequency-Inverse Document Frequency function]( 
     | 
| 
      
 27 
     | 
    
         
            +
            Create a document-term matrix using [Term Frequency-Inverse Document Frequency function](https://en.wikipedia.org/wiki/Tf–idf):
         
     | 
| 
       28 
28 
     | 
    
         | 
| 
       29 
29 
     | 
    
         
             
            ```ruby
         
     | 
| 
       30 
30 
     | 
    
         
             
            model = TfIdfSimilarity::TfIdfModel.new(corpus)
         
     | 
| 
       31 
31 
     | 
    
         
             
            ```
         
     | 
| 
       32 
32 
     | 
    
         | 
| 
       33 
     | 
    
         
            -
            Or, create a document-term matrix using the [Okapi BM25 ranking function]( 
     | 
| 
      
 33 
     | 
    
         
            +
            Or, create a document-term matrix using the [Okapi BM25 ranking function](https://en.wikipedia.org/wiki/Okapi_BM25):
         
     | 
| 
       34 
34 
     | 
    
         | 
| 
       35 
35 
     | 
    
         
             
            ```ruby
         
     | 
| 
       36 
36 
     | 
    
         
             
            model = TfIdfSimilarity::BM25Model.new(corpus)
         
     | 
| 
         @@ -58,16 +58,46 @@ end 
     | 
|
| 
       58 
58 
     | 
    
         
             
            puts tfidf_by_term.sort_by{|_,tfidf| -tfidf}
         
     | 
| 
       59 
59 
     | 
    
         
             
            ```
         
     | 
| 
       60 
60 
     | 
    
         | 
| 
      
 61 
     | 
    
         
            +
            Tokenize a document yourself, for example by excluding stop words:
         
     | 
| 
      
 62 
     | 
    
         
            +
             
     | 
| 
      
 63 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 64 
     | 
    
         
            +
            require 'unicode_utils'
         
     | 
| 
      
 65 
     | 
    
         
            +
            text = "Lorem ipsum dolor sit amet..."
         
     | 
| 
      
 66 
     | 
    
         
            +
            tokens = UnicodeUtils.each_word(text).to_a - ['and', 'the', 'to']
         
     | 
| 
      
 67 
     | 
    
         
            +
            document1 = TfIdfSimilarity::Document.new(text, :tokens => tokens)
         
     | 
| 
      
 68 
     | 
    
         
            +
            ```
         
     | 
| 
      
 69 
     | 
    
         
            +
             
     | 
| 
      
 70 
     | 
    
         
            +
            Provide, by yourself, the number of times each term appears and the number of tokens in the document:
         
     | 
| 
      
 71 
     | 
    
         
            +
             
     | 
| 
      
 72 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 73 
     | 
    
         
            +
            require 'unicode_utils'
         
     | 
| 
      
 74 
     | 
    
         
            +
            text = "Lorem ipsum dolor sit amet..."
         
     | 
| 
      
 75 
     | 
    
         
            +
            tokens = UnicodeUtils.each_word(text).to_a - ['and', 'the', 'to']
         
     | 
| 
      
 76 
     | 
    
         
            +
            term_counts = Hash.new(0)
         
     | 
| 
      
 77 
     | 
    
         
            +
            size = 0
         
     | 
| 
      
 78 
     | 
    
         
            +
            tokens.each do |token|
         
     | 
| 
      
 79 
     | 
    
         
            +
              # Unless the token is numeric.
         
     | 
| 
      
 80 
     | 
    
         
            +
              unless token[/\A\d+\z/]
         
     | 
| 
      
 81 
     | 
    
         
            +
                # Remove all punctuation from tokens.
         
     | 
| 
      
 82 
     | 
    
         
            +
                term_counts[token.gsub(/\p{Punct}/, '')] += 1
         
     | 
| 
      
 83 
     | 
    
         
            +
                size += 1
         
     | 
| 
      
 84 
     | 
    
         
            +
              end
         
     | 
| 
      
 85 
     | 
    
         
            +
            end
         
     | 
| 
      
 86 
     | 
    
         
            +
            document1 = TfIdfSimilarity::Document.new(text, :term_counts => term_counts, :size => size)
         
     | 
| 
      
 87 
     | 
    
         
            +
            ```
         
     | 
| 
      
 88 
     | 
    
         
            +
             
     | 
| 
       61 
89 
     | 
    
         
             
            [Read the documentation at RubyDoc.info.](http://rubydoc.info/gems/tf-idf-similarity)
         
     | 
| 
       62 
90 
     | 
    
         | 
| 
       63 
91 
     | 
    
         
             
            ## Speed
         
     | 
| 
       64 
92 
     | 
    
         | 
| 
       65 
93 
     | 
    
         
             
            Instead of using the Ruby Standard Library's [Matrix](http://www.ruby-doc.org/stdlib-2.0/libdoc/matrix/rdoc/Matrix.html) class, you can use one of the [GNU Scientific Library (GSL)](http://www.gnu.org/software/gsl/), [NArray](http://narray.rubyforge.org/) or [NMatrix](https://github.com/SciRuby/nmatrix) (0.0.9 or greater) gems for faster matrix operations. For example:
         
     | 
| 
       66 
94 
     | 
    
         | 
| 
       67 
     | 
    
         
            -
                require ' 
     | 
| 
       68 
     | 
    
         
            -
                model = TfIdfSimilarity::TfIdfModel.new(corpus, :library => : 
     | 
| 
      
 95 
     | 
    
         
            +
                require 'narray'
         
     | 
| 
      
 96 
     | 
    
         
            +
                model = TfIdfSimilarity::TfIdfModel.new(corpus, :library => :narray)
         
     | 
| 
       69 
97 
     | 
    
         | 
| 
       70 
     | 
    
         
            -
             
     | 
| 
      
 98 
     | 
    
         
            +
            NArray seems to have the best performance of the three libraries.
         
     | 
| 
      
 99 
     | 
    
         
            +
             
     | 
| 
      
 100 
     | 
    
         
            +
            The NMatrix gem gives access to [Automatically Tuned Linear Algebra Software (ATLAS)](http://math-atlas.sourceforge.net/), which you may know of through [Linear Algebra PACKage (LAPACK)](http://www.netlib.org/lapack/) or [Basic Linear Algebra Subprograms (BLAS)](http://www.netlib.org/blas/). Follow [these instructions](https://github.com/SciRuby/nmatrix#installation) to install the NMatrix gem.
         
     | 
| 
       71 
101 
     | 
    
         | 
| 
       72 
102 
     | 
    
         
             
            ## Extras
         
     | 
| 
       73 
103 
     | 
    
         | 
| 
         @@ -76,7 +106,7 @@ You can access more term frequency, document frequency, and normalization formul 
     | 
|
| 
       76 
106 
     | 
    
         
             
                require 'tf-idf-similarity/extras/document'
         
     | 
| 
       77 
107 
     | 
    
         
             
                require 'tf-idf-similarity/extras/tf_idf_model'
         
     | 
| 
       78 
108 
     | 
    
         | 
| 
       79 
     | 
    
         
            -
            The default tf*idf formula follows the [Lucene Conceptual Scoring Formula](http://lucene.apache.org/core/4_0_0 
     | 
| 
      
 109 
     | 
    
         
            +
            The default tf*idf formula follows the [Lucene Conceptual Scoring Formula](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html).
         
     | 
| 
       80 
110 
     | 
    
         | 
| 
       81 
111 
     | 
    
         
             
            ## Why?
         
     | 
| 
       82 
112 
     | 
    
         | 
| 
         @@ -115,17 +145,13 @@ Adapters for the following projects were also considered: 
     | 
|
| 
       115 
145 
     | 
    
         | 
| 
       116 
146 
     | 
    
         
             
            ## Further Reading
         
     | 
| 
       117 
147 
     | 
    
         | 
| 
       118 
     | 
    
         
            -
            Lucene implements many more [similarity functions](http://lucene.apache.org/core/4_0_0 
     | 
| 
       119 
     | 
    
         
            -
             
     | 
| 
       120 
     | 
    
         
            -
            * a [divergence from randomness (DFR) framework](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/DFRSimilarity.html)
         
     | 
| 
       121 
     | 
    
         
            -
            * a [framework for the family of information-based models](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/IBSimilarity.html)
         
     | 
| 
       122 
     | 
    
         
            -
            * a [language model with Bayesian smoothing using Dirichlet priors](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/LMDirichletSimilarity.html)
         
     | 
| 
       123 
     | 
    
         
            -
            * a [language model with Jelinek-Mercer smoothing](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.html)
         
     | 
| 
       124 
     | 
    
         
            -
             
     | 
| 
       125 
     | 
    
         
            -
            Lucene can even [combine similarity measures](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/MultiSimilarity.html).
         
     | 
| 
      
 148 
     | 
    
         
            +
            Lucene implements many more [similarity functions](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/Similarity.html), such as:
         
     | 
| 
       126 
149 
     | 
    
         | 
| 
       127 
     | 
    
         
            -
             
     | 
| 
      
 150 
     | 
    
         
            +
            * a [divergence from randomness (DFR) framework](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/DFRSimilarity.html)
         
     | 
| 
      
 151 
     | 
    
         
            +
            * a [framework for the family of information-based models](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/IBSimilarity.html)
         
     | 
| 
      
 152 
     | 
    
         
            +
            * a [language model with Bayesian smoothing using Dirichlet priors](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/LMDirichletSimilarity.html)
         
     | 
| 
      
 153 
     | 
    
         
            +
            * a [language model with Jelinek-Mercer smoothing](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.html)
         
     | 
| 
       128 
154 
     | 
    
         | 
| 
       129 
     | 
    
         
            -
             
     | 
| 
      
 155 
     | 
    
         
            +
            Lucene can even [combine similarity measures](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/MultiSimilarity.html).
         
     | 
| 
       130 
156 
     | 
    
         | 
| 
       131 
     | 
    
         
            -
            Copyright (c) 2012  
     | 
| 
      
 157 
     | 
    
         
            +
            Copyright (c) 2012 James McKinney, released under the MIT license
         
     | 
    
        data/lib/tf-idf-similarity.rb
    CHANGED
    
    | 
         @@ -1,12 +1,8 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            require 'forwardable'
         
     | 
| 
       2 
2 
     | 
    
         
             
            require 'set'
         
     | 
| 
       3 
3 
     | 
    
         | 
| 
       4 
     | 
    
         
            -
             
     | 
| 
       5 
     | 
    
         
            -
             
     | 
| 
       6 
     | 
    
         
            -
              require 'unicode_utils/each_word'
         
     | 
| 
       7 
     | 
    
         
            -
            rescue LoadError
         
     | 
| 
       8 
     | 
    
         
            -
              # Ruby 1.8
         
     | 
| 
       9 
     | 
    
         
            -
            end
         
     | 
| 
      
 4 
     | 
    
         
            +
            require 'unicode_utils/downcase'
         
     | 
| 
      
 5 
     | 
    
         
            +
            require 'unicode_utils/each_word'
         
     | 
| 
       10 
6 
     | 
    
         | 
| 
       11 
7 
     | 
    
         
             
            module TfIdfSimilarity
         
     | 
| 
       12 
8 
     | 
    
         
             
            end
         
     | 
| 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            # A document-term matrix using the BM25 function.
         
     | 
| 
       2 
2 
     | 
    
         
             
            #
         
     | 
| 
       3 
3 
     | 
    
         
             
            # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/BM25Similarity.html
         
     | 
| 
       4 
     | 
    
         
            -
            # @see  
     | 
| 
      
 4 
     | 
    
         
            +
            # @see https://en.wikipedia.org/wiki/Okapi_BM25
         
     | 
| 
       5 
5 
     | 
    
         
             
            module TfIdfSimilarity
         
     | 
| 
       6 
6 
     | 
    
         
             
              class BM25Model < Model
         
     | 
| 
       7 
7 
     | 
    
         
             
                # Return the term's inverse document frequency.
         
     | 
| 
         @@ -76,7 +76,7 @@ module TfIdfSimilarity 
     | 
|
| 
       76 
76 
     | 
    
         
             
                # @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
         
     | 
| 
       77 
77 
     | 
    
         
             
                # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
         
     | 
| 
       78 
78 
     | 
    
         
             
                def tokenize(text)
         
     | 
| 
       79 
     | 
    
         
            -
                  @tokens ||  
     | 
| 
      
 79 
     | 
    
         
            +
                  @tokens || UnicodeUtils.each_word(text)
         
     | 
| 
       80 
80 
     | 
    
         
             
                end
         
     | 
| 
       81 
81 
     | 
    
         
             
              end
         
     | 
| 
       82 
82 
     | 
    
         
             
            end
         
     | 
| 
         @@ -110,7 +110,7 @@ module TfIdfSimilarity 
     | 
|
| 
       110 
110 
     | 
    
         
             
                end
         
     | 
| 
       111 
111 
     | 
    
         
             
                alias_method :binary_tf, :binary_term_frequency
         
     | 
| 
       112 
112 
     | 
    
         | 
| 
       113 
     | 
    
         
            -
                # @see  
     | 
| 
      
 113 
     | 
    
         
            +
                # @see https://en.wikipedia.org/wiki/Tf*idf
         
     | 
| 
       114 
114 
     | 
    
         
             
                # @see http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
         
     | 
| 
       115 
115 
     | 
    
         
             
                def normalized_term_frequency(document, term, a = 0)
         
     | 
| 
       116 
116 
     | 
    
         
             
                  a + (1 - a) * document.term_count(term) / document.maximum_term_count
         
     | 
| 
         @@ -9,7 +9,7 @@ 
     | 
|
| 
       9 
9 
     | 
    
         
             
            # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.WordDelimiterFilterFactory
         
     | 
| 
       10 
10 
     | 
    
         
             
            # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.SynonymFilterFactory
         
     | 
| 
       11 
11 
     | 
    
         
             
            module TfIdfSimilarity
         
     | 
| 
       12 
     | 
    
         
            -
              class Token <  
     | 
| 
      
 12 
     | 
    
         
            +
              class Token < SimpleDelegator
         
     | 
| 
       13 
13 
     | 
    
         
             
                # Returns a falsy value if all its characters are numbers, punctuation,
         
     | 
| 
       14 
14 
     | 
    
         
             
                # whitespace or control characters.
         
     | 
| 
       15 
15 
     | 
    
         
             
                #
         
     | 
| 
         @@ -35,10 +35,7 @@ module TfIdfSimilarity 
     | 
|
| 
       35 
35 
     | 
    
         
             
                #
         
     | 
| 
       36 
36 
     | 
    
         
             
                # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.LowerCaseFilterFactory
         
     | 
| 
       37 
37 
     | 
    
         
             
                def lowercase_filter
         
     | 
| 
       38 
     | 
    
         
            -
                  self.class.new( 
     | 
| 
       39 
     | 
    
         
            -
                    "ÀÁÂÃÄÅĀĂĄÇĆĈĊČÐĎĐÈÉÊËĒĔĖĘĚĜĞĠĢĤĦÌÍÎÏĨĪĬĮĴĶĹĻĽĿŁÑŃŅŇŊÒÓÔÕÖØŌŎŐŔŖŘŚŜŞŠŢŤŦÙÚÛÜŨŪŬŮŰŲŴÝŶŸŹŻŽ",
         
     | 
| 
       40 
     | 
    
         
            -
                    "àáâãäåāăąçćĉċčðďđèéêëēĕėęěĝğġģĥħìíîïĩīĭįĵķĺļľŀłñńņňŋòóôõöøōŏőŕŗřśŝşšţťŧùúûüũūŭůűųŵýŷÿźżž"
         
     | 
| 
       41 
     | 
    
         
            -
                  ).downcase)
         
     | 
| 
      
 38 
     | 
    
         
            +
                  self.class.new(UnicodeUtils.downcase(self))
         
     | 
| 
       42 
39 
     | 
    
         
             
                end
         
     | 
| 
       43 
40 
     | 
    
         | 
| 
       44 
41 
     | 
    
         
             
                # Returns a string with no English possessive or periods in acronyms.
         
     | 
| 
         @@ -47,7 +44,7 @@ module TfIdfSimilarity 
     | 
|
| 
       47 
44 
     | 
    
         
             
                #
         
     | 
| 
       48 
45 
     | 
    
         
             
                # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
         
     | 
| 
       49 
46 
     | 
    
         
             
                def classic_filter
         
     | 
| 
       50 
     | 
    
         
            -
                  self.class.new(self.gsub('.', ''). 
     | 
| 
      
 47 
     | 
    
         
            +
                  self.class.new(self.gsub('.', '').sub(/['`’]s\z/, ''))
         
     | 
| 
       51 
48 
     | 
    
         
             
                end
         
     | 
| 
       52 
49 
     | 
    
         
             
              end
         
     | 
| 
       53 
50 
     | 
    
         
             
            end
         
     | 
| 
         @@ -33,7 +33,7 @@ module TfIdfSimilarity 
     | 
|
| 
       33 
33 
     | 
    
         
             
                    build_model(documents)
         
     | 
| 
       34 
34 
     | 
    
         
             
                  end
         
     | 
| 
       35 
35 
     | 
    
         | 
| 
       36 
     | 
    
         
            -
                   
     | 
| 
      
 36 
     | 
    
         
            +
                  skip "Add #search"
         
     | 
| 
       37 
37 
     | 
    
         
             
                end
         
     | 
| 
       38 
38 
     | 
    
         | 
| 
       39 
39 
     | 
    
         
             
                # @see https://github.com/bbcrd/Similarity/blob/master/test/test_corpus.rb
         
     | 
| 
         @@ -98,7 +98,7 @@ module TfIdfSimilarity 
     | 
|
| 
       98 
98 
     | 
    
         
             
                  end
         
     | 
| 
       99 
99 
     | 
    
         | 
| 
       100 
100 
     | 
    
         
             
                  it 'should return the similarity matrix' do
         
     | 
| 
       101 
     | 
    
         
            -
                     
     | 
| 
      
 101 
     | 
    
         
            +
                    skip "Calculate the tf*idf matrix like the similarity gem does"
         
     | 
| 
       102 
102 
     | 
    
         
             
                  end
         
     | 
| 
       103 
103 
     | 
    
         | 
| 
       104 
104 
     | 
    
         
             
                  it 'should return the number of documents in which a term appears' do
         
     | 
| 
         @@ -113,7 +113,7 @@ module TfIdfSimilarity 
     | 
|
| 
       113 
113 
     | 
    
         
             
                  end
         
     | 
| 
       114 
114 
     | 
    
         | 
| 
       115 
115 
     | 
    
         
             
                  it 'should return the document vector' do
         
     | 
| 
       116 
     | 
    
         
            -
                     
     | 
| 
      
 116 
     | 
    
         
            +
                    skip "Calculate the tf*idf matrix like the similarity gem does"
         
     | 
| 
       117 
117 
     | 
    
         
             
                  end
         
     | 
| 
       118 
118 
     | 
    
         
             
                end
         
     | 
| 
       119 
119 
     | 
    
         | 
    
        data/spec/spec_helper.rb
    CHANGED
    
    | 
         @@ -1,7 +1,11 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            require 'rubygems'
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
      
 3 
     | 
    
         
            +
            require 'simplecov'
         
     | 
| 
       3 
4 
     | 
    
         
             
            require 'coveralls'
         
     | 
| 
       4 
     | 
    
         
            -
            Coveralls 
     | 
| 
      
 5 
     | 
    
         
            +
            SimpleCov.formatter = Coveralls::SimpleCov::Formatter
         
     | 
| 
      
 6 
     | 
    
         
            +
            SimpleCov.start do
         
     | 
| 
      
 7 
     | 
    
         
            +
              add_filter 'spec'
         
     | 
| 
      
 8 
     | 
    
         
            +
            end
         
     | 
| 
       5 
9 
     | 
    
         | 
| 
       6 
10 
     | 
    
         
             
            require 'rspec'
         
     | 
| 
       7 
11 
     | 
    
         
             
            require File.dirname(__FILE__) + '/../lib/tf-idf-similarity'
         
     | 
    
        data/spec/token_spec.rb
    CHANGED
    
    | 
         @@ -28,6 +28,14 @@ module TfIdfSimilarity 
     | 
|
| 
       28 
28 
     | 
    
         
             
                    Token.new("foo's").classic_filter.should == 'foo'
         
     | 
| 
       29 
29 
     | 
    
         
             
                  end
         
     | 
| 
       30 
30 
     | 
    
         | 
| 
      
 31 
     | 
    
         
            +
                  it 'should remove ending possessives with nonstandard apostrophe 1' do
         
     | 
| 
      
 32 
     | 
    
         
            +
                    Token.new("foo`s").classic_filter.should == 'foo'
         
     | 
| 
      
 33 
     | 
    
         
            +
                  end
         
     | 
| 
      
 34 
     | 
    
         
            +
             
     | 
| 
      
 35 
     | 
    
         
            +
                  it 'should remove ending possessives with nonstandard apostrophe 2' do
         
     | 
| 
      
 36 
     | 
    
         
            +
                    Token.new("foo’s").classic_filter.should == 'foo'
         
     | 
| 
      
 37 
     | 
    
         
            +
                  end
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
       31 
39 
     | 
    
         
             
                  it 'should not remove infix possessives' do
         
     | 
| 
       32 
40 
     | 
    
         
             
                    Token.new("foo's bar").classic_filter.should == "foo's bar"
         
     | 
| 
       33 
41 
     | 
    
         
             
                  end
         
     | 
    
        data/td-idf-similarity.gemspec
    CHANGED
    
    | 
         @@ -5,9 +5,8 @@ Gem::Specification.new do |s| 
     | 
|
| 
       5 
5 
     | 
    
         
             
              s.name        = "tf-idf-similarity"
         
     | 
| 
       6 
6 
     | 
    
         
             
              s.version     = TfIdfSimilarity::VERSION
         
     | 
| 
       7 
7 
     | 
    
         
             
              s.platform    = Gem::Platform::RUBY
         
     | 
| 
       8 
     | 
    
         
            -
              s.authors     = [" 
     | 
| 
       9 
     | 
    
         
            -
              s. 
     | 
| 
       10 
     | 
    
         
            -
              s.homepage    = "http://github.com/opennorth/tf-idf-similarity"
         
     | 
| 
      
 8 
     | 
    
         
            +
              s.authors     = ["James McKinney"]
         
     | 
| 
      
 9 
     | 
    
         
            +
              s.homepage    = "https://github.com/jpmckinney/tf-idf-similarity"
         
     | 
| 
       11 
10 
     | 
    
         
             
              s.summary     = %q{Calculates the similarity between texts using tf*idf}
         
     | 
| 
       12 
11 
     | 
    
         
             
              s.license     = 'MIT'
         
     | 
| 
       13 
12 
     | 
    
         | 
| 
         @@ -16,10 +15,9 @@ Gem::Specification.new do |s| 
     | 
|
| 
       16 
15 
     | 
    
         
             
              s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
         
     | 
| 
       17 
16 
     | 
    
         
             
              s.require_paths = ["lib"]
         
     | 
| 
       18 
17 
     | 
    
         | 
| 
       19 
     | 
    
         
            -
              s. 
     | 
| 
       20 
     | 
    
         
            -
              s.add_development_dependency('rake')
         
     | 
| 
       21 
     | 
    
         
            -
              s.add_development_dependency('coveralls')
         
     | 
| 
       22 
     | 
    
         
            -
              s.add_development_dependency('mime-types', '~> 1.25') # 2.0 requires Ruby 1.9.2
         
     | 
| 
      
 18 
     | 
    
         
            +
              s.add_runtime_dependency('unicode_utils', '~> 1.4')
         
     | 
| 
       23 
19 
     | 
    
         | 
| 
       24 
     | 
    
         
            -
              s. 
     | 
| 
      
 20 
     | 
    
         
            +
              s.add_development_dependency('coveralls')
         
     | 
| 
      
 21 
     | 
    
         
            +
              s.add_development_dependency('rake')
         
     | 
| 
      
 22 
     | 
    
         
            +
              s.add_development_dependency('rspec', '~> 2.10')
         
     | 
| 
       25 
23 
     | 
    
         
             
            end
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,31 +1,31 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: tf-idf-similarity
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.1. 
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.1.5
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
     | 
    
         
            -
            -  
     | 
| 
      
 7 
     | 
    
         
            +
            - James McKinney
         
     | 
| 
       8 
8 
     | 
    
         
             
            autorequire: 
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date:  
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2016-01-18 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies:
         
     | 
| 
       13 
13 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       14 
     | 
    
         
            -
              name:  
     | 
| 
      
 14 
     | 
    
         
            +
              name: unicode_utils
         
     | 
| 
       15 
15 
     | 
    
         
             
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
       16 
16 
     | 
    
         
             
                requirements:
         
     | 
| 
       17 
17 
     | 
    
         
             
                - - "~>"
         
     | 
| 
       18 
18 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       19 
     | 
    
         
            -
                    version: ' 
     | 
| 
       20 
     | 
    
         
            -
              type: : 
     | 
| 
      
 19 
     | 
    
         
            +
                    version: '1.4'
         
     | 
| 
      
 20 
     | 
    
         
            +
              type: :runtime
         
     | 
| 
       21 
21 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       22 
22 
     | 
    
         
             
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
       23 
23 
     | 
    
         
             
                requirements:
         
     | 
| 
       24 
24 
     | 
    
         
             
                - - "~>"
         
     | 
| 
       25 
25 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       26 
     | 
    
         
            -
                    version: ' 
     | 
| 
      
 26 
     | 
    
         
            +
                    version: '1.4'
         
     | 
| 
       27 
27 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       28 
     | 
    
         
            -
              name:  
     | 
| 
      
 28 
     | 
    
         
            +
              name: coveralls
         
     | 
| 
       29 
29 
     | 
    
         
             
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
       30 
30 
     | 
    
         
             
                requirements:
         
     | 
| 
       31 
31 
     | 
    
         
             
                - - ">="
         
     | 
| 
         @@ -39,7 +39,7 @@ dependencies: 
     | 
|
| 
       39 
39 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       40 
40 
     | 
    
         
             
                    version: '0'
         
     | 
| 
       41 
41 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       42 
     | 
    
         
            -
              name:  
     | 
| 
      
 42 
     | 
    
         
            +
              name: rake
         
     | 
| 
       43 
43 
     | 
    
         
             
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
       44 
44 
     | 
    
         
             
                requirements:
         
     | 
| 
       45 
45 
     | 
    
         
             
                - - ">="
         
     | 
| 
         @@ -53,36 +53,33 @@ dependencies: 
     | 
|
| 
       53 
53 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       54 
54 
     | 
    
         
             
                    version: '0'
         
     | 
| 
       55 
55 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       56 
     | 
    
         
            -
              name:  
     | 
| 
      
 56 
     | 
    
         
            +
              name: rspec
         
     | 
| 
       57 
57 
     | 
    
         
             
              requirement: !ruby/object:Gem::Requirement
         
     | 
| 
       58 
58 
     | 
    
         
             
                requirements:
         
     | 
| 
       59 
59 
     | 
    
         
             
                - - "~>"
         
     | 
| 
       60 
60 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       61 
     | 
    
         
            -
                    version: ' 
     | 
| 
      
 61 
     | 
    
         
            +
                    version: '2.10'
         
     | 
| 
       62 
62 
     | 
    
         
             
              type: :development
         
     | 
| 
       63 
63 
     | 
    
         
             
              prerelease: false
         
     | 
| 
       64 
64 
     | 
    
         
             
              version_requirements: !ruby/object:Gem::Requirement
         
     | 
| 
       65 
65 
     | 
    
         
             
                requirements:
         
     | 
| 
       66 
66 
     | 
    
         
             
                - - "~>"
         
     | 
| 
       67 
67 
     | 
    
         
             
                  - !ruby/object:Gem::Version
         
     | 
| 
       68 
     | 
    
         
            -
                    version: ' 
     | 
| 
      
 68 
     | 
    
         
            +
                    version: '2.10'
         
     | 
| 
       69 
69 
     | 
    
         
             
            description: 
         
     | 
| 
       70 
     | 
    
         
            -
            email:
         
     | 
| 
       71 
     | 
    
         
            -
            - info@opennorth.ca
         
     | 
| 
      
 70 
     | 
    
         
            +
            email: 
         
     | 
| 
       72 
71 
     | 
    
         
             
            executables: []
         
     | 
| 
       73 
     | 
    
         
            -
            extensions:
         
     | 
| 
       74 
     | 
    
         
            -
            - ext/mkrf_conf.rb
         
     | 
| 
      
 72 
     | 
    
         
            +
            extensions: []
         
     | 
| 
       75 
73 
     | 
    
         
             
            extra_rdoc_files: []
         
     | 
| 
       76 
74 
     | 
    
         
             
            files:
         
     | 
| 
       77 
75 
     | 
    
         
             
            - ".gitignore"
         
     | 
| 
      
 76 
     | 
    
         
            +
            - ".rspec"
         
     | 
| 
       78 
77 
     | 
    
         
             
            - ".travis.yml"
         
     | 
| 
       79 
78 
     | 
    
         
             
            - ".yardopts"
         
     | 
| 
       80 
79 
     | 
    
         
             
            - Gemfile
         
     | 
| 
       81 
80 
     | 
    
         
             
            - LICENSE
         
     | 
| 
       82 
81 
     | 
    
         
             
            - README.md
         
     | 
| 
       83 
82 
     | 
    
         
             
            - Rakefile
         
     | 
| 
       84 
     | 
    
         
            -
            - USAGE
         
     | 
| 
       85 
     | 
    
         
            -
            - ext/mkrf_conf.rb
         
     | 
| 
       86 
83 
     | 
    
         
             
            - lib/tf-idf-similarity.rb
         
     | 
| 
       87 
84 
     | 
    
         
             
            - lib/tf-idf-similarity/bm25_model.rb
         
     | 
| 
       88 
85 
     | 
    
         
             
            - lib/tf-idf-similarity/document.rb
         
     | 
| 
         @@ -102,7 +99,7 @@ files: 
     | 
|
| 
       102 
99 
     | 
    
         
             
            - spec/tf_idf_model_spec.rb
         
     | 
| 
       103 
100 
     | 
    
         
             
            - spec/token_spec.rb
         
     | 
| 
       104 
101 
     | 
    
         
             
            - td-idf-similarity.gemspec
         
     | 
| 
       105 
     | 
    
         
            -
            homepage:  
     | 
| 
      
 102 
     | 
    
         
            +
            homepage: https://github.com/jpmckinney/tf-idf-similarity
         
     | 
| 
       106 
103 
     | 
    
         
             
            licenses:
         
     | 
| 
       107 
104 
     | 
    
         
             
            - MIT
         
     | 
| 
       108 
105 
     | 
    
         
             
            metadata: {}
         
     | 
| 
         @@ -122,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement 
     | 
|
| 
       122 
119 
     | 
    
         
             
                  version: '0'
         
     | 
| 
       123 
120 
     | 
    
         
             
            requirements: []
         
     | 
| 
       124 
121 
     | 
    
         
             
            rubyforge_project: 
         
     | 
| 
       125 
     | 
    
         
            -
            rubygems_version: 2. 
     | 
| 
      
 122 
     | 
    
         
            +
            rubygems_version: 2.4.5
         
     | 
| 
       126 
123 
     | 
    
         
             
            signing_key: 
         
     | 
| 
       127 
124 
     | 
    
         
             
            specification_version: 4
         
     | 
| 
       128 
125 
     | 
    
         
             
            summary: Calculates the similarity between texts using tf*idf
         
     | 
    
        data/USAGE
    DELETED
    
    | 
         @@ -1 +0,0 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            See README.md for full usage details.
         
     | 
    
        data/ext/mkrf_conf.rb
    DELETED
    
    | 
         @@ -1,15 +0,0 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            # @see http://www.programmersparadox.com/2012/05/21/gemspec-loading-dependent-gems-based-on-the-users-system/
         
     | 
| 
       2 
     | 
    
         
            -
            require 'rubygems/dependency_installer.rb'
         
     | 
| 
       3 
     | 
    
         
            -
             
     | 
| 
       4 
     | 
    
         
            -
            installer = Gem::DependencyInstaller.new
         
     | 
| 
       5 
     | 
    
         
            -
            begin
         
     | 
| 
       6 
     | 
    
         
            -
              unless RUBY_VERSION < '1.9'
         
     | 
| 
       7 
     | 
    
         
            -
                installer.install('unicode_utils', '>=0')
         
     | 
| 
       8 
     | 
    
         
            -
              end
         
     | 
| 
       9 
     | 
    
         
            -
            rescue
         
     | 
| 
       10 
     | 
    
         
            -
              exit(1)
         
     | 
| 
       11 
     | 
    
         
            -
            end
         
     | 
| 
       12 
     | 
    
         
            -
             
     | 
| 
       13 
     | 
    
         
            -
            f = File.open(File.join(File.dirname(__FILE__), "Rakefile"), "w")
         
     | 
| 
       14 
     | 
    
         
            -
            f.write("task :default\n")
         
     | 
| 
       15 
     | 
    
         
            -
            f.close
         
     |