tf-idf-similarity 0.1.6 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
 - data/.gitignore +1 -0
 - data/.travis.yml +1 -1
 - data/README.md +7 -6
 - data/lib/tf-idf-similarity.rb +0 -3
 - data/lib/tf-idf-similarity/document.rb +7 -5
 - data/lib/tf-idf-similarity/token.rb +7 -0
 - data/lib/tf-idf-similarity/tokenizer.rb +19 -0
 - data/lib/tf-idf-similarity/version.rb +1 -1
 - metadata +4 -3
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
     | 
    
         
            -
             
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 2 
     | 
    
         
            +
            SHA256:
         
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: 605ac457508eaf64a7e583e8a4a71af231d3d9d2f9c30ee82b25fb9f647d1312
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: f24b89dccdcbef3c4fcaa59d15050f064455859c134c550fd6a432346883eb31
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: a41195c6543dea206baa8ce3e2095437d1df94fabedcc76a8151fa5af5991524d96530710a7216c1fef48a7008f88a43773ce2a2323afa563fa29f5abed9909c
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: aadbb85d6bd74625088d0aa7cb58b4127337d5c1dcc2af13c22664f1562013c59d79d8b3bcc3564a2861dfd968d39770205d3b401114e8bdf870b2ac412fda26
         
     | 
    
        data/.gitignore
    CHANGED
    
    
    
        data/.travis.yml
    CHANGED
    
    | 
         @@ -18,7 +18,7 @@ addons: 
     | 
|
| 
       18 
18 
     | 
    
         
             
                # Installing ATLAS will install BLAS.
         
     | 
| 
       19 
19 
     | 
    
         
             
                - libatlas-dev
         
     | 
| 
       20 
20 
     | 
    
         
             
                - libatlas-base-dev
         
     | 
| 
       21 
     | 
    
         
            -
                -  
     | 
| 
      
 21 
     | 
    
         
            +
                - libatlas3-base
         
     | 
| 
       22 
22 
     | 
    
         
             
            before_install:
         
     | 
| 
       23 
23 
     | 
    
         
             
              - bundle config build.nmatrix --with-lapacklib
         
     | 
| 
       24 
24 
     | 
    
         
             
              - export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/atlas
         
     | 
    
        data/README.md
    CHANGED
    
    | 
         @@ -1,12 +1,11 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            # Ruby Vector Space Model (VSM) with tf*idf weights
     | 
| 
      
 1 
     | 
    
         
            +
            # Ruby Vector Space Model (VSM) with tf\*idf weights
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            [](https://badge.fury.io/rb/tf-idf-similarity)
         
     | 
| 
       4 
4 
     | 
    
         
             
            [](https://travis-ci.org/jpmckinney/tf-idf-similarity)
         
     | 
| 
       5 
     | 
    
         
            -
            [](https://gemnasium.com/jpmckinney/tf-idf-similarity)
         
     | 
| 
       6 
5 
     | 
    
         
             
            [](https://coveralls.io/r/jpmckinney/tf-idf-similarity)
         
     | 
| 
       7 
6 
     | 
    
         
             
            [](https://codeclimate.com/github/jpmckinney/tf-idf-similarity)
         
     | 
| 
       8 
7 
     | 
    
         | 
| 
       9 
     | 
    
         
            -
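            Calculates the similarity between texts using a [bag-of-words](https://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](https://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf*idf)](https://en.wikipedia.org/wiki/Tf–idf) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) (see below).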
     | 
| 
      
 8 
     | 
    
         
            +
            Calculates the similarity between texts using a [bag-of-words](https://en.wikipedia.org/wiki/Bag_of_words_model) [Vector Space Model](https://en.wikipedia.org/wiki/Vector_space_model) with [Term Frequency-Inverse Document Frequency (tf\*idf)](https://en.wikipedia.org/wiki/Tf–idf) weights. If your use case demands performance, use [Lucene](http://lucene.apache.org/core/) (see below).
         
     | 
| 
       10 
9 
     | 
    
         | 
| 
       11 
10 
     | 
    
         
             
            ## Usage
         
     | 
| 
       12 
11 
     | 
    
         | 
| 
         @@ -48,7 +47,7 @@ Find the similarity of two documents in the matrix: 
     | 
|
| 
       48 
47 
     | 
    
         
             
            matrix[model.document_index(document1), model.document_index(document2)]
         
     | 
| 
       49 
48 
     | 
    
         
             
            ```
         
     | 
| 
       50 
49 
     | 
    
         | 
| 
       51 
     | 
    
         
            -
            Print the tf*idf values for terms in a document:
     | 
| 
      
 50 
     | 
    
         
            +
            Print the tf\*idf values for terms in a document:
         
     | 
| 
       52 
51 
     | 
    
         | 
| 
       53 
52 
     | 
    
         
             
            ```ruby
         
     | 
| 
       54 
53 
     | 
    
         
             
            tfidf_by_term = {}
         
     | 
| 
         @@ -86,6 +85,8 @@ end 
     | 
|
| 
       86 
85 
     | 
    
         
             
            document1 = TfIdfSimilarity::Document.new(text, :term_counts => term_counts, :size => size)
         
     | 
| 
       87 
86 
     | 
    
         
             
            ```
         
     | 
| 
       88 
87 
     | 
    
         | 
| 
      
 88 
     | 
    
         
            +
            Or, use your own classes for the tokenizer and tokens, like in [this example](https://gist.github.com/satoryu/0183a4eba365cc67e28988a09f3035b3).
         
     | 
| 
      
 89 
     | 
    
         
            +
             
     | 
| 
       89 
90 
     | 
    
         
             
            [Read the documentation at RubyDoc.info.](http://rubydoc.info/gems/tf-idf-similarity)
         
     | 
| 
       90 
91 
     | 
    
         | 
| 
       91 
92 
     | 
    
         
             
            ## Troubleshooting
         
     | 
| 
         @@ -114,11 +115,11 @@ You can access more term frequency, document frequency, and normalization formul 
     | 
|
| 
       114 
115 
     | 
    
         
             
                require 'tf-idf-similarity/extras/document'
         
     | 
| 
       115 
116 
     | 
    
         
             
                require 'tf-idf-similarity/extras/tf_idf_model'
         
     | 
| 
       116 
117 
     | 
    
         | 
| 
       117 
     | 
    
         
            -
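            The default tf*idf formula follows the [Lucene Conceptual Scoring Formula](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html).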
     | 
| 
      
 118 
     | 
    
         
            +
            The default tf\*idf formula follows the [Lucene Conceptual Scoring Formula](http://lucene.apache.org/core/4_0_0/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html).
         
     | 
| 
       118 
119 
     | 
    
         | 
| 
       119 
120 
     | 
    
         
             
            ## Why?
         
     | 
| 
       120 
121 
     | 
    
         | 
| 
       121 
     | 
    
         
            -
            At the time of writing, no other Ruby gem implemented the tf*idf formula used by Lucene, Sphinx and Ferret.
     | 
| 
      
 122 
     | 
    
         
            +
            At the time of writing, no other Ruby gem implemented the tf\*idf formula used by Lucene, Sphinx and Ferret.
         
     | 
| 
       122 
123 
     | 
    
         | 
| 
       123 
124 
     | 
    
         
             
            * [rsemantic](https://github.com/josephwilk/rsemantic) now uses the same [term frequency](https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L14) and [document frequency](https://github.com/josephwilk/rsemantic/blob/master/lib/semantic/transform/tf_idf_transform.rb#L13) formulas as Lucene.
         
     | 
| 
       124 
125 
     | 
    
         
             
            * [treat](https://github.com/louismullie/treat) offers many term frequency formulas, [one of which](https://github.com/louismullie/treat/blob/master/lib/treat/workers/extractors/tf_idf/native.rb#L13) is the same as Lucene.
         
     | 
    
        data/lib/tf-idf-similarity.rb
    CHANGED
    
    
    
        data/lib/tf-idf-similarity/document.rb
    CHANGED
    
    
| 
         @@ -1,3 +1,5 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'tf-idf-similarity/tokenizer'
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
       1 
3 
     | 
    
         
             
            # A document.
         
     | 
| 
       2 
4 
     | 
    
         
             
            module TfIdfSimilarity
         
     | 
| 
       3 
5 
     | 
    
         
             
              class Document
         
     | 
| 
         @@ -19,7 +21,8 @@ module TfIdfSimilarity 
     | 
|
| 
       19 
21 
     | 
    
         
             
                def initialize(text, opts = {})
         
     | 
| 
       20 
22 
     | 
    
         
             
                  @text   = text
         
     | 
| 
       21 
23 
     | 
    
         
             
                  @id     = opts[:id] || object_id
         
     | 
| 
       22 
     | 
    
         
            -
                  @tokens = opts[:tokens]
         
     | 
| 
      
 24 
     | 
    
         
            +
                  @tokens = Array(opts[:tokens]).map { |t| Token.new(t) } if opts[:tokens]
         
     | 
| 
      
 25 
     | 
    
         
            +
                  @tokenizer = opts[:tokenizer] || Tokenizer.new
         
     | 
| 
       23 
26 
     | 
    
         | 
| 
       24 
27 
     | 
    
         
             
                  if opts[:term_counts]
         
     | 
| 
       25 
28 
     | 
    
         
             
                    @term_counts = opts[:term_counts]
         
     | 
| 
         @@ -51,10 +54,9 @@ module TfIdfSimilarity 
     | 
|
| 
       51 
54 
     | 
    
         | 
| 
       52 
55 
     | 
    
         
             
                # Tokenizes the text and counts terms and total tokens.
         
     | 
| 
       53 
56 
     | 
    
         
             
                def set_term_counts_and_size
         
     | 
| 
       54 
     | 
    
         
            -
                  tokenize(text).each do |word|
     | 
| 
       55 
     | 
    
         
            -
                    token = Token.new(word)
         
     | 
| 
      
 57 
     | 
    
         
            +
                  tokenize(text).each do |token|
         
     | 
| 
       56 
58 
     | 
    
         
             
                    if token.valid?
         
     | 
| 
       57 
     | 
    
         
            -
                      term = token.lowercase_filter.classic_filter.to_s
     | 
| 
      
 59 
     | 
    
         
            +
                      term = token.to_s
         
     | 
| 
       58 
60 
     | 
    
         
             
                      @term_counts[term] += 1
         
     | 
| 
       59 
61 
     | 
    
         
             
                      @size += 1
         
     | 
| 
       60 
62 
     | 
    
         
             
                    end
         
     | 
| 
         @@ -76,7 +78,7 @@ module TfIdfSimilarity 
     | 
|
| 
       76 
78 
     | 
    
         
             
                # @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
         
     | 
| 
       77 
79 
     | 
    
         
             
                # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
         
     | 
| 
       78 
80 
     | 
    
         
             
                def tokenize(text)
         
     | 
| 
       79 
     | 
    
         
            -
                  @tokens || UnicodeUtils.each_word(text)
     | 
| 
      
 81 
     | 
    
         
            +
                  @tokens || @tokenizer.tokenize(text)
         
     | 
| 
       80 
82 
     | 
    
         
             
                end
         
     | 
| 
       81 
83 
     | 
    
         
             
              end
         
     | 
| 
       82 
84 
     | 
    
         
             
            end
         
     | 
    
        data/lib/tf-idf-similarity/token.rb
    CHANGED
    
    | 
         @@ -1,5 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            # coding: utf-8
         
     | 
| 
       2 
2 
     | 
    
         
             
            require 'delegate'
         
     | 
| 
      
 3 
     | 
    
         
            +
            require 'unicode_utils/downcase'
         
     | 
| 
      
 4 
     | 
    
         
            +
            require 'unicode_utils/each_word'
         
     | 
| 
       3 
5 
     | 
    
         | 
| 
       4 
6 
     | 
    
         
             
            # A token.
         
     | 
| 
       5 
7 
     | 
    
         
             
            #
         
     | 
| 
         @@ -47,5 +49,10 @@ module TfIdfSimilarity 
     | 
|
| 
       47 
49 
     | 
    
         
             
                def classic_filter
         
     | 
| 
       48 
50 
     | 
    
         
             
                  self.class.new(self.gsub('.', '').sub(/['`’]s\z/, ''))
         
     | 
| 
       49 
51 
     | 
    
         
             
                end
         
     | 
| 
      
 52 
     | 
    
         
            +
             
     | 
| 
      
 53 
     | 
    
         
            +
                def to_s
         
     | 
| 
      
 54 
     | 
    
         
            +
                  # Don't call #lowercase_filter and #classic_filter to avoid creating unnecessary objects.
         
     | 
| 
      
 55 
     | 
    
         
            +
                  UnicodeUtils.downcase(self).gsub('.', '').sub(/['`’]s\z/, '')
         
     | 
| 
      
 56 
     | 
    
         
            +
                end
         
     | 
| 
       50 
57 
     | 
    
         
             
              end
         
     | 
| 
       51 
58 
     | 
    
         
             
            end
         
     | 
    
        data/lib/tf-idf-similarity/tokenizer.rb
    CHANGED
    
    | 
         @@ -0,0 +1,19 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'unicode_utils/each_word'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'tf-idf-similarity/token'
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
            # A tokenizer using UnicodeUtils to tokenize a text.
         
     | 
| 
      
 5 
     | 
    
         
            +
            #
         
     | 
| 
      
 6 
     | 
    
         
            +
            # @see https://github.com/lang/unicode_utils
         
     | 
| 
      
 7 
     | 
    
         
            +
            module TfIdfSimilarity
         
     | 
| 
      
 8 
     | 
    
         
            +
              class Tokenizer
         
     | 
| 
      
 9 
     | 
    
         
            +
                # Tokenizes a text.
         
     | 
| 
      
 10 
     | 
    
         
            +
                #
         
     | 
| 
      
 11 
     | 
    
         
            +
                # @param [String] text
         
     | 
| 
      
 12 
     | 
    
         
            +
                # @return [Enumerator] an enumerator of Token objects
         
     | 
| 
      
 13 
     | 
    
         
            +
                def tokenize(text)
         
     | 
| 
      
 14 
     | 
    
         
            +
                  UnicodeUtils.each_word(text).map do |word|
         
     | 
| 
      
 15 
     | 
    
         
            +
                    Token.new(word)
         
     | 
| 
      
 16 
     | 
    
         
            +
                  end
         
     | 
| 
      
 17 
     | 
    
         
            +
                end
         
     | 
| 
      
 18 
     | 
    
         
            +
              end
         
     | 
| 
      
 19 
     | 
    
         
            +
            end
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: tf-idf-similarity
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.1.6
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.2.0
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - James McKinney
         
     | 
| 
       8 
8 
     | 
    
         
             
            autorequire: 
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date:  
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2019-12-19 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies:
         
     | 
| 
       13 
13 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       14 
14 
     | 
    
         
             
              name: unicode_utils
         
     | 
| 
         @@ -104,6 +104,7 @@ files: 
     | 
|
| 
       104 
104 
     | 
    
         
             
            - lib/tf-idf-similarity/term_count_model.rb
         
     | 
| 
       105 
105 
     | 
    
         
             
            - lib/tf-idf-similarity/tf_idf_model.rb
         
     | 
| 
       106 
106 
     | 
    
         
             
            - lib/tf-idf-similarity/token.rb
         
     | 
| 
      
 107 
     | 
    
         
            +
            - lib/tf-idf-similarity/tokenizer.rb
         
     | 
| 
       107 
108 
     | 
    
         
             
            - lib/tf-idf-similarity/version.rb
         
     | 
| 
       108 
109 
     | 
    
         
             
            - spec/bm25_model_spec.rb
         
     | 
| 
       109 
110 
     | 
    
         
             
            - spec/document_spec.rb
         
     | 
| 
         @@ -133,7 +134,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement 
     | 
|
| 
       133 
134 
     | 
    
         
             
                  version: '0'
         
     | 
| 
       134 
135 
     | 
    
         
             
            requirements: []
         
     | 
| 
       135 
136 
     | 
    
         
             
            rubyforge_project: 
         
     | 
| 
       136 
     | 
    
         
            -
            rubygems_version: 2. 
     | 
| 
      
 137 
     | 
    
         
            +
            rubygems_version: 2.7.6
         
     | 
| 
       137 
138 
     | 
    
         
             
            signing_key: 
         
     | 
| 
       138 
139 
     | 
    
         
             
            specification_version: 4
         
     | 
| 
       139 
140 
     | 
    
         
             
            summary: Calculates the similarity between texts using tf*idf
         
     |